fast5_to_pod5 and dorado (not demultiplexed)

nf-core · Oct 17, 2023 · 1867595 · 1867595
1 parent 2020960
commit 1867595
Show file tree

Hide file tree

Showing 14 changed files with 340 additions and 65 deletions.
diff --git a/conf/test.config b/conf/test.config
@@ -1,33 +1,39 @@
 /*
- * -------------------------------------------------
- *  Nextflow config file for running tests
- * -------------------------------------------------
- * Defines bundled input files and everything required
- * to run a fast and simple test. Use as follows:
- *   nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
- */
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
 
 params {
     config_profile_name        = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
 
-    // Limit resources
-    max_cpus            = 2
-    max_memory          = 6.GB
-    max_time            = 12.h
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 10
+    max_memory = '6.GB'
+    max_time   = '6.h'
 
-    // Input data to perform demultipexing
-    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
-    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
-    gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
-    run_nanolyse        = true
-    protocol            = 'DNA'
+    // Input data to perform both basecalling and demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS109'
     barcode_kit         = 'NBD103/NBD104'
-    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
-    skip_bigwig         = true
-    skip_bigbed         = true
+    trim_barcodes=true
+    output_demultiplex_fast5 = true
+    run_nanolyse        = true
     skip_quantification = true
     skip_fusion_analysis= true
     skip_modification_analysis=true
-    aligner             = 'graphmap2'
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = '/home/wanyk/3.2/test-datasets/fast5/barcoded_multi/'
 }
diff --git a/conf/test_bc_nodx.config b/conf/test_bc_nodx.config
@@ -0,0 +1,35 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_bc_nodx,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus            = 10
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform basecalling and to skip demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_nodx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS108'
+    skip_bigbed         = true
+    skip_bigwig         = true
+    skip_demultiplexing = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = '/home/wanyk/3.2/test-datasets/fast5/nonbarcoded_multi/'
+}
diff --git a/conf/test_nobc_dx.config b/conf/test_nobc_dx.config
@@ -0,0 +1,33 @@
+/*
+ * -------------------------------------------------
+ *  Nextflow config file for running tests
+ * -------------------------------------------------
+ * Defines bundled input files and everything required
+ * to run a fast and simple test. Use as follows:
+ *   nextflow run nf-core/nanoseq -profile test_nobc_dx,<docker/singularity>
+ */
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources
+    max_cpus            = 2
+    max_memory          = 6.GB
+    max_time            = 12.h
+
+    // Input data to perform demultiplexing
+    input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
+    gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
+    skip_basecalling    = true
+    run_nanolyse        = true
+    protocol            = 'DNA'
+    barcode_kit         = 'NBD103/NBD104'
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fastq/nondemultiplexed/sample_nobc_dx.fastq.gz'
+    skip_bigwig         = true
+    skip_bigbed         = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+}
diff --git a/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config b/conf/test_nodx_noaln.config → conf/test_nobc_nodx_noaln.config
@@ -20,6 +20,7 @@ params {
     input                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_noaln.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17550000.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_1-17500000.gtf'
+    skip_basecalling    = true
     protocol              = 'directRNA'
     skip_demultiplexing   = true
     skip_alignment        = true

diff --git a/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config b/conf/test_nodx_rnamod.config → conf/test_nobc_nodx_rnamod.config
@@ -20,6 +20,7 @@ params {
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_rnamod.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/modification_transcriptome_subset.gtf'
+    skip_basecalling    = true
     protocol            = 'directRNA'
     run_nanolyse        = true
     skip_bigbed         = true

diff --git a/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config b/conf/test_nodx_stringtie.config → conf/test_nobc_nodx_stringtie.config
@@ -21,6 +21,7 @@ params {
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.fa'
     gtf                 = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/chr22_23800000-23980000.gtf'
     protocol            = 'directRNA'
+    skip_basecalling    = true
     skip_demultiplexing = true
     skip_fusion_analysis= true
     skip_modification_analysis=true

diff --git a/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config b/conf/test_nodx_vc.config → conf/test_nobc_nodx_vc.config
@@ -19,6 +19,7 @@ params {
     // Input data to skip demultiplexing and variant call
     input               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/3.2/samplesheet/samplesheet_nobc_nodx_vc.csv'
     fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    skip_basecalling    = true
     protocol            = 'DNA'
     skip_quantification = true
     skip_demultiplexing = true

diff --git a/conf/test_withpull.config b/conf/test_withpull.config
@@ -0,0 +1,39 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/nanoseq -profile test,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data to perform both basecalling and demultiplexing
+    input               = 'https://raw.githubusercontent.com/yuukiiwa/test-datasets/nanoseq/3.2/samplesheet/samplesheet_bc_dx.csv'
+    fasta               = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/reference/hg19_KCMF1.fa'
+    protocol            = 'cDNA'
+    flowcell            = 'FLO-MIN106'
+    kit                 = 'SQK-DCS109'
+    barcode_kit         = 'EXP-NBD103'
+    trim_barcodes=true
+    output_demultiplex_fast5 = true
+    run_nanolyse        = true
+    skip_quantification = true
+    skip_fusion_analysis= true
+    skip_modification_analysis=true
+
+    // This variable is just for reference and isn't actually required for the tests
+    // Files are downloaded and staged using the "GetTestData" process
+    input_path          = 'https://raw.githubusercontent.com/nf-core/test-datasets/nanoseq/fast5/barcoded/'
+}
diff --git a/modules/local/dorado.nf b/modules/local/dorado.nf
@@ -0,0 +1,38 @@
+/*
+ * Basecalls the staged input with Dorado on CPU and writes a gzipped FASTQ
+ * (basecall.fastq.gz) plus a versions.yml recording the Dorado version.
+ */
+process DORADO {
+    label 'process_medium'
+
+    // NOTE(review): the image has no tag, so the Dorado version is not pinned.
+    container "docker.io/ontresearch/dorado"
+
+    input:
+    path(input_path)    // passed as the data argument of `dorado basecaller`
+    val meta            // not referenced by the script below
+    path dorado_config  // not referenced by the script below
+    path dorado_model   // not referenced by the script below
+
+    output:
+    path "*.fastq.gz"                    , emit: fastq
+    path "versions.yml"                  , emit: versions
+
+    script:
+    // The command line is fixed (hard-coded model, `--device cpu`); barcode, kit,
+    // config/model and GPU params are not applied. The version entry drops the
+    // first 81 characters of the `dorado --version` output (sed).
+    // NOTE(review): model name "[email protected]" looks mangled -- restore it before use.
+    """
+    dorado download --model [email protected]
+    dorado basecaller [email protected] $input_path --device cpu --emit-fastq > basecall.fastq
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        dorado: \$(echo \$(dorado --version 2>&1) | sed -r 's/.{81}//')
+    END_VERSIONS
+
+    gzip basecall.fastq
+    """
+}
+
diff --git a/modules/local/fast5_to_pod5.nf b/modules/local/fast5_to_pod5.nf
@@ -0,0 +1,29 @@
+// Converts the staged FAST5 input into one POD5 file via `pod5 convert fast5`.
+process FAST5_TO_POD5 {
+    label 'process_medium'
+
+    // NOTE(review): this conda spec lists R/bambu packages, not pod5 -- confirm.
+    conda "conda-forge::r-base=4.0.3 bioconda::bioconductor-bambu=3.0.8 bioconda::bioconductor-bsgenome=1.66.0"
+    container "docker.io/yuukiiwa/pod5:0.2.4"
+
+    input:
+    path input_path
+
+    output:
+    path "pod5/"         , emit: pod5
+    path "versions.yml"  , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def output_name = "pod5/converted.pod5"
+    """
+    pod5 convert fast5 $input_path --output $output_name
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        pod5: \$(pod5 --version 2>&1 | awk '{print \$NF}')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/get_test_data.nf b/modules/local/get_test_data.nf
@@ -4,7 +4,7 @@ process GET_TEST_DATA {
     container "docker.io/yuukiiwa/git:latest"
 
     output:
-    path "test-datasets/fast5/$barcoded/*"        , emit: ch_input_fast5s_path
+    path "test-datasets/fast5/$barcoded/"         , emit: ch_input_fast5_dir_path
     path "test-datasets/modification_fast5_fastq/", emit: ch_input_dir_path
     path "versions.yml"                           , emit: versions
 

diff --git a/modules/nf-core/qcat/main.nf b/modules/nf-core/qcat/main.nf
diff --git a/nextflow.config b/nextflow.config
@@ -19,15 +19,24 @@ params {
     gtf                        = null
 
 
-    // Options: Demultiplexing
+    // Options: Basecalling and Demultiplexing
     input_path                 = null
+    flowcell                   = null
+    kit                        = null
     barcode_kit                = null
     barcode_both_ends          = false
     trim_barcodes              = false
+    dorado_config               = null
+    dorado_model                = null
+    dorado_gpu                  = false
+    dorado_gpu_runners          = 6
+    dorado_cpu_threads          = 1
     gpu_device                 = 'auto'
     gpu_cluster_options        = null
+    output_demultiplex_fast5   = false
     qcat_min_score             = 60
     qcat_detect_middle         = false
+    skip_basecalling           = false
     skip_demultiplexing        = false
 
     // Options: Raw read cleaning
@@ -221,12 +230,14 @@ profiles {
         executor.cpus          = 16
         executor.memory        = 60.GB
     }
+    test      { includeConfig 'conf/test.config'      }
     test_full { includeConfig 'conf/test_full.config' }
-    test                { includeConfig 'conf/test.config'                }
-    test_nodx_stringtie { includeConfig 'conf/test_nodx_stringtie.config' }
-    test_nodx_noaln     { includeConfig 'conf/test_nodx_noaln.config'     }
-    test_nodx_vc        { includeConfig 'conf/test_nodx_vc.config'        }
-    test_nodx_rnamod    { includeConfig 'conf/test_nodx_rnamod.config'    }
+    test_bc_nodx             { includeConfig 'conf/test_bc_nodx.config'             }
+    test_nobc_dx             { includeConfig 'conf/test_nobc_dx.config'             }
+    test_nobc_nodx_stringtie { includeConfig 'conf/test_nobc_nodx_stringtie.config' }
+    test_nobc_nodx_noaln     { includeConfig 'conf/test_nobc_nodx_noaln.config'     }
+    test_nobc_nodx_vc        { includeConfig 'conf/test_nobc_nodx_vc.config'        }
+    test_nobc_nodx_rnamod    { includeConfig 'conf/test_nobc_nodx_rnamod.config'    }
 }