diff --git a/docs/mop_preprocess.rst b/docs/mop_preprocess.rst index 2c6f51f..1dd2273 100644 --- a/docs/mop_preprocess.rst +++ b/docs/mop_preprocess.rst @@ -9,7 +9,7 @@ MOP_PREPROCESS This pipeline takes as input the raw fast5 reads - single or multi - and it produces several outputs (basecalled fast5, sequences in fastq format, aligned reads in BAM format etc). The pre-processing pipeline can perform base-calling, demultiplexing (optional), filtering, quality control, mapping to a reference (either a genome or a transcriptome), feature counting, discovery of novel transcripts, and it generates a final report with the performance and results of each of the steps performed. -It automatically detects the kind of input fast5 file (single or multi-sequence). It can also support the new pod5 format but it won't output basecalled fastq useful for the other pipelines. The basecalling can be performed with guppy or dorado and the demultiplexing with either guppy, or deeplexicon. Basecalled fastq and Fast5 files can be demultiplexed as well. You can restrict the number of barcodes by indicating a file with barcode list using the **barcodes** parameter. +It automatically detects the kind of input fast5 file (single or multi-sequence). It can also support the new pod5 format but it won't output basecalled fastq useful for the other pipelines. The basecalling can be performed with guppy or dorado and the demultiplexing with either guppy, seqtagger, or deeplexicon. Basecalled fastq and Fast5 files can be demultiplexed as well. You can restrict the number of barcodes by indicating a file with barcode list using the **barcodes** parameter. .. image:: ../img/flow_preproc.png @@ -133,6 +133,9 @@ The first column indicates the processing step as **basecalling** or **demultipl .. note:: Readucks is run after guppy demultiplexing. It refines the demultiplexing generating different fastqs +.. 
tip:: + You don't need to specify the whole path for the seqtagger models; the name of the model is enough. + Model libraries for specific tools ==================== @@ -144,6 +147,9 @@ The following folders are available for the respective tools. Some models are al * pAmps-rep2-4-train1_newdata_nanopore_UResNet20v2_model.039.h5 * dorado_models * rna002_70bps_hac@v3 +* seqtagger_models + * b04_RNA002 + * b04_RNA004 .. note:: You need to download the models you want to use in case they are not already available. For instance, if you need another model for dorado you need to do: @@ -158,6 +164,7 @@ You also need to add the dedicated parameter within the tool_opts file for the s .. code-block:: console basecalling dorado "rna002_70bps_hac@v3" + demultiplexing seqtagger "-k b100" demultiplexing deeplexicon "-f multi -m resnet20-final.h5" .. note:: @@ -182,7 +189,7 @@ The sample id is given by either the folder containing the fast5 files or the ba .. note:: - The naming convention of the different barcodes is decided by each tool, so guppy will produce **barcode01**, **barcode02**, etc. + The naming convention of the different barcodes is decided by each tool, so guppy will produce **barcode01**, **barcode02**, etc., while seqtagger will produce **bc_1**, **bc_2**, etc. 
Basecalling with the m6A-aware model diff --git a/img/flow_preproc.png b/img/flow_preproc.png index 5825e61..ae9ec09 100644 Binary files a/img/flow_preproc.png and b/img/flow_preproc.png differ diff --git a/mop_preprocess/mop_preprocess.nf b/mop_preprocess/mop_preprocess.nf index 87c951c..4673e10 100644 --- a/mop_preprocess/mop_preprocess.nf +++ b/mop_preprocess/mop_preprocess.nf @@ -32,8 +32,8 @@ reference : ${params.reference} annotation : ${params.annotation} granularity : ${params.granularity} - ref_type : ${params.ref_type} + pars_tools : ${params.pars_tools} barcodes : ${params.barcodes} @@ -99,6 +99,9 @@ switch(params.demultiplexing) { case "deeplexicon": demux_models = "${projectDir}/deeplexicon_models/" break; + case "seqtagger": + demux_models = "${projectDir}/seqtagger_models/" + break; } dorado_models = "${projectDir}/dorado_models/" @@ -127,6 +130,7 @@ def demulti_fast5_opt = homogenizeVals(params.demulti_fast5) def basecall_label = (params.GPU != 'OFF' ? 'basecall_gpus' : 'big_cpus') def deeplexi_basecall_label = (params.GPU != 'OFF' ? 'demulti_gpus' : '') + def output_bc = (demulti_fast5_opt == 'ON' ? '' : outputFast5) //def outputMinionQC = (demulti_fast5_opt == 'ON' ? 
'': outputQual) @@ -160,7 +164,7 @@ barcodes_to_include = get_barcode_list(params.barcodes) def guppy_basecall_pars = guppypars + " " + progPars["basecalling--guppy"] def basecaller_pars = ["guppy" : guppy_basecall_pars, "dorado" : progPars["basecalling--dorado"] ] -def demux_pars = ["guppy" : progPars["demultiplexing--guppy"] + " " + guppy_basecall_pars, "deeplexicon": progPars["demultiplexing--deeplexicon"] ] +def demux_pars = ["guppy" : progPars["demultiplexing--guppy"] + " " + guppy_basecall_pars, "seqtagger": progPars["demultiplexing--seqtagger"], "deeplexicon": progPars["demultiplexing--deeplexicon"] ] // INCLUDE WORKFLOWS @@ -186,7 +190,7 @@ include { GET_VERSION as FASTQC_VER} from "${subworkflowsDir}/qc/fastqc" include { SORT as SAMTOOLS_SORT } from "${subworkflowsDir}/misc/samtools" addParams(LABEL: 'big_cpus', OUTPUT:outputMapping) include { INDEX as SAMTOOLS_INDEX } from "${subworkflowsDir}/misc/samtools" addParams(OUTPUT:outputMapping) include { GET_VERSION as SAMTOOLS_VERSION; CAT as SAMTOOLS_CAT } from "${subworkflowsDir}/misc/samtools" -include { MOP_QC as NANOPLOT_QC } from "${subworkflowsDir}/qc/nanoplot" addParams(LABEL: 'big_cpus_ignore') +include { MOP_QC as NANOPLOT_QC } from "${subworkflowsDir}/qc/nanoplot" addParams(LABEL: 'big_cpus_retry') include { GET_VERSION as NANOPLOT_VER } from "${subworkflowsDir}/qc/nanoplot" include { GET_VERSION as NANOCOUNT_VER } from "${subworkflowsDir}/read_count/nanocount" include { COUNT as NANOCOUNT } from "${subworkflowsDir}/read_count/nanocount" addParams(LABEL: 'big_mem', EXTRAPARS: progPars["counting--nanocount"], OUTPUT:outputCounts) @@ -407,6 +411,7 @@ workflow { else { // BASECALL AND DEMULTIPLEX switch(params.demultiplexing) { case "deeplexicon": + case "seqtagger": outbc = BASECALL(fast5_4_analysis) demux = DEMULTIPLEX(fast5_4_analysis, outbc.basecalled_fastq) demufq = demux.demultiplexed_fastq @@ -507,7 +512,7 @@ workflow { // Perform fastqc QC on fastq fastqc_files = FASTQC(fastq_files) - 
multiqc_data = multiqc_data.mix(stats_aln).mix(fastqc_files.map{it[1]}) + multiqc_data = multiqc_data.mix(fastqc_files.map{it[1]}) stats_counts = COUNTING(sorted_alns, aln_indexes).stats_counts diff --git a/mop_preprocess/seqtagger_models/b04_RNA002/barcodes.fa b/mop_preprocess/seqtagger_models/b04_RNA002/barcodes.fa new file mode 100644 index 0000000..dcf8189 --- /dev/null +++ b/mop_preprocess/seqtagger_models/b04_RNA002/barcodes.fa @@ -0,0 +1,8 @@ +>Barcode_1 +GGCTTCTTCTTGCTCTTAGG +>Barcode_2 +GTGATTCTCGTCTTTCTGCG +>Barcode_3 +GTACTTTTCTCTTTGCGCGG +>Barcode_4 +GGTCTTCGCTCGGTCTTATT diff --git a/mop_preprocess/seqtagger_models/b04_RNA002/barcodes.fa.fai b/mop_preprocess/seqtagger_models/b04_RNA002/barcodes.fa.fai new file mode 100644 index 0000000..73e6046 --- /dev/null +++ b/mop_preprocess/seqtagger_models/b04_RNA002/barcodes.fa.fai @@ -0,0 +1,4 @@ +Barcode_1 20 11 20 21 +Barcode_2 20 43 20 21 +Barcode_3 20 75 20 21 +Barcode_4 20 107 20 21 diff --git a/mop_preprocess/seqtagger_models/b04_RNA002/config.toml b/mop_preprocess/seqtagger_models/b04_RNA002/config.toml new file mode 100644 index 0000000..9449fd0 --- /dev/null +++ b/mop_preprocess/seqtagger_models/b04_RNA002/config.toml @@ -0,0 +1,29 @@ +[model] +package = "bonito.crf" + +[labels] +labels = [ "N", "A", "C", "G", "T",] + +[input] +features = 1 + +[qscore] +scale = 0.0 +bias = 1.0 + +[encoder] +winlen = 31 +stride = 10 +scale = 5.0 +features = 96 +rnn_type = "lstm" +activation = "swish" +blank_score = 2.0 + +[global_norm] +state_len = 4 + +[basecaller] +batchsize = 256 +chunksize = 3000 +overlap = 500 diff --git a/mop_preprocess/seqtagger_models/b04_RNA002/weights_49.tar b/mop_preprocess/seqtagger_models/b04_RNA002/weights_49.tar new file mode 100644 index 0000000..46f27bf Binary files /dev/null and b/mop_preprocess/seqtagger_models/b04_RNA002/weights_49.tar differ diff --git a/mop_preprocess/seqtagger_models/b04_RNA004/barcodes.fa b/mop_preprocess/seqtagger_models/b04_RNA004/barcodes.fa new file mode 
100644 index 0000000..dcf8189 --- /dev/null +++ b/mop_preprocess/seqtagger_models/b04_RNA004/barcodes.fa @@ -0,0 +1,8 @@ +>Barcode_1 +GGCTTCTTCTTGCTCTTAGG +>Barcode_2 +GTGATTCTCGTCTTTCTGCG +>Barcode_3 +GTACTTTTCTCTTTGCGCGG +>Barcode_4 +GGTCTTCGCTCGGTCTTATT diff --git a/mop_preprocess/seqtagger_models/b04_RNA004/barcodes.fa.fai b/mop_preprocess/seqtagger_models/b04_RNA004/barcodes.fa.fai new file mode 100644 index 0000000..73e6046 --- /dev/null +++ b/mop_preprocess/seqtagger_models/b04_RNA004/barcodes.fa.fai @@ -0,0 +1,4 @@ +Barcode_1 20 11 20 21 +Barcode_2 20 43 20 21 +Barcode_3 20 75 20 21 +Barcode_4 20 107 20 21 diff --git a/mop_preprocess/seqtagger_models/b04_RNA004/config.toml b/mop_preprocess/seqtagger_models/b04_RNA004/config.toml new file mode 100644 index 0000000..57f00b6 --- /dev/null +++ b/mop_preprocess/seqtagger_models/b04_RNA004/config.toml @@ -0,0 +1,29 @@ +[model] +package = "bonito.crf" + +[labels] +labels = [ "N", "A", "C", "G", "T",] + +[input] +features = 1 + +[qscore] +scale = 0.0 +bias = 1.0 + +[encoder] +winlen = 31 +stride = 10 +scale = 5.0 +features = 96 +rnn_type = "lstm" +activation = "swish" +blank_score = 2.0 + +[global_norm] +state_len = 4 + +[basecaller] +batchsize = 256 +chunksize = 2000 +overlap = 500 diff --git a/mop_preprocess/seqtagger_models/b04_RNA004/weights_39.tar b/mop_preprocess/seqtagger_models/b04_RNA004/weights_39.tar new file mode 100644 index 0000000..796b6b0 Binary files /dev/null and b/mop_preprocess/seqtagger_models/b04_RNA004/weights_39.tar differ diff --git a/mop_preprocess/tool_opts/cdna_tool_opt.tsv b/mop_preprocess/tool_opts/cdna_tool_opt.tsv index 91f04c9..1ed3e36 100644 --- a/mop_preprocess/tool_opts/cdna_tool_opt.tsv +++ b/mop_preprocess/tool_opts/cdna_tool_opt.tsv @@ -1,6 +1,7 @@ #step tool extrapars basecalling guppy "" demultiplexing guppy "--barcode_kits EXP-NBD104" +demultiplexing seqtagger "" filtering nanofilt "" filtering nanoq "" mapping graphmap "" diff --git 
a/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv b/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv index 844e401..c34f8ec 100644 --- a/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv +++ b/mop_preprocess/tool_opts/cdna_tool_readucks_opt.tsv @@ -2,6 +2,7 @@ basecalling guppy "" demultiplexing guppy "--flowcell FLO-MIN106 --kit SQK-DCS109 --barcode_kits EXP-NBD104" demultiplexing readucks "--limit_barcodes_to 1 2 3 4 --native_barcodes --threshold 50" +demultiplexing seqtagger "" filtering nanofilt "" filtering nanoq "" mapping graphmap "" diff --git a/mop_preprocess/tool_opts/dna_tool_opt.tsv b/mop_preprocess/tool_opts/dna_tool_opt.tsv index ca20a08..00788a1 100644 --- a/mop_preprocess/tool_opts/dna_tool_opt.tsv +++ b/mop_preprocess/tool_opts/dna_tool_opt.tsv @@ -1,6 +1,7 @@ #step tool extrapars basecalling guppy "" demultiplexing guppy "--barcode_kits EXP-NBD114" +demultiplexing seqtagger "" filtering nanofilt "" filtering nanoq "" mapping graphmap "" diff --git a/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv b/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv index 21e30fb..fed9fca 100644 --- a/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv +++ b/mop_preprocess/tool_opts/drna_tool_m6A_splice_opt.tsv @@ -3,6 +3,7 @@ basecalling dorado "rna002_70bps_hac@v3" basecalling guppy "-c rna_r9.4.1_70bps_m6A_hac.cfg" demultiplexing deeplexicon "" demultiplexing guppy "" +demultiplexing seqtagger "" filtering nanofilt "" filtering nanoq "" mapping graphmap "" diff --git a/mop_preprocess/tool_opts/drna_tool_splice_opt_ozu.tsv b/mop_preprocess/tool_opts/drna_tool_seqtagger_opt.tsv similarity index 74% rename from mop_preprocess/tool_opts/drna_tool_splice_opt_ozu.tsv rename to mop_preprocess/tool_opts/drna_tool_seqtagger_opt.tsv index 9cf2dc2..ba885da 100644 --- a/mop_preprocess/tool_opts/drna_tool_splice_opt_ozu.tsv +++ b/mop_preprocess/tool_opts/drna_tool_seqtagger_opt.tsv @@ -1,5 +1,7 @@ #step tool extrapars -basecalling guppy "-c 
dna_r9.4.1_450bps_hac.cfg --barcode_kits EXP-NBD104 " +basecalling dorado "rna002_70bps_hac@v3" +basecalling guppy "" +demultiplexing seqtagger "-k b100" demultiplexing deeplexicon "" demultiplexing guppy "" filtering nanofilt "" diff --git a/mop_preprocess/tool_opts/drna_tool_splice_opt.tsv b/mop_preprocess/tool_opts/drna_tool_splice_opt.tsv index 25f469e..859fc4a 100644 --- a/mop_preprocess/tool_opts/drna_tool_splice_opt.tsv +++ b/mop_preprocess/tool_opts/drna_tool_splice_opt.tsv @@ -3,6 +3,7 @@ basecalling dorado "rna002_70bps_hac@v3" basecalling guppy "" demultiplexing deeplexicon "-f multi -m resnet20-final.h5" demultiplexing guppy "" +demultiplexing seqtagger "" filtering nanofilt "" filtering nanoq "" mapping graphmap ""