diff --git a/conf/modules.config b/conf/modules.config index afae335d..fb82328d 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -100,7 +100,6 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] ext.prefix = 'count_table' - } withName: MAGECK_MLE { @@ -131,7 +130,12 @@ process { } withName: CUTADAPT { - ext.args = '-g file:overrepresented.fasta -N' + if(params.analysis == 'targeted') { + ext.args = '-g file:overrepresented.fasta -N' + } + if(params.analysis == 'screening' && params.cutadapt) { + ext.args = "-a ${params.cutadapt}" + } publishDir = [ path: { "${params.outdir}/preprocessing/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, diff --git a/conf/test_targeted.config b/conf/test_targeted.config index c2e9ef7f..9bf6cabc 100644 --- a/conf/test_targeted.config +++ b/conf/test_targeted.config @@ -25,4 +25,7 @@ params { // Aligner aligner = 'minimap2' + + // Steps + overrepresented = true } diff --git a/docs/output/screening.md b/docs/output/screening.md index 869a7d33..aa163333 100644 --- a/docs/output/screening.md +++ b/docs/output/screening.md @@ -16,6 +16,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [Preprocessing](#preprocessing) - [FastQC](#fastqc) - Read Quality Control + - [cutadapt](#cutadapt) - Trimming reads from fastq files - [Counting](#counting) - [MAGeCK count](#mageck-count) - Mapping reads to reference - [CNV correction](#cnv-correction)) @@ -42,6 +43,20 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. 
For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +### cutadapt + +
+Output files + +- `cutadapt/` + - `*.log`: log file containing the command that was run and its output + - `*.trim.fastq.gz`: trimmed fastq files + +
+ +[cutadapt](https://cutadapt.readthedocs.io/en/stable/) finds and removes adapter sequences, primers, poly-A tails and other types of unwanted sequence from your high-throughput sequencing reads. MAGeCK count normally detects and trims adapter sequences automatically; however, if the trimming lengths differ between reads, cutadapt can be used instead, as mentioned [here](https://sourceforge.net/p/mageck/wiki/advanced_tutorial/). +For further reading and documentation see the [cutadapt user guide](https://cutadapt.readthedocs.io/en/stable/guide.html). + ## Counting ### MAGeCK count diff --git a/docs/usage/screening.md b/docs/usage/screening.md index 63fba93e..34325513 100644 --- a/docs/usage/screening.md +++ b/docs/usage/screening.md @@ -52,15 +52,17 @@ After the alignment step, the pipeline currently supports 3 algorithms to detect MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` contains two columns : treatment and reference. These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them. Please find an example here below : -| treatment | reference | -| ----------------------- | ------------------- | -| treatment1 | control1 | -| treatment1,treatment2 | control1,control2 | -| ----------------------- | ------------------- | -| treatment1 | control1 | +| reference | treatment | +| ----------------- | --------------------- | +| control1 | treatment1 | +| control1,control2 | treatment1,treatment2 | A full example can be found [here](https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/full_test/samplesheet_full.csv). +### cutadapt + +MAGeCK is normally able to determine the trimming length and sgRNA length automatically.
Therefore, you don't need to run this step unless MAGeCK fails to do so by itself. If the number of nucleotides preceding the sgRNA varies between reads, you can use cutadapt to remove the adapter sequences with the flag `--cutadapt ADAPTER`. + ### MAGeCK mle MAGeCK MLE uses a maximum likelihood estimation approach to estimate the effects of gene knockout on cell fitness. It models the read count data of guide RNAs targeting each gene and estimates the dropout probability for each gene. MAGeCK mle requires a design matrix. The design matrix is a `txt` file indicating the effects of different conditions on different samples. diff --git a/modules/nf-core/cutadapt/main.nf b/modules/nf-core/cutadapt/main.nf index 6f2fcd14..2c6bf763 100644 --- a/modules/nf-core/cutadapt/main.nf +++ b/modules/nf-core/cutadapt/main.nf @@ -21,7 +21,6 @@ process CUTADAPT { script: def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" - if (adapter_seq != []) """ cutadapt \\ --cores $task.cpus \\ @@ -34,14 +33,7 @@ process CUTADAPT { cutadapt: \$(cutadapt --version) END_VERSIONS """ - else - """ - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cutadapt: \$(cutadapt --version) - END_VERSIONS - """ - + stub: def prefix = task.ext.prefix ?: "${meta.id}" def trimmed = meta.single_end ? 
"${prefix}.trim.fastq.gz" : "${prefix}_1.trim.fastq.gz ${prefix}_2.trim.fastq.gz" diff --git a/nextflow.config b/nextflow.config index 5f6dc691..1a8aa11d 100644 --- a/nextflow.config +++ b/nextflow.config @@ -16,6 +16,7 @@ params { protospacer = null library = null crisprcleanr = null + cutadapt = null rra_contrasts = null mle_design_matrix = null count_table = null diff --git a/nextflow_schema.json b/nextflow_schema.json index ddc1856e..ec0365f6 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -183,6 +183,10 @@ "type": "string", "description": "sgRNA library annotation for crisprcleanR" }, + "cutadapt": { + "type": "string", + "description": "cutadapt adapter for screening analysis" + }, "min_reads": { "type": "number", "description": "a filter threshold value for sgRNAs, based on their average counts in the control sample", diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf index 83cd51b9..e311bff5 100644 --- a/workflows/crisprseq_screening.nf +++ b/workflows/crisprseq_screening.nf @@ -55,6 +55,7 @@ ch_multiqc_custom_methods_description = params.multiqc_methods_description ? 
fil // MODULE: Installed directly from nf-core/modules // include { FASTQC } from '../modules/nf-core/fastqc/main' +include { CUTADAPT } from '../modules/nf-core/cutadapt/main' include { MULTIQC } from '../modules/nf-core/multiqc/main' include { MAGECK_COUNT } from '../modules/nf-core/mageck/count/main' include { MAGECK_MLE } from '../modules/nf-core/mageck/mle/main' @@ -90,6 +91,34 @@ workflow CRISPRSEQ_SCREENING { return [ meta + [ single_end:fastq_2?false:true ], fastq_2?[ fastq_1, fastq_2 ]:[ fastq_1 ] ] } .set { ch_input } + + // + // MODULE: Run FastQC + // + FASTQC ( + ch_input + ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + + + empty_channel = Channel.value([[]]) + ch_input_cutadapt = ch_input.combine(Channel.value([[]])) + + if(params.cutadapt) { + CUTADAPT( + ch_input_cutadapt + ) + ch_versions = ch_versions.mix(CUTADAPT.out.versions) + + CUTADAPT.out.reads + .map{ meta, fastq -> + [meta, [fastq]] + } + .set { ch_input } + } + + // this is to concatenate everything for mageck count + ch_input .map { meta, fastqs -> if(fastqs.size() == 1){ @@ -98,6 +127,8 @@ workflow CRISPRSEQ_SCREENING { [meta.condition, [fastqs[0]], meta.single_end, [fastqs[1]]] } } + // if one element is paired-end and the other single-end throw an error + // otherwise just concatenate the conditions and the fastqs .reduce { a, b -> if(a[2] != b[2] ) { error "Your samplesheet contains a mix of single-end and paired-end data. This is not supported." @@ -121,6 +152,7 @@ workflow CRISPRSEQ_SCREENING { ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + // // MODULE: Run mageck count //