Merge pull request nf-core#99 from LaurenceKuhl/crisprcleanr

Change the workflow to take MAGeCK MLE and BAGEL2 as a default and add tests
LaurenceKuhl · Jan 15, 2024 · 8781f5a · 8781f5a
2 parents fca2d35 + 08269c2
commit 8781f5a
Show file tree

Hide file tree

Showing 10 changed files with 150 additions and 24 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -29,8 +29,10 @@ jobs:
         ANALYSIS:
           - "test_screening"
           - "test_screening_paired"
+          - "test_screening_rra"
           - "test_targeted"
           - "test_umis"
+
     steps:
       - name: Check out pipeline code
         uses: actions/checkout@v3

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,12 +10,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### Added
 
 - Update all modules to the last version in nf-core/modules ([#92](https://github.com/nf-core/crisprseq/pull/92))
+- More documentation for screening analysis.  ([#99](https://github.com/nf-core/crisprseq/pull/99))
+- Contrasts are now given under a different flag and MAGeCK MLE and BAGEL2 are automatically run instead of MAGeCK RRA. ([#99](https://github.com/nf-core/crisprseq/pull/99))
 - Added cutadapt for screening analysis ([#95](https://github.com/nf-core/crisprseq/pull/95))
 
 ### Fixed
 
 - Fixed paired-end for screening analysis ([#94](https://github.com/nf-core/crisprseq/pull/94))
 
+
 ## [v2.1.0 - Jamon Salas](https://github.com/nf-core/crisprseq/releases/tag/2.1.0) - [14.11.2023]
 
 ### Added

diff --git a/conf/modules.config b/conf/modules.config
@@ -177,6 +177,14 @@ process {
         ]
     }
 
+    withName: MATRICESCREATION {
+        publishDir = [
+            path: { "${params.outdir}/design_matrix" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+        ]
+    }
+
     withName: MINIMAP2_ALIGN_UMI_1 {
         ext.args = '-x map-ont'
         ext.prefix = { "${reads.baseName}_cycle1" }

diff --git a/conf/test_screening.config b/conf/test_screening.config
@@ -23,7 +23,6 @@ params {
     input             = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/samplesheet_test.csv'
     analysis          = 'screening'
     crisprcleanr      = "Brunello_Library"
-    mle_design_matrix = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/design_matrix.txt"
     library           = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/brunello_target_sequence.txt"
-    rra_contrasts     = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/rra_contrasts.txt"
+    contrasts         = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/rra_contrasts.txt"
 }
diff --git a/conf/test_screening_rra.config b/conf/test_screening_rra.config
@@ -0,0 +1,29 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/crisprseq -profile test_screening_rra,<conda/docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test screening profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data
+    input             = 'https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/samplesheet_test.csv'
+    analysis          = 'screening'
+    crisprcleanr      = "Brunello_Library"
+    library           = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/brunello_target_sequence.txt"
+    contrasts         = "https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/rra_contrasts.txt"
+    rra               = true
+}
diff --git a/docs/usage/screening.md b/docs/usage/screening.md
@@ -21,6 +21,7 @@ nextflow run nf-core/crisprseq --analysis screening --input samplesheet.csv --li
 ```
 
 The following required parameters are here described.
+ If you wish to input a raw count or normalized table, you can skip the samplesheet parameter as well as the library one and directly input your table using count_table `--count_table your_count_table`. If your count table is normalized, be sure to set the normalization method to none in MAGeCK MLE or MAGeCK RRA using a config file.
 
 ### Full samplesheet
 
@@ -41,16 +42,29 @@ SRR8983580,SRR8983580.small.fastq.gz,,treatment
 
 An [example samplesheet](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/samplesheet_test.csv) has been provided with the pipeline.
 
+
+### cutadapt
+
+MAGeCK count which is the main alignment software used is normally able to automatically determine the trimming length and sgRNA length, in most cases. Therefore, you don't need to go to this step unless MAGeCK fails to do so by itself. If the nucleotide length in front of sgRNA varies between different reads, you can use cutadapt to remove the adaptor sequences by using the flag `--cutadapt ADAPTER`.
+
 ### library
 
 If you are running the pipeline with fastq files and wish to obtain a count table, the library parameter is needed. The library table has three mandatory columns : id, target transcript (or gRNA sequence) and gene symbol.
 An [example](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/brunello_target_sequence.txt) has been provided with the pipeline. Many libraries can be found on [addgene](https://www.addgene.org/).
 
-After the alignment step, the pipeline currently supports 3 algorithms to detect gene essentiality, MAGeCK rra, MAGeCK mle and BAGEL2. MAGeCK MLE (Maximum Likelihood Estimation) and MAGeCK RRA (Robust Ranking Aggregation) are two different methods provided by the MAGeCK software package to analyze CRISPR-Cas9 screens. BAGEL2 identifies gene essentiality through Bayesian Analysis.
+After the alignment step, if you are performing KO (Knock-Out) screens, you can choose to correction of gene independent cell responses to CRISPR-cas9 targeting using crisprcleanr. If you are performing a CRISPR interference or activation screen, this step is not needed.
 
-### MAGeCK rra
+The pipeline currently supports 3 algorithms to detect gene essentiality, MAGeCK rra, MAGeCK mle and BAGEL2. MAGeCK MLE (Maximum Likelihood Estimation) and MAGeCK RRA (Robust Ranking Aggregation) are two different methods provided by the MAGeCK software package to analyze CRISPR-Cas9 screens. BAGEL2 identifies gene essentiality through Bayesian Analysis.
+We recommend to run MAGeCK MLE and BAGEL2 as these are the most used and most recent algorithms to determine gene essentiality.
+
+### Running CRISPRcleanR
+
+CRISPRcleanR is used for gene count normalization and the removal of biases for genomic segments for which copy numbers are amplified. Currently, the pipeline only supports annotation libraries already present in the R package and which can be found [here](https://github.com/francescojm/CRISPRcleanR/blob/master/Reference_Manual.pdf). To use CRISPRcleanR normalization, use `--crisprcleanr library`, `library` being the exact name as the library in the CRISPRcleanR documentation (e.g: "AVANA_Library").
 
-MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, `--rra_contrasts` contains two columns : treatment and reference. These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them. Please find an example here below :
+
+### Running MAGeCK MLE and BAGEL2 with a contrast file
+
+To run both MAGeCK MLE and BAGEL2, you can provide a contrast file with the flag `--contrasts` with the mandatory headers "treatment" and "reference". These two columns should be separated with a dot comma (;) and contain the `csv` extension. You can also integrate several samples/conditions by comma separating them in each column. Please find an example here below :
 
 | reference         | treatment             |
 | ----------------- | --------------------- |
@@ -59,24 +73,22 @@ MAGeCK RRA performs robust ranking aggregation to identify genes that are consis
 
 A full example can be found [here](https://raw.githubusercontent.com/nf-core/test-datasets/crisprseq/testdata/full_test/samplesheet_full.csv).
 
-### cutadapt
+### Running MAGeCK RRA only
 
-MAGeCK is normally able to automatically determine the trimming length and sgRNA length, in most cases. Therefore, you don't need to go to this step unless MAGeCK fails to do so by itself. If the nucleotide length in front of sgRNA varies between different reads, you can use cutadapt to remove the adapter sequences by using the flag `--cutadapt ADAPTER`.
+MAGeCK RRA performs robust ranking aggregation to identify genes that are consistently ranked highly across multiple replicate screens. To run MAGeCK rra, you can define the contrasts as previously stated in the last section and also specify  `--rra` .
 
-### MAGeCK mle
+### Running MAGeCK MLE only
 
-MAGeCK MLE uses a maximum likelihood estimation approach to estimate the effects of gene knockout on cell fitness. It models the read count data of guide RNAs targeting each gene and estimates the dropout probability for each gene. MAGeCK mle requires a design matrix. The design matrix is a `txt` file indicating the effects of different conditions on different samples.
-An [example design matrix](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/design_matrix.txt) has been provided with the pipeline.
+If you wish to run MAGeCK MLE only, you can specify several design matrices (where you state which comparisons you wish to run) with the flag `--mle_design_matrix`.
+MAGeCK MLE uses a maximum likelihood estimation approach to estimate the effects of gene knockout on cell fitness. It models the read count data of guide RNAs targeting each gene and estimates the dropout probability for each gene.
+MAGeCK MLE requires one or several design matrices. The design matrix is a `txt` file indicating the effects of different conditions on different samples.
+An [example design matrix](https://github.com/nf-core/test-datasets/blob/crisprseq/testdata/design_matrix.txt) has been provided with the pipeline. The row names need to match the condition stated in the sample sheet.
 If there are several designs to be run, you can input a folder containing all the design matrices. The output results will automatically take the name of the design matrix, so make sure you give a meaningful name to the file, for instance "Drug_vs_control.txt".
 
-### Running CRISPRcleanR
-
-CRISPRcleanR is used for gene count normalization and the removal of biases for genomic segments for which copy numbers are amplified. Currently, the pipeline only supports annotation libraries already present in the R package and which can be found [here](https://github.com/francescojm/CRISPRcleanR/blob/master/Reference_Manual.pdf). To use CRISPRcleanR normalization, use `--crisprcleanr library`, `library` being the exact name as the library in the CRISPRcleanR documentation (e.g: "AVANA_Library").
-
-### BAGEL2
+### Running BAGEL2
 
 BAGEL2 (Bayesian Analysis of Gene Essentiality with Location) is a computational tool developed by the Hart Lab at Harvard University. It is designed for analyzing large-scale genetic screens, particularly CRISPR-Cas9 screens, to identify genes that are essential for the survival or growth of cells under different conditions. BAGEL2 integrates information about the location of guide RNAs within a gene and leverages this information to improve the accuracy of gene essentiality predictions.
-BAGEL2 uses the same contrasts from `--rra_contrasts`.
+BAGEL2 uses the same contrasts from `--contrasts`.
 
 Note that the pipeline will create the following files in your working directory:
 

diff --git a/modules/local/matricescreation.nf b/modules/local/matricescreation.nf
@@ -0,0 +1,51 @@
+process MATRICESCREATION {
+    label 'process_single'
+
+    conda 'r-ggplot2=3.4.3 bioconductor-shortread=1.58.0 r-ggpubr=0.6.0 r-ggmsa=1.0.2 r-seqmagick=0.1.6 r-tidyr=1.3.0 r-ggseqlogo=0.1 r-cowplot=1.1.1 r-seqinr=4.2_30 r-optparse=1.7.3 r-dplyr=1.1.2 r-plyr=1.8.8 r-stringr=1.5.0 r-plotly=4.10.2'
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/mulled-v2-6de07928379e6eface08a0019c4a1d6b5192e805:0d77388f37ddd923a087f7792e30e83ab54c918c-0' :
+        'biocontainers/mulled-v2-6de07928379e6eface08a0019c4a1d6b5192e805:0d77388f37ddd923a087f7792e30e83ab54c918c-0' }"
+
+    input:
+    path(contrasts)
+
+    output:
+    path("*.txt"), emit: design_matrix
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+
+    """
+    #!/usr/bin/env Rscript
+    #### author: Laurence Kuhlburger
+    #### Released under the MIT license. See git repository (https://github.com/nf-core/crisprseq) for full license text.
+    ####
+    #### Orient a reference sequence according to reads orientation.
+
+    data <- read.table("$contrasts", header = TRUE, sep = ";", stringsAsFactors = FALSE)
+    # Loop through each row in the data
+    for (i in 1:nrow(data)) {
+        # Extract control and treatment samples for the current row
+        control_samples <- unlist(strsplit(data\$reference[i], ","))
+        treatment_samples <- unlist(strsplit(data\$treatment[i], ","))
+
+        # Create a vector of all unique samples
+        all_samples <- unique(c(control_samples, treatment_samples))
+
+        # Initialize a matrix to store the design matrix
+        design_matrix <- data.frame(matrix(0, nrow = length(all_samples), ncol = 3,
+                        dimnames = list(all_samples, c("Samples", "baseline", paste0(gsub(',', '_', data\$treatment[i] ),"_vs_", data\$reference[i])))))
+
+        # Set baseline and treatment values in the design matrix
+        design_matrix[, "Samples"] <- rownames(design_matrix)
+        design_matrix\$baseline <- 1
+        design_matrix[treatment_samples, paste0(gsub(',', '_', data\$treatment[1] ),"_vs_",gsub(",","_",data\$reference[i]))] <- 1
+
+        # Print the design matrix to a file
+        output_file <- paste0(gsub(',', '_', data\$treatment[1] ),"_vs_",gsub(",","_",data\$reference[i]),".txt")
+        write.table(design_matrix, output_file, sep = "\t", quote = FALSE, row.names=FALSE)
+    }
+    """
+}
diff --git a/nextflow.config b/nextflow.config
@@ -16,12 +16,13 @@ params {
     protospacer                = null
     library                    = null
     crisprcleanr               = null
+    contrasts                  = null
     cutadapt                   = null
-    rra_contrasts              = null
     mle_design_matrix          = null
     count_table                = null
     min_reads                  = 30
     min_targeted_genes         = 3
+    rra                        = false
     bagel_reference_essentials =    'https://raw.githubusercontent.com/hart-lab/bagel/master/CEGv2.txt'
     bagel_reference_nonessentials = 'https://raw.githubusercontent.com/hart-lab/bagel/master/NEGv1.txt'
 
@@ -200,6 +201,7 @@ profiles {
     test_screening_full   { includeConfig 'conf/test_screening_full.config' }
     test_screening        { includeConfig 'conf/test_screening.config' }
     test_screening_paired { includeConfig 'conf/test_screening_paired.config' }
+    test_screening_rra    { includeConfig 'conf/test_screening_rra.config' }
 }
 
 // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile

diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -156,12 +156,17 @@
                     "exists": true,
                     "description": "Design matrix used for MAGeCK MLE to call essential genes under multiple conditions while considering sgRNA knockout efficiency"
                 },
-                "rra_contrasts": {
+                "contrasts": {
                     "type": "string",
                     "format": "file-path",
                     "exists": true,
                     "description": "Comma-separated file with the conditions to be compared. The first one will be the reference (control)"
                 },
+                "rra": {
+                    "type": "boolean",
+                    "description": "Parameter indicating if MAGeCK RRA should be ran instead of MAGeCK MLE.",
+                    "default": false
+                },
                 "count_table": {
                     "type": "string",
                     "format": "file-path",

diff --git a/workflows/crisprseq_screening.nf b/workflows/crisprseq_screening.nf
@@ -24,6 +24,14 @@ if(params.mle_design_matrix) {
         .set { ch_design }
 }
 
+if(params.rra && params.mle_design_matrix) {
+    warning "mle_design_matrix will only be used for the MAGeCK MLE computations"
+    }
+
+if(params.rra && !params.contrasts) {
+    error "Please also provide the contrasts table to compare the samples for MAGeCK RRA"
+    }
+
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     CONFIG FILES
@@ -67,6 +75,7 @@ include { BAGEL2_FC                   } from '../modules/local/bagel2/fc'
 include { BAGEL2_BF                   } from '../modules/local/bagel2/bf'
 include { BAGEL2_PR                   } from '../modules/local/bagel2/pr'
 include { BAGEL2_GRAPH                } from '../modules/local/bagel2/graph'
+include { MATRICESCREATION            } from '../modules/local/matricescreation'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -176,8 +185,8 @@ workflow CRISPRSEQ_SCREENING {
         }.set { ch_counts }
     }
 
-    if(params.rra_contrasts) {
-        Channel.fromPath(params.rra_contrasts)
+    if(params.rra) {
+        Channel.fromPath(params.contrasts)
             .splitCsv(header:true, sep:';' )
             .set { ch_contrasts }
         counts = ch_contrasts.combine(ch_counts)
@@ -194,8 +203,8 @@ workflow CRISPRSEQ_SCREENING {
         ch_versions = ch_versions.mix(MAGECK_GRAPHRRA.out.versions)
     }
 
-    if(params.rra_contrasts) {
-        Channel.fromPath(params.rra_contrasts)
+    if(params.contrasts) {
+        Channel.fromPath(params.contrasts)
             .splitCsv(header:true, sep:';' )
             .set { ch_bagel }
     counts = ch_bagel.combine(ch_counts)
@@ -235,8 +244,14 @@ workflow CRISPRSEQ_SCREENING {
 
     }
 
-    if(params.mle_design_matrix) {
-        ch_mle = ch_counts.combine(ch_design)
+    if((params.mle_design_matrix) || (params.contrasts && !params.rra)) {
+        if(params.mle_design_matrix) {
+            ch_mle = ch_counts.combine(ch_design)
+            }
+        if(params.contrasts) {
+            MATRICESCREATION(params.contrasts)
+            ch_mle = ch_counts.combine(MATRICESCREATION.out.design_matrix)
+        }
         ch_mle.map {
             it -> [[id: it[1].getBaseName()], it[0], it[1]]
         }.set { ch_designed_mle }