main.nf

#!/usr/bin/env nextflow
/*
========================================================================================
                         nf-core/demultiplex
========================================================================================
 nf-core/demultiplex Analysis Pipeline.
 #### Homepage / Documentation
 https://github.com/nf-core/demultiplex
----------------------------------------------------------------------------------------
*/

/**
 * Print the pipeline banner followed by the command line usage text.
 *
 * Writes to the Nextflow log at INFO level and returns nothing. The banner
 * comes from nfcoreHeader(), which is defined elsewhere in this script.
 */
def helpMessage() {
    // TODO nf-core: Add to this help message with new command line parameters
    log.info nfcoreHeader()
    log.info"""

    Usage:

    The typical command for running the pipeline is as follows:

    nextflow run nf-core/demultiplex --input samplesheet.csv  --run_dir /path/to/run/directory/ -profile docker

    Mandatory arguments:
      --input [file]                      Full path to samplesheet
      --run_dir [file]                    Full path to run directory (will parse name of run from the last directory in path)
      -profile [str]                      Configuration profile to use. Can use multiple (comma separated)
                                          Available: docker, singularity, test, awsbatch, <institute> and more

    References                            If not specified in the configuration file or you wish to overwrite any of the references.
      --tenx_genomes_base [file]          Base directory for 10x genomes
      --fastq_screen_conf [file]          Full path to fastq_screen genome config file
      --kraken_db [file]                  Full path to Kraken2 DB for contaminant screening

    bcl2fastq
      --adapter_stringency                The minimum match rate that would trigger the masking or trimming process
      --barcode_mismatches                Number of allowed mismatches per index
      --create_fastq_for_indexreads       Create FASTQ files also for Index Reads. 0 (False default) 1 (True).
      --ignore_missing_bcls               Missing or corrupt BCL files are ignored. Assumes 'N'/'#' for missing calls
      --ignore_missing_filter             Missing or corrupt filter files are ignored. Assumes Passing Filter for all clusters in tiles where filter files are missing
      --ignore_missing_positions          Missing or corrupt positions files are ignored. If corresponding position files are missing, bcl2fastq writes unique coordinate positions in FASTQ header.
      --minimum_trimmed_readlength        Minimum read length after adapter trimming.
      --mask_short_adapter_reads          This option applies when a read is shorter than the length specified by --minimum-trimmed-read-length (note that the read does not specifically have to be trimmed for this option to trigger, it need only fall below the --minimum-trimmed-read-length for any reason).
      --tiles                             The --tiles argument takes a regular expression to select for processing only a subset of the tiles available in the flow cell Multiple selections can be made by separating the regular expressions with commas
      --use_bases_mask                    The --use-bases-mask string specifies how to use each cycle
      --with_failed_reads                 Include all clusters in the output, even clusters that are non-PF. These clusters would have been excluded by default
      --write_fastq_reversecomplement     Generate FASTQ files containing reverse complements of actual data.
      --no_bgzf_compression               Turn off BGZF compression, and use GZIP for FASTQ files. BGZF compression allows downstream applications to decompress in parallel.
      --fastq_compression_level           Zlib compression level (1–9) used for FASTQ files.
      --no_lane_splitting                 Do not split FASTQ files by lane.
      --find_adapters_withsliding_window  Find adapters with simple sliding window algorithm. Insertions and deletions of bases inside the adapter sequence are not handled.

    QC
      --kraken_db_size [int]              Specify size parameters to build the Kraken database if no database available
      --skip_fastqc [bool]                Skip FastQC
      --skip_kraken2 [bool]               Skip Kraken2
      --skip_multiqc [bool]               Skip MultiQC
      --skip_multiqc_stats [bool]         Exclude general statistics table from MultiQC report

    Other options:
      --outdir [file]                     The output directory where the results will be saved
      --email [email]                     Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits
      --email_on_fail [email]             Same as --email, except only send mail if the workflow is not successful
      --max_multiqc_email_size [str]      Threshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB)
      -name [str]                         Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic

    AWSBatch options:
        --awsqueue [str]                  The AWSBatch JobQueue that needs to be set when running on AWSBatch
        --awsregion [str]                 The AWS Region for your AWS Batch job to run on
        --awscli [str]                    Path to the AWS CLI tool
    """.stripIndent()
}

// Show help message and stop before any input validation is attempted
if (params.help) {
    helpMessage()
    exit 0
}

/*
 * SET UP CONFIGURATION VARIABLES
 */

// Has the run name been specified by the user?
//  this has the bonus effect of catching both -name and --name
custom_runName = params.name
// A run name that does not look like "word_word" replaces the --name value
if (!(workflow.runName ==~ /[a-z]+_[a-z]+/)) {
    custom_runName = workflow.runName
}

// ////////////////////////////////////////////////////
// /* --          VALIDATE INPUTS                 -- */
// ////////////////////////////////////////////////////

// Both the sample sheet and the run directory are mandatory and must exist on disk
if (!params.input) {
    exit 1, "Sample sheet not found!"
} else {
    ss_sheet = file(params.input, checkIfExists: true)
}

if (!params.run_dir) {
    exit 1, "Run directory not found!"
} else {
    runDir = file(params.run_dir, checkIfExists: true)
}

// The run name is the last path component of the run directory
runName = runDir.getName()

// Stage config files
if (params.fastq_screen_conf) {
    ch_fastq_screen_config = file(params.fastq_screen_conf, checkIfExists: true)
}
ch_multiqc_config = file("$baseDir/assets/multiqc_config.yaml", checkIfExists: true)
if (params.multiqc_config) {
    ch_multiqc_custom_config = Channel.fromPath(params.multiqc_config, checkIfExists: true)
} else {
    ch_multiqc_custom_config = Channel.empty()
}
ch_output_docs = file("$baseDir/docs/output.md", checkIfExists: true)

// AWSBatch sanity checking
if (workflow.profile.contains('awsbatch')) {
    // Queue and region must both be supplied
    if (!(params.awsqueue && params.awsregion)) {
        exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!"
    }
    // Check outdir paths to be S3 buckets if running on AWSBatch
    // related: https://github.com/nextflow-io/nextflow/issues/813
    if (!params.outdir.startsWith('s3:')) {
        exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!"
    }
    // Prevent trace files to be stored on S3 since S3 does not support rolling files.
    if (params.tracedir.startsWith('s3:')) {
        exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles."
    }
}

// Header log info
// Prints the banner, then builds an ordered map of label -> value that is
// logged as one "label: value" line per entry. Entries appear in the log in
// the order they are inserted below.
log.info nfcoreHeader()
def summary = [:]
if (workflow.revision) summary['Pipeline Release'] = workflow.revision
// Fall back to the Nextflow-generated run name when no custom name is set
summary['Run Name']                          = custom_runName ?: workflow.runName
// TODO nf-core: Report custom parameters here
summary['Samplesheet']                       = params.input
summary['Run Directory']                     = params.run_dir
summary['10X Genome Dir']                    = params.tenx_genomes_base
// bcl2fastq settings: optional flags are only listed when they are set
summary['Adapter Stringency']                = params.adapter_stringency
summary['Barcode Mismatch']                  = params.barcode_mismatches
if (params.create_fastq_for_indexreads)      summary['FastQ Index Reads'] = params.create_fastq_for_indexreads
if (params.ignore_missing_bcls)              summary['Skip Missing BCLs'] = params.ignore_missing_bcls
if (params.ignore_missing_filter)            summary['Skip Missing Filter'] = params.ignore_missing_filter
if (params.ignore_missing_positions)         summary['Skip Missing Positions'] = params.ignore_missing_positions
summary['Min Trim Read Length']              = params.minimum_trimmed_readlength
summary['Mask Short Adapt Reads']            = params.mask_short_adapter_reads
if (params.no_bgzf_compression)              summary['No BGZF Compress'] = params.no_bgzf_compression
if (params.tiles) summary['Tiles']           = params.tiles
if (params.use_bases_mask)                   summary['Bases Mask'] = params.use_bases_mask
if (params.with_failed_reads)                summary['With Failed Reads'] = params.with_failed_reads
if (params.write_fastq_reversecomplement)    summary['Write FastQ Rev Comp'] = params.write_fastq_reversecomplement
summary['FastQ Compress Level']              = params.fastq_compression_level
if (params.no_lane_splitting)                summary['No Lane Splitting'] = params.no_lane_splitting
if (params.find_adapters_withsliding_window) summary['Adapt Sliding Window'] = params.find_adapters_withsliding_window
// QC settings
if (params.fastq_screen_conf)                summary['FastQ Screen Conf'] = params.fastq_screen_conf
if (params.kraken_db)                        summary['Kraken2 DB'] = params.kraken_db
if (params.skip_fastqc)                      summary['Skip FastQC'] = params.skip_fastqc
if (params.skip_kraken2)                     summary['Skip Kraken2'] = params.skip_kraken2
if (params.skip_multiqc)                     summary['Skip MultiQC'] = params.skip_multiqc
if (params.skip_multiqc_stats)               summary['Skip MultiQC Stats'] = params.skip_multiqc_stats
// Execution environment
summary['Max Resources']                     = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job"
if (workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container"
summary['Output dir']                        = params.outdir
summary['Launch dir']                        = workflow.launchDir
summary['Working dir']                       = workflow.workDir
summary['Script dir']                        = workflow.projectDir
summary['User']                              = workflow.userName
if (workflow.profile.contains('awsbatch')) {
    summary['AWS Region']                    = params.awsregion
    summary['AWS Queue']                     = params.awsqueue
    summary['AWS CLI']                       = params.awscli
}
summary['Config Profile']                    = workflow.profile
if (params.config_profile_description) summary['Config Description'] = params.config_profile_description
if (params.config_profile_contact)     summary['Config Contact']     = params.config_profile_contact
if (params.config_profile_url)         summary['Config URL']         = params.config_profile_url
// E-mail settings are reported together as soon as either address is given
if (params.email || params.email_on_fail) {
    summary['E-mail Address']    = params.email
    summary['E-mail on failure'] = params.email_on_fail
    summary['MultiQC maxsize']   = params.max_multiqc_email_size
}
// Labels are right-padded to 22 characters so the values line up in the log
log.info summary.collect { k,v -> "${k.padRight(22)}: $v" }.join("\n")
log.info "-\033[2m--------------------------------------------------\033[0m-"

// Check the hostnames against configured profiles
checkHostname()

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --               Sample Sheet Reformatting and Check                   -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 1 - Check sample sheet for 10X samples.
 *        - This will pull out 10X samples into new samplesheet.
 */
process reformat_samplesheet {
    // Tagged with the file name of the user-supplied sample sheet
    label 'process_low'
    tag "${raw_sheet.name}"

    input:
    file raw_sheet from ss_sheet

    output:
    // Each result is emitted into several channels so it can be read more than once
    file "*.standard.csv" into standard_samplesheet1, standard_samplesheet2, standard_samplesheet3, standard_samplesheet4
    file "*.bcl2fastq.txt" into bcl2fastq_results1, bcl2fastq_results2, bcl2fastq_results3
    file "*.tenx.txt" into tenx_results1, tenx_results2, tenx_results3, tenx_results4, tenx_results5
    // The 10X sample sheet is optional output
    file "*tenx.csv" optional true into tenx_samplesheet1, tenx_samplesheet2

    script:
    """
    reformat_samplesheet.py --samplesheet "${raw_sheet}"
    """
}

/*
 * STEP 2 - Check samplesheet for single and dual mixed lanes and long and short
 *          indexes on same lanes and output pass or fail file to next processes.
 */
process check_samplesheet {
    // Runs the sample sheet checker on the standard (non-10X) sample sheet
    label 'process_low'
    tag "${std_sheet.name}"

    input:
    file std_sheet from standard_samplesheet1

    output:
    // Downstream processes test this file's name against /^pass.*/ and /^fail.*/
    file "*.txt" into resultChannel1, resultChannel2, resultChannel3, resultChannel4, resultChannel5

    script:
    """
    check_samplesheet.py --samplesheet "${std_sheet}"
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --               Problem Sample Sheet Processes                        -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 3 - If the previous process finds samples that will cause problems, this process
 *          will remove the problem samples from the sample sheet and create a new one.
 *          ONLY RUNS WHEN SAMPLESHEET FAILS.
 */
process make_fake_SS {
    label 'process_low'
    tag "problem_samplesheet"

    input:
    file std_sheet from standard_samplesheet2
    file check_result from resultChannel1

    // Only executed when the check result file name starts with "fail"
    when:
    check_result.name =~ /^fail.*/

    output:
    file "*.csv" into fake_samplesheet
    file "*.txt" into problem_samples_list1, problem_samples_list2

    script:
    """
    create_falseSS.py --samplesheet "${std_sheet}"
    """
}

/*
 * STEP 4 -  Running bcl2fastq on the false_samplesheet with problem samples removed.
 *           ONLY RUNS WHEN SAMPLESHEET FAILS.
 */
process bcl2fastq_problem_SS {
    label 'process_high'
    tag "problem_samplesheet"

    input:
    file reduced_sheet from fake_samplesheet
    file check_result from resultChannel2

    // Only executed when the check result file name starts with "fail"
    when:
    check_result.name =~ /^fail.*/

    output:
    // Only the statistics JSON is kept from this run
    file "Stats/Stats.json" into stats_json_file

    script:
    // Fixed settings: zero barcode mismatches and hard-coded thread counts
    """
    bcl2fastq \\
        --runfolder-dir ${runDir} \\
        --output-dir . \\
        --sample-sheet ${reduced_sheet} \\
        --ignore-missing-bcls \\
        --ignore-missing-filter \\
        --with-failed-reads \\
        --barcode-mismatches 0 \\
        --loading-threads 8 \\
        --processing-threads 24 \\
        --writing-threads 6
    """
}

/*
 * STEP 5 -  Parsing .json file output from the bcl2fastq run to access the unknown barcodes section.
 *           The barcodes that match the short indexes and/or missing index 2 with the highest count
 *           to remake the sample sheet so that bcl2fastq can run properly.
 *           ONLY RUNS WHEN SAMPLESHEET FAILS.
 */
// Created up front so bcl2fastq_default can reference it; parse_jsonfile writes into it
updated_samplesheet2 = Channel.create()
process parse_jsonfile {
    label 'process_low'
    tag "problem_samplesheet"

    input:
    file stats_json from stats_json_file
    file std_sheet from standard_samplesheet3
    file problem_list from problem_samples_list1
    file check_result from resultChannel3

    // Only executed when the check result file name starts with "fail"
    when:
    check_result.name =~ /^fail.*/

    output:
    file "*.csv" into updated_samplesheet1, updated_samplesheet2

    script:
    """
    parse_json.py --samplesheet "${std_sheet}" --jsonfile "${stats_json}" --problemsamples "${problem_list}"
    """
}

/*
 * STEP 6 -  Checking the remade sample sheet.
 *           If this fails again the pipeline will exit and fail.
 *           ONLY RUNS WHEN SAMPLESHEET FAILS.
 */
// Created up front so bcl2fastq_default can reference it; recheck_samplesheet writes into it
PROBLEM_SS_CHECK2 = Channel.create()
process recheck_samplesheet {
    label 'process_low'
    tag "problem_samplesheet"

    input:
    file orig_sheet from ss_sheet
    file remade_sheet from updated_samplesheet1
    file problem_list from problem_samples_list2
    file check_result from resultChannel4

    // Only executed when the check result file name starts with "fail"
    when:
    check_result.name =~ /^fail.*/

    output:
    file "*.txt" into PROBLEM_SS_CHECK2

    script:
    """
    recheck_samplesheet.py --samplesheet "${orig_sheet}" --newsamplesheet "${remade_sheet}" --problemsamples "${problem_list}"
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --               Single Cell Processes                                 -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 7 - CellRanger MkFastQ.
 *          ONLY RUNS WHEN ANY TYPE OF 10X SAMPLESHEET EXISTS.
 */
process cellRangerMkFastQ {
    label 'process_high'
    tag "${tenx_sheet.name}"
    publishDir path: "${params.outdir}/${runName}", mode: 'copy'

    input:
    file tenx_sheet from tenx_samplesheet1
    file tenx_flag from tenx_results1

    // Only executed when the 10X flag file name starts with "true"
    when:
    tenx_flag.name =~ /^true.*/

    output:
    // Undetermined reads sit directly in fastq_path; sample reads sit in sub-directories
    file "*/outs/fastq_path/Undetermined_*.fastq.gz" into cr_undetermined_default_fq_ch, cr_undetermined_fastqs_screen_ch, cr_undetermined_move_fq_ch mode flatten
    file "*/outs/fastq_path/*/**.fastq.gz" into cr_fastqs_count_ch, cr_fastqs_fqc_ch, cr_fastqs_screen_ch, cr_fastqs_copyfs_ch mode flatten
    file "*/outs/fastq_path/Reports" into cr_b2fq_default_reports_ch
    file "*/outs/fastq_path/Stats" into cr_b2fq_default_stats_ch

    script:
    """
    cellranger mkfastq --id mkfastq --run ${runDir} --samplesheet ${tenx_sheet}
    """
}

/*
 * STEP 8 - Copy CellRanger FastQ files to new folder.
 *          ONLY RUNS WHEN ANY TYPE OF 10X SAMPLES EXISTS.
 */
/**
 * Extract the sample name from a CellRanger FASTQ path.
 *
 * The path must contain ".../outs/fastq_path/<dir>/" and end in
 * "<sample>_S<n>_L00<d>_<I|R><1-3>_001.fastq.gz".
 *
 * @param fqfile FASTQ path (compared through its string form)
 * @return the captured sample name, or fqfile unchanged when the pattern does not match
 */
def getCellRangerSampleName(fqfile) {
    def matcher = fqfile =~ /.*\/outs\/fastq_path\/.*\/(.+)_S\d+_L00\d_[IR][123]_001\.fastq\.gz/
    return matcher.find() ? matcher.group(1) : fqfile
}

/**
 * Extract the project name from a CellRanger FASTQ path.
 *
 * The project is the first directory below ".../outs/fastq_path/". Letters,
 * digits, underscores and hyphens are accepted, so a hyphenated project name
 * is captured instead of falling through to the "no match" result.
 *
 * @param fqfile FASTQ path (compared through its string form)
 * @return the project directory name, or fqfile unchanged when the pattern does not match
 */
def getCellRangerProjectName(fqfile) {
    def projectName = (fqfile =~ /.*\/outs\/fastq_path\/([a-zA-Z0-9_-]*)\//)
    if (projectName.find()) {
        return projectName.group(1)
    }
    return fqfile
}

// [project name, sample name, file name] for every CellRanger sample FASTQ
cr_fastqs_copyfs_tuple_ch = cr_fastqs_copyfs_ch.map { fq ->
    [ getCellRangerProjectName(fq), getCellRangerSampleName(fq), fq.getFileName() ]
}
// ["Undetermined", file name] for every CellRanger undetermined FASTQ
cr_undetermined_fastqs_copyfs_tuple_ch = cr_undetermined_move_fq_ch.map { fq ->
    [ "Undetermined", fq.getFileName() ]
}

/*
 * STEP 9 - CellRanger count.
 *          ONLY RUNS WHEN A 10X SAMPLESHEET EXISTS.
 */
/**
 * Extract the project-level FASTQ directory from a CellRanger FASTQ path.
 *
 * Returns everything up to and including the first directory below
 * ".../outs/fastq_path/", without a trailing slash. Letters, digits,
 * underscores and hyphens are accepted in that directory name, so hyphenated
 * project names resolve to a directory rather than to the input file.
 *
 * @param fqfile FASTQ path (compared through its string form)
 * @return the directory prefix, or fqfile unchanged when the pattern does not match
 */
def getCellRangerFastqPath(fqfile) {
    def fastqPath = (fqfile =~ /(.*\/outs\/fastq_path\/[a-zA-Z0-9_-]*)\//)
    if (fastqPath.find()) {
        return fastqPath.group(1)
    }
    return fqfile
}

// One [Sample_ID, Sample_Project, ReferenceGenome, DataAnalysisType] entry per 10X sample sheet row
cr_samplesheet_info_ch = tenx_samplesheet2
    .splitCsv(header: true, skip: 1)
    .map { row -> [ row.Sample_ID, row.Sample_Project, row.ReferenceGenome, row.DataAnalysisType ] }

// One distinct [sample name, FASTQ directory] entry per CellRanger sample
cr_fqname_fqfile_ch = cr_fastqs_count_ch
    .map { fq -> [ getCellRangerSampleName(fq), getCellRangerFastqPath(fq) ] }
    .unique()

// Pair both channels on their first element and flatten into the tuple cellRangerCount reads
cr_fqname_fqfile_ch
    .phase(cr_samplesheet_info_ch)
    .map { fastqEntry, sheetEntry ->
        tuple(fastqEntry[0], sheetEntry[1], sheetEntry[2], sheetEntry[3], fastqEntry[1])
    }
    .set { cr_grouped_fastq_dir_sample_ch }

process cellRangerCount {
    label 'process_high'
    errorStrategy 'ignore'
    tag "${projectName}/${sampleID}"
    // Results are published only when the data type matches 10X-3prime
    publishDir "${params.outdir}/${runName}", mode: 'copy',
    saveAs: { filename ->
        (dataType =~ /10X-3prime/) ? "count/${projectName}/$filename" : null
    }

    input:
    set sampleID, projectName, refGenome, dataType, fastqDir from cr_grouped_fastq_dir_sample_ch
    file tenx_flag from tenx_results3

    // Only executed when the 10X flag file name starts with "true"
    when:
    tenx_flag.name =~ /^true.*/

    output:
    file "${sampleID}/" into count_output

    script:
    // Look up the reference entry for this sample's genome (false when absent)
    genome_conf = params.cellranger_genomes.get(refGenome, false)
    """
    cellranger count --id=$sampleID --transcriptome=${genome_conf.tenx_transcriptomes} --fastqs=$fastqDir --sample=$sampleID
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --               Main Demultiplexing Processes                         -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 10 - Running bcl2fastq on the remade samplesheet or a sample sheet that
 *           passed the initial check. bcl2fastq parameters can be changed when
 *           starting up the pipeline.
 *           ONLY RUNS WHEN SAMPLES REMAIN AFTER Single Cell SAMPLES ARE SPLIT OFF
 *           INTO SEPARATE SAMPLE SHEETS.
 */
process bcl2fastq_default {
    tag "${std_samplesheet.name}"
    publishDir path: "${params.outdir}/${runName}/fastq", mode: 'copy'

    label 'process_high'

    input:
    // Re-check result and remade sheet only exist on the failure path;
    // ifEmpty supplies a placeholder value otherwise.
    file result2 from PROBLEM_SS_CHECK2.ifEmpty { true }
    file result from resultChannel5
    file std_samplesheet from standard_samplesheet4
    file sheet from updated_samplesheet2.ifEmpty { true }
    file bcl_result from bcl2fastq_results1

    // Only executed when the flag file is named true.bcl2fastq.txt
    when:
    bcl_result.name =~ /^true.bcl2fastq.txt/

    output:
    // Sample FASTQs live in sub-directories, undetermined FASTQs in the task root
    file "*/**{R1,R2,R3}_001.fastq.gz" into fastqs_fqc_ch, fastqs_screen_ch, fastq_kraken_ch mode flatten
    file "*/**{I1,I2}_001.fastq.gz" optional true into fastqs_idx_ch
    file "*{R1,R2,R3}_001.fastq.gz" into undetermined_default_fq_ch, undetermined_default_fastqs_screen_ch, undetermined_fastq_kraken_ch mode flatten
    file "*{I1,I2}_001.fastq.gz" optional true into undetermined_idx_fq_ch
    file "Reports" into b2fq_default_reports_ch
    file "Stats" into b2fq_default_stats_ch

    script:
    // Optional flags render as an empty string when the parameter is unset
    ignore_miss_bcls = params.ignore_missing_bcls ? "--ignore-missing-bcls " : ""
    ignore_miss_filt = params.ignore_missing_filter ? "--ignore-missing-filter " : ""
    ignore_miss_pos = params.ignore_missing_positions ? "--ignore-missing-positions " : ""
    bases_mask = params.use_bases_mask ? "--use-bases-mask ${params.use_bases_mask} " : ""
    tiles = params.tiles ? "--tiles ${params.tiles} " : ""
    fq_index_rds = params.create_fastq_for_indexreads ? "--create-fastq-for-index-reads " : ""
    failed_rds = params.with_failed_reads ? "--with-failed-reads " : ""
    fq_rev_comp = params.write_fastq_reversecomplement ? "--write-fastq-reverse-complement" : ""
    no_bgzf_comp = params.no_bgzf_compression ? "--no-bgzf-compression " : ""
    no_lane_split = params.no_lane_splitting ? "--no-lane-splitting " : ""
    slide_window_adapt =  params.find_adapters_withsliding_window ? "--find-adapters-with-sliding-window " : ""

    if (result.name =~ /^pass.*/){
        // Initial check passed: demultiplex with the standard sample sheet
        """
        bcl2fastq \\
            --runfolder-dir ${runDir} \\
            --output-dir . \\
            --sample-sheet ${std_samplesheet} \\
            --adapter-stringency ${params.adapter_stringency} \\
            $tiles \\
            $ignore_miss_bcls \\
            $ignore_miss_filt \\
            $ignore_miss_pos \\
            --minimum-trimmed-read-length ${params.minimum_trimmed_readlength} \\
            --mask-short-adapter-reads ${params.mask_short_adapter_reads} \\
            --fastq-compression-level ${params.fastq_compression_level} \\
            --barcode-mismatches ${params.barcode_mismatches} \\
            $bases_mask $fq_index_rds $failed_rds  \\
            $fq_rev_comp $no_bgzf_comp $no_lane_split $slide_window_adapt
        """
    } else if (result2.name =~ /^fail.*/){
        // The remade sheet failed the re-check as well: abort the pipeline
        exit 1, "Remade sample sheet still contains problem samples"
    } else if (result.name =~ /^fail.*/){
        // Initial check failed but the remade sheet is usable: demultiplex with it.
        // Every line needs a continuation so the optional flags reach bcl2fastq.
        """
        bcl2fastq \\
            --runfolder-dir ${runDir} \\
            --output-dir . \\
            --sample-sheet ${sheet} \\
            --adapter-stringency ${params.adapter_stringency} \\
            $tiles \\
            $ignore_miss_bcls \\
            $ignore_miss_filt \\
            $ignore_miss_pos \\
            --minimum-trimmed-read-length ${params.minimum_trimmed_readlength} \\
            --mask-short-adapter-reads ${params.mask_short_adapter_reads} \\
            --fastq-compression-level ${params.fastq_compression_level} \\
            --barcode-mismatches ${params.barcode_mismatches} \\
            $bases_mask $fq_index_rds $failed_rds  \\
            $fq_rev_comp $no_bgzf_comp $no_lane_split $slide_window_adapt
        """
    }
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                           FastQC                                    -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 11 - FastQC
 */
// Key every FASTQ by project: the parent directory name for bcl2fastq output,
// the CellRanger project name for 10X output, a fixed label for undetermined reads
fqname_fqfile_ch = fastqs_fqc_ch.map { fq -> [ fq.getParent().getName(), fq ] }
undetermined_default_fqfile_tuple_ch = undetermined_default_fq_ch.map { fq -> [ "Undetermined_default", fq ] }
cr_fqname_fqfile_fqc_ch = cr_fastqs_fqc_ch.map { fq -> [ getCellRangerProjectName(fq), fq ] }
cr_undetermined_default_fq_tuple_ch = cr_undetermined_default_fq_ch.map { fq -> [ "Undetermined_default", fq ] }

// Merge all four sources into the single FastQC input channel
fastqcAll = Channel.empty()
fastqcAll_ch = fastqcAll.mix(
    fqname_fqfile_ch,
    undetermined_default_fqfile_tuple_ch,
    cr_fqname_fqfile_fqc_ch,
    cr_undetermined_default_fq_tuple_ch
)

process fastqc {
    label 'process_high'
    tag "${projectName}"
    publishDir path: "${params.outdir}/${runName}/fastqc/${projectName}", mode: 'copy'

    input:
    set val(projectName), file(reads) from fastqcAll_ch

    output:
    // The extracted report directory keeps its project key for MultiQC
    set val(projectName), file("*_fastqc") into fqc_folder_ch, all_fcq_files_tuple
    file "*.html" into fqc_html_ch

    // Disabled with --skip_fastqc
    when:
    !params.skip_fastqc

    script:
    """
    fastqc --extract ${reads}
    """
}

// function to determine if paired end or not
/**
 * Derive the name shared by the R1/R2 files of a read pair.
 *
 * @param fqfile FASTQ path ending in "_R1_001.fastq.gz" or "_R2_001.fastq.gz"
 * @return the file name without that suffix and without its directory,
 *         or fqfile unchanged when the pattern does not match
 */
def getFastqPairName(fqfile) {
    def matcher = fqfile =~ /.*\/(.+)_[R][12]_001\.fastq\.gz/
    return matcher.find() ? matcher.group(1) : fqfile
}

// fastq_kraken_ch.map { fastq -> [ getFastqPairName(fastq), fastq] }.groupTuple().set{ fastq_pairs_ch }
// process kraken2 {
//     tag "${projectName}"
//     publishDir path: "${params.outdir}/${runName}/kraken2/${projectName}", mode: 'copy'
//     label 'process_high'
//
//     when:
//     !params.skip_kraken2
//
//     input:
//     set val(projectName), file(fqFile) from fastq_pairs_ch
//
//     output:
//     set val(projectName), file("*_fastqc") into fqc_folder_ch, all_fcq_files_tuple
//     file "*.html" into fqc_html_ch
//
//     script:
//
//     """
//     kraken2 \\
//         --db $kraken_db \\
//         --threads $task.cpus \\
//         --output %s.out.txt \\
//         --report %s.report.txt
//         $single_end \\
//         --gzip-compressed %s \\
//         $fastq_files
//     """
// }

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                         FastQ Screen                                -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 12 - FastQ Screen
 */
// Key every FASTQ by project, mirroring the FastQC channel preparation
fastqs_screen_fqfile_ch = fastqs_screen_ch.map { fq -> [ fq.getParent().getName(), fq ] }
undetermined_fastqs_screen_fqfile_ch = undetermined_default_fastqs_screen_ch.map { fq -> [ "Undetermined_default", fq ] }
cr_fqname_fqfile_screen_ch = cr_fastqs_screen_ch.map { fq -> [ getCellRangerProjectName(fq), fq ] }
cr_undetermined_fastqs_screen_tuple_ch = cr_undetermined_fastqs_screen_ch.map { fq -> [ "Undetermined_default", fq ] }

// Merge all four sources into the single FastQ Screen input channel
fastqcScreenAll = Channel.empty()
grouped_fqscreen_ch = fastqcScreenAll.mix(
    fastqs_screen_fqfile_ch,
    cr_fqname_fqfile_screen_ch,
    cr_undetermined_fastqs_screen_tuple_ch,
    undetermined_fastqs_screen_fqfile_ch
)

// FastQ Screen only runs when a configuration file was supplied
if (params.fastq_screen_conf) {
    process fastq_screen {
        tag "${projectName}"
        publishDir "${params.outdir}/${runName}/fastq_screen/${projectName}", mode: 'copy'
        label 'process_high'

        input:
        set val(projectName), file(fqFile) from grouped_fqscreen_ch
        file fastq_screen_config from ch_fastq_screen_config

        output:
        set val(projectName), file("*_screen.txt") into fastq_screen_txt, all_fq_screen_txt_tuple
        file "*_screen.html" into fastq_screen_html

        script:
        // Use the staged copy of the config rather than the original host path,
        // so the file is visible inside the task work directory
        """
        fastq_screen --force --subset 200000 --conf ${fastq_screen_config} --aligner bowtie2 ${fqFile}
        """
    }
} else {
    // Closed, empty channels: downstream collect()/ifEmpty() calls can complete
    // instead of waiting on a channel that nothing ever writes to or closes
    fastq_screen_txt = Channel.empty()
    all_fq_screen_txt_tuple = Channel.empty()
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                           MultiQC                                   -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

/*
 * STEP 13.1 - MultiQC per project
 */
process multiqc {
    // Runs MultiQC over the collected FastQC and FastQ Screen results and publishes the report
    // under <outdir>/<runName>/multiqc/<projectName>.
    // NOTE(review): `projectName` is referenced by the tag, the publishDir path and the
    // `val(projectName)` output below, but no input of this process declares it — confirm where
    // it is expected to come from.
    tag "${projectName}"
    publishDir path: "${params.outdir}/${runName}/multiqc/${projectName}", mode: 'copy'

    // The whole process is skipped when --skip_multiqc is set.
    when:
    !params.skip_multiqc

    input:
    // Element [1] of every upstream tuple is gathered into one list, so all files arrive in a
    // single task; ifEmpty([]) supplies an empty list when a channel produced nothing.
    file ('fastqc/*') from fqc_folder_ch.collect{it[1]}.ifEmpty([])
    file ('fastq_screen/*') from fastq_screen_txt.collect{it[1]}.ifEmpty([])
    file (multiqc_config) from ch_multiqc_config
    file (mqc_custom_config) from ch_multiqc_custom_config.collect().ifEmpty([])
    // TODO nf-core: Add in log files from your new processes for MultiQC to find!
    file ('software_versions/*') from ch_software_versions_yaml.collect()
    // The workflow summary text is written to a single YAML file before being staged.
    file workflow_summary from ch_workflow_summary.collectFile(name: "workflow_summary_mqc.yaml")

    output:
    file "*multiqc_report.html" into ch_multiqc_report
    file "*_data"
    file "multiqc_plots"
    val(projectName) into projectList

    script:
    // Optional command-line fragments; each one is an empty string when its option is not set.
    rtitle = custom_runName ? "--title \"$custom_runName\"" : ''
    // Report file name derived from the run name: non-word characters become '_' and runs of '_' are squeezed.
    rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : ''
    custom_config_file = params.multiqc_config ? "--config $mqc_custom_config" : ''
    mqcstats = params.skip_multiqc_stats ? '--cl_config "skip_generalstats: true"' : ''
    // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time
    // NOTE(review): `fqFiles` and `fqScreen` are interpolated into the command below but are not
    // declared as inputs here; the input files are staged under fastqc/ and fastq_screen/ and
    // MultiQC is already pointed at the current directory (`.`). Confirm and remove if undefined.
    """
    multiqc ${fqFiles} ${fqScreen} $rtitle $rfilename $custom_config_file $mqcstats .
    """
}

/*
 * STEP 13.2 - MultiQC for all projects
 */
// Drop the project key from each [projectName, files] tuple and gather every file into one list,
// so the run-wide MultiQC process receives all reports in a single task.
all_fcq_files = all_fcq_files_tuple.map { k,v -> v }.flatten().collect()
// `collect()` emits nothing when its source is empty, which would prevent multiqcAll from ever
// running when no FastQ Screen reports exist; fall back to an empty list, as the per-project
// multiqc process does for its optional inputs.
all_fq_screen_files = all_fq_screen_txt_tuple.map { k,v -> v }.flatten().collect().ifEmpty([])
// bcl2fastq statistics from the default demultiplexing run.
bcl_stats_empty = Channel.empty()
b2fq_default_stats_all_ch = bcl_stats_empty.mix(b2fq_default_stats_ch)

/*
 * Runs one MultiQC report across all projects of the run and publishes it under
 * <outdir>/<runName>/multiqc.
 *
 * Input : every FastQC result, every FastQ Screen result, the bcl2fastq stats (optional)
 *         and the MultiQC configuration file.
 * Output: the MultiQC HTML report (into multiqc_report_all), its *_data directory and plots.
 */
process multiqcAll {
    tag "${runName}"
    publishDir path: "${params.outdir}/${runName}/multiqc", mode: 'copy'

    // The whole process is skipped when --skip_multiqc is set.
    when:
    !params.skip_multiqc

    input:
    file fqFile from all_fcq_files
    file fqScreen from all_fq_screen_files
    // An empty list is the placeholder for "no file": nothing is staged and ${bcl_stats} expands to
    // nothing. An empty string would instead be treated as a value to stage and end up on the
    // MultiQC command line. This matches the optional inputs of the per-project multiqc process.
    file bcl_stats from b2fq_default_stats_all_ch.ifEmpty([])
    file multiqc_config from ch_multiqc_config

    output:
    file "*multiqc_report.html" into multiqc_report_all
    file "*_data"
    file "multiqc_plots"

    script:
    // Optionally suppress the General Statistics table.
    mqcstats = params.skip_multiqc_stats ? '--cl_config "skip_generalstats: true"' : ''
    """
    multiqc ${fqFile} ${fqScreen} ${bcl_stats} --config $multiqc_config $mqcstats .
    """
}

///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
/* --                                                                     -- */
/* --                     Reports/Documentation                           -- */
/* --                                                                     -- */
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////

// Render the run summary map as an HTML definition list wrapped in a MultiQC custom-content YAML
// document, and expose it as ch_workflow_summary.
Channel.from(summary.collect{ [it.key, it.value] })
    // One <dt>/<dd> pair per summary entry; empty values are shown as a grey "N/A".
    // The placeholder's <span> is closed with </span> (it was previously closed with </a>,
    // which produced malformed HTML in the report).
    .map { k,v -> "<dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }
    // Join all entries, indenting continuation lines so they stay inside the YAML block scalar.
    .reduce { a, b -> return [a, b].join("\n            ") }
    .map { x -> """
    id: 'nf-core-demultiplex-summary'
    description: " - this information is collected when the pipeline is started."
    section_name: 'nf-core/demultiplex Workflow Summary'
    section_href: 'https://github.com/nf-core/demultiplex'
    plot_type: 'html'
    data: |
        <dl class=\"dl-horizontal\">
            $x
        </dl>
    """.stripIndent() }
    .set { ch_workflow_summary }

/*
 * Parse software version numbers
 */
process get_software_versions {
    // Only outputs whose name contains ".csv" (past the first character) are published to
    // pipeline_info; returning null from saveAs keeps every other output out of the results.
    publishDir "${params.outdir}/pipeline_info", mode: 'copy',
        saveAs: { filename -> filename.indexOf(".csv") > 0 ? filename : null }

    output:
    file 'software_versions_mqc.yaml' into ch_software_versions_yaml
    file "software_versions.csv"

    script:
    // Each tool writes its version string to a v_*.txt file; scrape_software_versions.py is then
    // run and its combined stdout/stderr is captured as the MultiQC YAML.
    // TODO nf-core: Get all tools to print their version number here
    """
    echo $workflow.manifest.version > v_pipeline.txt
    echo $workflow.nextflow.version > v_nextflow.txt
    fastqc --version > v_fastqc.txt
    fastq_screen --version > v_fastqscreen.txt
    #kraken2
    multiqc --version > v_multiqc.txt
    echo \$(bcl2fastq --version 2>&1) > v_bcl2fastq.txt
    cellranger mkfastq --version > v_cellranger.txt
    #cellranger-atac --version > v_cellrangeratac.txt
    #cellranger-dna --version > v_cellrangerdna.txt
    scrape_software_versions.py &> software_versions_mqc.yaml
    """
}

/*
 * STEP 14 - Output Description HTML
 */
process output_documentation {
    // Converts the staged output documentation to HTML with markdown_to_html.py and publishes
    // the result to <outdir>/pipeline_info.
    publishDir path: "${params.outdir}/pipeline_info", mode: 'copy'

    input:
    file(output_docs) from ch_output_docs

    output:
    file("results_description.html")

    script:
    """
    markdown_to_html.py $output_docs -o results_description.html
    """
}

/*
 * Completion e-mail notification
 */
/*
 * Completion handler: assembles the run summary, renders the e-mail templates, sends the
 * notification (HTML via sendmail, falling back to plain text via mail), writes the rendered
 * reports to <outdir>/pipeline_info and logs the final pipeline status.
 */
workflow.onComplete {

    // Set up the e-mail variables
    def subject = "[nf-core/demultiplex] Successful: $workflow.runName"
    if (!workflow.success) {
        subject = "[nf-core/demultiplex] FAILED: $workflow.runName"
    }

    // Extra report links are only added for successful runs under the 'crick' profile
    def crick_success = workflow.success && workflow.profile == 'crick'
    def extra_links = [:]
    def all_multiqc
    if (crick_success) {
        def projectList_2 = []
        // NOTE(review): subscribe() delivers items asynchronously, so projectList_2 may still be
        // empty when it is read on the next statement — confirm the links appear as expected.
        projectList.subscribe { projectList_2.add("$it") }

        all_multiqc = projectList_2.collect{ project -> ["${project}", "https://sample-selector-bioinformatics.crick.ac.uk/sequencing/${runName}/multiqc/${project}/multiqc_report.html"] }
        extra_links.put("MultiQC Global", "https://sample-selector-bioinformatics.crick.ac.uk/sequencing/${runName}/multiqc/multiqc_report.html")
        extra_links.put("Demultiplexing Default", "https://sample-selector-bioinformatics.crick.ac.uk/sequencing/${runName}/fastq/Reports/html/index.html")
    }

    // Fields made available to the e-mail templates
    def email_fields = [:]
    if (crick_success) {
        email_fields['project_QC_links'] = all_multiqc
        email_fields['extra_links'] = extra_links
    }
    email_fields['version'] = workflow.manifest.version
    email_fields['runName'] = custom_runName ?: workflow.runName
    email_fields['success'] = workflow.success
    email_fields['dateComplete'] = workflow.complete
    email_fields['duration'] = workflow.duration
    email_fields['exitStatus'] = workflow.exitStatus
    email_fields['errorMessage'] = (workflow.errorMessage ?: 'None')
    email_fields['errorReport'] = (workflow.errorReport ?: 'None')
    email_fields['commandLine'] = workflow.commandLine
    email_fields['projectDir'] = workflow.projectDir
    // The shared `summary` map is extended in place with run-time details
    email_fields['summary'] = summary
    email_fields['summary']['Date Started'] = workflow.start
    email_fields['summary']['Date Completed'] = workflow.complete
    email_fields['summary']['Pipeline script file path'] = workflow.scriptFile
    email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId
    if (workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository
    if (workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId
    if (workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision
    email_fields['summary']['Nextflow Version'] = workflow.nextflow.version
    email_fields['summary']['Nextflow Build'] = workflow.nextflow.build
    email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp

    // TODO nf-core: If not using MultiQC, strip out this code (including params.max_multiqc_email_size)
    // On success try attach the multiqc report
    def mqc_report = null
    try {
        if (workflow.success) {
            mqc_report = ch_multiqc_report.getVal()
            if (mqc_report.getClass() == ArrayList) {
                log.warn "[nf-core/demultiplex] Found multiple reports from process 'multiqc', will use only one"
                mqc_report = mqc_report[0]
            }
        }
    } catch (all) {
        // Best effort only: a missing report must not prevent the notification from being sent
        log.warn "[nf-core/demultiplex] Could not attach MultiQC report to summary email"
    }

    // Check if we are only sending emails on failure
    def email_address = params.email
    if (!params.email && params.email_on_fail && !workflow.success) {
        email_address = params.email_on_fail
    }

    // Render the TXT template
    def engine = new groovy.text.GStringTemplateEngine()
    def tf = new File("$baseDir/assets/email_template.txt")
    def txt_template = engine.createTemplate(tf).make(email_fields)
    def email_txt = txt_template.toString()

    // Render the HTML template
    def hf = new File("$baseDir/assets/email_template.html")
    def html_template = engine.createTemplate(hf).make(email_fields)
    def email_html = html_template.toString()

    // Render the sendmail template
    def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.max_multiqc_email_size.toBytes() ]
    def sf = new File("$baseDir/assets/sendmail_template.txt")
    def sendmail_template = engine.createTemplate(sf).make(smail_fields)
    def sendmail_html = sendmail_template.toString()

    // Send the HTML e-mail
    if (email_address) {
        try {
            // Deliberately jump to the plain-text branch below. `new` is required to actually
            // construct the exception; without it Groovy looks for a method named GroovyException.
            if (params.plaintext_email) { throw new GroovyException('Send plaintext e-mail, not HTML') }
            // Try to send HTML e-mail using sendmail
            [ 'sendmail', '-t' ].execute() << sendmail_html
            log.info "[nf-core/demultiplex] Sent summary e-mail to $email_address (sendmail)"
        } catch (all) {
            // Catch failures and try with plaintext
            [ 'mail', '-s', subject, email_address ].execute() << email_txt
            log.info "[nf-core/demultiplex] Sent summary e-mail to $email_address (mail)"
        }
    }

    // Write summary e-mail HTML to a file
    def output_d = new File("${params.outdir}/pipeline_info/")
    if (!output_d.exists()) {
        output_d.mkdirs()
    }
    def output_hf = new File(output_d, "pipeline_report.html")
    output_hf.withWriter { w -> w << email_html }
    def output_tf = new File(output_d, "pipeline_report.txt")
    output_tf.withWriter { w -> w << email_txt }

    // ANSI colour codes for the final status lines (empty when --monochrome_logs is set)
    def c_green = params.monochrome_logs ? '' : "\033[0;32m"
    def c_purple = params.monochrome_logs ? '' : "\033[0;35m"
    def c_red = params.monochrome_logs ? '' : "\033[0;31m"
    def c_reset = params.monochrome_logs ? '' : "\033[0m"

    if (workflow.stats.ignoredCount > 0 && workflow.success) {
        log.info "-${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}-"
        log.info "-${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCount} ${c_reset}-"
        log.info "-${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCount} ${c_reset}-"
    }

    if (workflow.success) {
        log.info "-${c_purple}[nf-core/demultiplex]${c_green} Pipeline completed successfully${c_reset}-"
    } else {
        // On failure, also check whether the run used a profile that does not match the host
        checkHostname()
        log.info "-${c_purple}[nf-core/demultiplex]${c_red} Pipeline completed with errors${c_reset}-"
    }

}

/**
 * Builds the nf-core ASCII-art banner shown at start-up and in the help message.
 *
 * @return the banner text, with ANSI colour codes unless params.monochrome_logs is set
 */
def nfcoreHeader() {
    // Yields the given ANSI escape sequence, or '' when monochrome logs were requested
    def ansi = { String code -> params.monochrome_logs ? '' : code }

    // Log colors ANSI codes
    c_black = ansi("\033[0;30m")
    c_blue = ansi("\033[0;34m")
    c_cyan = ansi("\033[0;36m")
    c_dim = ansi("\033[2m")
    c_green = ansi("\033[0;32m")
    c_purple = ansi("\033[0;35m")
    c_reset = ansi("\033[0m")
    c_white = ansi("\033[0;37m")
    c_yellow = ansi("\033[0;33m")

    def banner = """    -${c_dim}--------------------------------------------------${c_reset}-
                                            ${c_green},--.${c_black}/${c_green},-.${c_reset}
    ${c_blue}        ___     __   __   __   ___     ${c_green}/,-._.--~\'${c_reset}
    ${c_blue}  |\\ | |__  __ /  ` /  \\ |__) |__         ${c_yellow}}  {${c_reset}
    ${c_blue}  | \\| |       \\__, \\__/ |  \\ |___     ${c_green}\\`-._,-`-,${c_reset}
                                            ${c_green}`._,._,\'${c_reset}
    ${c_purple}  nf-core/demultiplex v${workflow.manifest.version}${c_reset}
    -${c_dim}--------------------------------------------------${c_reset}-
    """
    return banner.stripIndent()
}

/**
 * Logs an error-level warning for every configured hostname fragment that matches this machine's
 * hostname while the active profile does not contain the profile name associated with it.
 * Does nothing when params.hostnames is not set.
 */
def checkHostname() {
    def mono = params.monochrome_logs
    def c_reset = mono ? '' : "\033[0m"
    def c_white = mono ? '' : "\033[0;37m"
    def c_red = mono ? '' : "\033[1;91m"
    def c_yellow_bold = mono ? '' : "\033[1;93m"

    if (!params.hostnames) {
        return
    }

    // Ask the operating system for the hostname of the machine running the pipeline
    def hostname = "hostname".execute().text.trim()

    // params.hostnames maps a profile name to the hostname fragments associated with it
    for (entry in params.hostnames) {
        def prof = entry.key
        for (hname in entry.value) {
            if (!hostname.contains(hname) || workflow.profile.contains(prof)) {
                continue
            }
            log.error([
                "====================================================",
                "  ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`",
                "  but your machine hostname is ${c_white}'$hostname'${c_reset}",
                "  ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`",
                "============================================================"
            ].join("\n"))
        }
    }
}