diff --git a/needlestack.nf b/needlestack.nf index fe74797..24aaf39 100644 --- a/needlestack.nf +++ b/needlestack.nf @@ -16,7 +16,7 @@ // You should have received a copy of the GNU General Public License // along with this program. If not, see . -// run using for ex.: +// run using for ex.: // needlestack.nf --bed my_bed_file.bed --nsplit 20 --fasta_ref reference.fasta --bam_folder BAM/ // requirement: @@ -30,19 +30,19 @@ params.min_dp = 50 // minimum coverage in at least one sample to consider a site params.min_ao = 5 // minimum number of non-ref reads in at least one sample to consider a site -params.nsplit = 1 // split the bed file in nsplit pieces and run in parallel -params.min_qval = 50 // qvalue in Phred scale to consider a variant +params.nsplit = 1 // split the bed file in nsplit pieces and run in parallel +params.min_qval = 50 // qvalue in Phred scale to consider a variant // http://gatkforums.broadinstitute.org/discussion/5533/strandoddsratio-computation filter out SOR > 4 for SNVs and > 10 for indels // filter out RVSB > 0.85 (maybe less stringent for SNVs) params.sb_type = "SOR" // strand bias measure to be used: "SOR" or "RVSB" -params.sb_snv = 100 // strand bias threshold for snv +params.sb_snv = 100 // strand bias threshold for snv params.sb_indel = 100 // strand bias threshold for indels params.map_qual = 20 // min mapping quality (passed to samtools) params.base_qual = 20 // min base quality (passed to samtools) params.max_DP = 30000 // downsample coverage per sample (passed to samtools) params.use_file_name = false //put these argument to use the bam file names as sample names and do not to use the sample name filed from the bam files (SM tag) params.all_SNVs = false // output all sites, even when no variant is detected -params.no_plots = false // do not produce pdf plots of regressions +params.no_plots = false // do not produce pdf plots of regressions params.out_folder = params.bam_folder // if not provided, outputs will be held on the input 
bam folder params.no_indels = false // do not skip indels @@ -107,13 +107,13 @@ if (fasta_ref.exists()) {assert fasta_ref_fai.exists() : "input fasta reference if (fasta_ref.exists() && params.fasta_ref.tokenize('.')[-1] == 'gz') {assert fasta_ref_gzi.exists() : "input gz fasta reference does not seem to have a .gzi index (use samtools faidx)"} try { assert bed.exists() : "\n WARNING : bed file not located in execution directory" } catch (AssertionError e) { println e.getMessage() } try { assert file(params.bam_folder).exists() : "\n WARNING : input BAM folder not located in execution directory" } catch (AssertionError e) { println e.getMessage() } -assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM" +assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM" if (file(params.bam_folder).exists()) { - if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 10) {println "\n ERROR : BAM folder contains less than 10 BAM, exit."; System.exit(0)} - else if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 20) {println "\n WARNING : BAM folder contains less than 20 BAM, method accuracy not warranted."} + if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 10) {println "\n ERROR : BAM folder contains less than 10 BAM, exit."; System.exit(0)} + else if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 20) {println "\n WARNING : BAM folder contains less than 20 BAM, method accuracy not warranted."} bamID = file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.collect { it.getName() }.collect { it.replace('.bam','') } baiID = file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam.bai/ }.collect { it.getName() }.collect { it.replace('.bam.bai','') } - assert baiID.containsAll(bamID) : "check that every bam file has an index 
(.bam.bai)" + assert baiID.containsAll(bamID) : "check that every bam file has an index (.bam.bai)" } assert (params.min_dp > 0) : "minimum coverage must be higher than 0 (--min_dp)" assert (params.max_DP > 1) : "maximum coverage before downsampling must be higher than 1 (--max_DP)" @@ -153,27 +153,25 @@ log.info "Strand bias threshold for SNVs (--sb_snv) : ${pa log.info "Strand bias threshold for indels (--sb_indel) : ${params.sb_indel}" log.info "Samtools minimum mapping quality (--map_qual) : ${params.map_qual}" log.info "Samtools minimum base quality (--base_qual) : ${params.base_qual}" -log.info "Samtools maximum coverage before downsampling (--max_DP) : ${params.max_DP}" +log.info "Samtools maximum coverage before downsampling (--max_DP) : ${params.max_DP}" log.info "Sample names definition (--use_file_name) : ${sample_names}" -log.info(params.all_SNVs == true ? "Output all SNVs (--all_SNVs) : yes" : "Output all SNVs (--all_SNVs) : no" ) +log.info(params.all_SNVs == true ? "Output all SNVs (--all_SNVs) : yes" : "Output all SNVs (--all_SNVs) : no" ) log.info(params.no_plots == true ? "PDF regression plots (--no_plots) : no" : "PDF regression plots (--no_plots) : yes" ) -log.info(params.no_indels == true ? "Skip indels (--no_indels) : yes" : "Skip indels (--no_indels) : no" ) +log.info(params.no_indels == true ? 
"Skip indels (--no_indels) : yes" : "Skip indels (--no_indels) : no" ) log.info "output folder (--out_folder) : ${params.out_folder}" log.info "\n" -bam = Channel.fromPath( params.bam_folder+'/*.bam' ).toList() +bam = Channel.fromPath( params.bam_folder+'/*.bam' ).toList() bai = Channel.fromPath( params.bam_folder+'/*.bam.bai' ).toList() /* split bed file into nsplit regions */ process split_bed { - - storeDir { params.out_folder+'/BED_REGIONS/' } - + intput: file bed - + output: - file '*_regions' into split_bed mode flatten + file '*_regions' into split_bed mode flatten shell: ''' @@ -184,21 +182,19 @@ process split_bed { // create mpileup file + sed to have "*" when there is no coverage (otherwise pileup2baseindel.pl is unhappy) process samtools_mpileup { - storeDir { params.out_folder+'/PILEUP/' } - - tag { region_tag } - + tag { region_tag } + input: file split_bed file bam - file bai + file bai file fasta_ref file fasta_ref_fai file fasta_ref_gzi - + output: set val(region_tag), file("${region_tag}.pileup") into pileup - + shell: region_tag = split_bed.baseName ''' @@ -208,23 +204,21 @@ process samtools_mpileup { ''' } -// split mpileup file and convert to table +// split mpileup file and convert to table process mpileup2table { - + errorStrategy 'ignore' - - storeDir { params.out_folder+'/PILEUP/'+region_tag } - - tag { region_tag } - + + tag { region_tag } + input: set val(region_tag), file("${region_tag}.pileup") from pileup file bam val sample_names - + output: set val(region_tag), file('sample*.txt'), file('names.txt') into table - + shell: if ( params.no_indels ) { indel_par = "-no-indels" @@ -233,13 +227,13 @@ process mpileup2table { } ''' nb_pos=$(wc -l < !{region_tag}.pileup) - if [ $nb_pos -gt 0 ]; then + if [ $nb_pos -gt 0 ]; then # split and convert pileup file pileup2baseindel.pl -i !{region_tag}.pileup !{indel_par} # rename the output (the converter call files sample{i}.txt) i=1 - for cur_bam in !{bam} - do + for cur_bam in !{bam} + do if [ 
"!{sample_names}" == "FILE" ]; then # use bam file name as sample name bam_file_name="${cur_bam%.*}" @@ -250,7 +244,7 @@ process mpileup2table { SM=$(samtools view -H $cur_bam | grep @RG | head -1 | sed "s/.*SM:\\([^\\t]*\\).*/\\1/" | tr -d '[:space:]') fi printf "sample$i\\t$SM\\n" >> names.txt - i=$((i+1)) + i=$((i+1)) done fi ''' @@ -258,52 +252,48 @@ process mpileup2table { // perform regression in R process R_regression { - - storeDir { params.out_folder+'/VCF/' } - - tag { region_tag } - + + publishDir params.out_folder+'/PDF/', mode: 'move', pattern: "*[ATCG-].pdf" + + tag { region_tag } + input: set val(region_tag), file(table_file), file('names.txt') from table file fasta_ref file fasta_ref_fai - file fasta_ref_gzi - + file fasta_ref_gzi + output: file "${region_tag}.vcf" into vcf file '*.pdf' into PDF - + shell: ''' - # create a dummy empty pdf to avoid an error in the process when no variant is found + # create a dummy empty pdf to avoid an error in the process when no variant is found touch !{region_tag}_empty.pdf needlestack.r --out_file=!{region_tag}.vcf --fasta_ref=!{fasta_ref} --GQ_threshold=!{params.min_qval} --min_coverage=!{params.min_dp} --min_reads=!{params.min_ao} --SB_type=!{params.sb_type} --SB_threshold_SNV=!{params.sb_snv} --SB_threshold_indel=!{params.sb_indel} --output_all_SNVs=!{params.all_SNVs} --do_plots=!{!params.no_plots} ''' } //PDF.flatten().filter { it.size() == 0 }.subscribe { it.delete() } -// merge all vcf files in one big file +// merge all vcf files in one big file process collect_vcf_result { - storeDir { params.out_folder } + publishDir params.out_folder, mode: 'move' input: file '*.vcf' from vcf.toList() file fasta_ref_fai - + output: file 'all_variants.vcf' into big_vcf shell: - empty_files = file(params.out_folder+'/VCF/*_empty.pdf') - for( def file : empty_files ) { - file.delete() - } ''' nb_vcf=$(find . 
-maxdepth 1 -name '*vcf' | wc -l) if [ $nb_vcf -gt 1 ]; then vcfoverlay *.vcf > all_variants.vcf - else + else cp *.vcf all_variants.vcf fi # Add contigs in the VCF header