Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storeDir to publishDir #80

Merged
merged 3 commits into from
Dec 11, 2015
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 45 additions & 55 deletions needlestack.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

// run using for ex.:
// run using for ex.:
// needlestack.nf --bed my_bed_file.bed --nsplit 20 --fasta_ref reference.fasta --bam_folder BAM/

// requirement:
Expand All @@ -30,19 +30,19 @@

params.min_dp = 50 // minimum coverage in at least one sample to consider a site
params.min_ao = 5 // minimum number of non-ref reads in at least one sample to consider a site
params.nsplit = 1 // split the bed file in nsplit pieces and run in parallel
params.min_qval = 50 // qvalue in Phred scale to consider a variant
params.nsplit = 1 // split the bed file in nsplit pieces and run in parallel
params.min_qval = 50 // qvalue in Phred scale to consider a variant
// http://gatkforums.broadinstitute.org/discussion/5533/strandoddsratio-computation filter out SOR > 4 for SNVs and > 10 for indels
// filter out RVSB > 0.85 (maybe less stringent for SNVs)
params.sb_type = "SOR" // strand bias measure to be used: "SOR" or "RVSB"
params.sb_snv = 100 // strand bias threshold for snv
params.sb_snv = 100 // strand bias threshold for snv
params.sb_indel = 100 // strand bias threshold for indels
params.map_qual = 20 // min mapping quality (passed to samtools)
params.base_qual = 20 // min base quality (passed to samtools)
params.max_DP = 30000 // downsample coverage per sample (passed to samtools)
params.use_file_name = false // set this argument to use the bam file names as sample names instead of the sample name field from the bam files (SM tag)
params.all_SNVs = false // output all sites, even when no variant is detected
params.no_plots = false // do not produce pdf plots of regressions
params.no_plots = false // do not produce pdf plots of regressions
params.out_folder = params.bam_folder // if not provided, outputs will be held on the input bam folder
params.no_indels = false // do not skip indels

Expand Down Expand Up @@ -107,13 +107,13 @@ if (fasta_ref.exists()) {assert fasta_ref_fai.exists() : "input fasta reference
if (fasta_ref.exists() && params.fasta_ref.tokenize('.')[-1] == 'gz') {assert fasta_ref_gzi.exists() : "input gz fasta reference does not seem to have a .gzi index (use samtools faidx)"}
try { assert bed.exists() : "\n WARNING : bed file not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
try { assert file(params.bam_folder).exists() : "\n WARNING : input BAM folder not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM"
assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM"
if (file(params.bam_folder).exists()) {
if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 10) {println "\n ERROR : BAM folder contains less than 10 BAM, exit."; System.exit(0)}
else if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 20) {println "\n WARNING : BAM folder contains less than 20 BAM, method accuracy not warranted."}
if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 10) {println "\n ERROR : BAM folder contains less than 10 BAM, exit."; System.exit(0)}
else if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 20) {println "\n WARNING : BAM folder contains less than 20 BAM, method accuracy not warranted."}
bamID = file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.collect { it.getName() }.collect { it.replace('.bam','') }
baiID = file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam.bai/ }.collect { it.getName() }.collect { it.replace('.bam.bai','') }
assert baiID.containsAll(bamID) : "check that every bam file has an index (.bam.bai)"
assert baiID.containsAll(bamID) : "check that every bam file has an index (.bam.bai)"
}
assert (params.min_dp > 0) : "minimum coverage must be higher than 0 (--min_dp)"
assert (params.max_DP > 1) : "maximum coverage before downsampling must be higher than 1 (--max_DP)"
Expand Down Expand Up @@ -153,27 +153,25 @@ log.info "Strand bias threshold for SNVs (--sb_snv) : ${pa
log.info "Strand bias threshold for indels (--sb_indel) : ${params.sb_indel}"
log.info "Samtools minimum mapping quality (--map_qual) : ${params.map_qual}"
log.info "Samtools minimum base quality (--base_qual) : ${params.base_qual}"
log.info "Samtools maximum coverage before downsampling (--max_DP) : ${params.max_DP}"
log.info "Samtools maximum coverage before downsampling (--max_DP) : ${params.max_DP}"
log.info "Sample names definition (--use_file_name) : ${sample_names}"
log.info(params.all_SNVs == true ? "Output all SNVs (--all_SNVs) : yes" : "Output all SNVs (--all_SNVs) : no" )
log.info(params.all_SNVs == true ? "Output all SNVs (--all_SNVs) : yes" : "Output all SNVs (--all_SNVs) : no" )
log.info(params.no_plots == true ? "PDF regression plots (--no_plots) : no" : "PDF regression plots (--no_plots) : yes" )
log.info(params.no_indels == true ? "Skip indels (--no_indels) : yes" : "Skip indels (--no_indels) : no" )
log.info(params.no_indels == true ? "Skip indels (--no_indels) : yes" : "Skip indels (--no_indels) : no" )
log.info "output folder (--out_folder) : ${params.out_folder}"
log.info "\n"

bam = Channel.fromPath( params.bam_folder+'/*.bam' ).toList()
bam = Channel.fromPath( params.bam_folder+'/*.bam' ).toList()
bai = Channel.fromPath( params.bam_folder+'/*.bam.bai' ).toList()

/* split bed file into nsplit regions */
process split_bed {

storeDir { params.out_folder+'/BED_REGIONS/' }


input:
file bed

output:
file '*_regions' into split_bed mode flatten
file '*_regions' into split_bed mode flatten

shell:
'''
Expand All @@ -184,21 +182,19 @@ process split_bed {
// create mpileup file + sed to have "*" when there is no coverage (otherwise pileup2baseindel.pl is unhappy)
process samtools_mpileup {

storeDir { params.out_folder+'/PILEUP/' }

tag { region_tag }

tag { region_tag }

input:
file split_bed
file bam
file bai
file bai
file fasta_ref
file fasta_ref_fai
file fasta_ref_gzi

output:
set val(region_tag), file("${region_tag}.pileup") into pileup

shell:
region_tag = split_bed.baseName
'''
Expand All @@ -208,23 +204,21 @@ process samtools_mpileup {
'''
}

// split mpileup file and convert to table
// split mpileup file and convert to table
process mpileup2table {

errorStrategy 'ignore'

storeDir { params.out_folder+'/PILEUP/'+region_tag }

tag { region_tag }


tag { region_tag }

input:
set val(region_tag), file("${region_tag}.pileup") from pileup
file bam
val sample_names

output:
set val(region_tag), file('sample*.txt'), file('names.txt') into table

shell:
if ( params.no_indels ) {
indel_par = "-no-indels"
Expand All @@ -233,13 +227,13 @@ process mpileup2table {
}
'''
nb_pos=$(wc -l < !{region_tag}.pileup)
if [ $nb_pos -gt 0 ]; then
if [ $nb_pos -gt 0 ]; then
# split and convert pileup file
pileup2baseindel.pl -i !{region_tag}.pileup !{indel_par}
# rename the output (the converter call files sample{i}.txt)
i=1
for cur_bam in !{bam}
do
for cur_bam in !{bam}
do
if [ "!{sample_names}" == "FILE" ]; then
# use bam file name as sample name
bam_file_name="${cur_bam%.*}"
Expand All @@ -250,60 +244,56 @@ process mpileup2table {
SM=$(samtools view -H $cur_bam | grep @RG | head -1 | sed "s/.*SM:\\([^\\t]*\\).*/\\1/" | tr -d '[:space:]')
fi
printf "sample$i\\t$SM\\n" >> names.txt
i=$((i+1))
i=$((i+1))
done
fi
'''
}

// perform regression in R
process R_regression {
storeDir { params.out_folder+'/VCF/' }
tag { region_tag }

publishDir params.out_folder+'/PDF/', mode: 'move', pattern: "*[ATCG-].pdf"

tag { region_tag }

input:
set val(region_tag), file(table_file), file('names.txt') from table
file fasta_ref
file fasta_ref_fai
file fasta_ref_gzi
file fasta_ref_gzi

output:
file "${region_tag}.vcf" into vcf
file '*.pdf' into PDF

shell:
'''
# create a dummy empty pdf to avoid an error in the process when no variant is found
# create a dummy empty pdf to avoid an error in the process when no variant is found
touch !{region_tag}_empty.pdf
needlestack.r --out_file=!{region_tag}.vcf --fasta_ref=!{fasta_ref} --GQ_threshold=!{params.min_qval} --min_coverage=!{params.min_dp} --min_reads=!{params.min_ao} --SB_type=!{params.sb_type} --SB_threshold_SNV=!{params.sb_snv} --SB_threshold_indel=!{params.sb_indel} --output_all_SNVs=!{params.all_SNVs} --do_plots=!{!params.no_plots}
'''
}
//PDF.flatten().filter { it.size() == 0 }.subscribe { it.delete() }

// merge all vcf files in one big file
// merge all vcf files in one big file
process collect_vcf_result {

storeDir { params.out_folder }
publishDir params.out_folder, mode: 'move'

input:
file '*.vcf' from vcf.toList()
file fasta_ref_fai

output:
file 'all_variants.vcf' into big_vcf

shell:
empty_files = file(params.out_folder+'/VCF/*_empty.pdf')
for( def file : empty_files ) {
file.delete()
}
'''
nb_vcf=$(find . -maxdepth 1 -name '*vcf' | wc -l)
if [ $nb_vcf -gt 1 ]; then
vcfoverlay *.vcf > all_variants.vcf
else
else
cp *.vcf all_variants.vcf
fi
# Add contigs in the VCF header
Expand Down