Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

storeDir to publishDir #80

Merged
merged 3 commits into from
Dec 11, 2015
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 45 additions & 55 deletions needlestack.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// You should have received a copy of the GNU General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

// run using for ex.:
// run using for ex.:
// needlestack.nf --bed my_bed_file.bed --nsplit 20 --fasta_ref reference.fasta --bam_folder BAM/

// requirement:
Expand All @@ -30,19 +30,19 @@

params.min_dp = 50 // minimum coverage in at least one sample to consider a site
params.min_ao = 5 // minimum number of non-ref reads in at least one sample to consider a site
params.nsplit = 1 // split the bed file in nsplit pieces and run in parallel
params.min_qval = 50 // qvalue in Phred scale to consider a variant
params.nsplit = 1 // split the bed file in nsplit pieces and run in parallel
params.min_qval = 50 // qvalue in Phred scale to consider a variant
// http://gatkforums.broadinstitute.org/discussion/5533/strandoddsratio-computation filter out SOR > 4 for SNVs and > 10 for indels
// filter out RVSB > 0.85 (maybe less stringent for SNVs)
params.sb_type = "SOR" // strand bias measure to be used: "SOR" or "RVSB"
params.sb_snv = 100 // strand bias threshold for snv
params.sb_snv = 100 // strand bias threshold for snv
params.sb_indel = 100 // strand bias threshold for indels
params.map_qual = 20 // min mapping quality (passed to samtools)
params.base_qual = 20 // min base quality (passed to samtools)
params.max_DP = 30000 // downsample coverage per sample (passed to samtools)
params.use_file_name = false // set this argument to use the bam file names as sample names instead of the sample name field from the bam files (SM tag)
params.all_SNVs = false // output all sites, even when no variant is detected
params.no_plots = false // do not produce pdf plots of regressions
params.no_plots = false // do not produce pdf plots of regressions
params.out_folder = params.bam_folder // if not provided, outputs will be held on the input bam folder
params.no_indels = false // do not skip indels

Expand Down Expand Up @@ -107,13 +107,13 @@ if (fasta_ref.exists()) {assert fasta_ref_fai.exists() : "input fasta reference
if (fasta_ref.exists() && params.fasta_ref.tokenize('.')[-1] == 'gz') {assert fasta_ref_gzi.exists() : "input gz fasta reference does not seem to have a .gzi index (use samtools faidx)"}
try { assert bed.exists() : "\n WARNING : bed file not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
try { assert file(params.bam_folder).exists() : "\n WARNING : input BAM folder not located in execution directory" } catch (AssertionError e) { println e.getMessage() }
assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM"
assert file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() > 0 : "BAM folder contains no BAM"
if (file(params.bam_folder).exists()) {
if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 10) {println "\n ERROR : BAM folder contains less than 10 BAM, exit."; System.exit(0)}
else if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 20) {println "\n WARNING : BAM folder contains less than 20 BAM, method accuracy not warranted."}
if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 10) {println "\n ERROR : BAM folder contains less than 10 BAM, exit."; System.exit(0)}
else if (file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.size() < 20) {println "\n WARNING : BAM folder contains less than 20 BAM, method accuracy not warranted."}
bamID = file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam/ }.collect { it.getName() }.collect { it.replace('.bam','') }
baiID = file(params.bam_folder).listFiles().findAll { it.name ==~ /.*bam.bai/ }.collect { it.getName() }.collect { it.replace('.bam.bai','') }
assert baiID.containsAll(bamID) : "check that every bam file has an index (.bam.bai)"
assert baiID.containsAll(bamID) : "check that every bam file has an index (.bam.bai)"
}
assert (params.min_dp > 0) : "minimum coverage must be higher than 0 (--min_dp)"
assert (params.max_DP > 1) : "maximum coverage before downsampling must be higher than 1 (--max_DP)"
Expand Down Expand Up @@ -153,27 +153,25 @@ log.info "Strand bias threshold for SNVs (--sb_snv) : ${pa
log.info "Strand bias threshold for indels (--sb_indel) : ${params.sb_indel}"
log.info "Samtools minimum mapping quality (--map_qual) : ${params.map_qual}"
log.info "Samtools minimum base quality (--base_qual) : ${params.base_qual}"
log.info "Samtools maximum coverage before downsampling (--max_DP) : ${params.max_DP}"
log.info "Samtools maximum coverage before downsampling (--max_DP) : ${params.max_DP}"
log.info "Sample names definition (--use_file_name) : ${sample_names}"
log.info(params.all_SNVs == true ? "Output all SNVs (--all_SNVs) : yes" : "Output all SNVs (--all_SNVs) : no" )
log.info(params.all_SNVs == true ? "Output all SNVs (--all_SNVs) : yes" : "Output all SNVs (--all_SNVs) : no" )
log.info(params.no_plots == true ? "PDF regression plots (--no_plots) : no" : "PDF regression plots (--no_plots) : yes" )
log.info(params.no_indels == true ? "Skip indels (--no_indels) : yes" : "Skip indels (--no_indels) : no" )
log.info(params.no_indels == true ? "Skip indels (--no_indels) : yes" : "Skip indels (--no_indels) : no" )
log.info "output folder (--out_folder) : ${params.out_folder}"
log.info "\n"

bam = Channel.fromPath( params.bam_folder+'/*.bam' ).toList()
bam = Channel.fromPath( params.bam_folder+'/*.bam' ).toList()
bai = Channel.fromPath( params.bam_folder+'/*.bam.bai' ).toList()

/* split bed file into nsplit regions */
process split_bed {

storeDir { params.out_folder+'/BED_REGIONS/' }


input:
file bed

output:
file '*_regions' into split_bed mode flatten
file '*_regions' into split_bed mode flatten

shell:
'''
Expand All @@ -184,21 +182,19 @@ process split_bed {
// create mpileup file + sed to have "*" when there is no coverage (otherwise pileup2baseindel.pl is unhappy)
process samtools_mpileup {

storeDir { params.out_folder+'/PILEUP/' }

tag { region_tag }

tag { region_tag }

input:
file split_bed
file bam
file bai
file bai
file fasta_ref
file fasta_ref_fai
file fasta_ref_gzi

output:
set val(region_tag), file("${region_tag}.pileup") into pileup

shell:
region_tag = split_bed.baseName
'''
Expand All @@ -208,23 +204,21 @@ process samtools_mpileup {
'''
}

// split mpileup file and convert to table
// split mpileup file and convert to table
process mpileup2table {

errorStrategy 'ignore'

storeDir { params.out_folder+'/PILEUP/'+region_tag }

tag { region_tag }


tag { region_tag }

input:
set val(region_tag), file("${region_tag}.pileup") from pileup
file bam
val sample_names

output:
set val(region_tag), file('sample*.txt'), file('names.txt') into table

shell:
if ( params.no_indels ) {
indel_par = "-no-indels"
Expand All @@ -233,13 +227,13 @@ process mpileup2table {
}
'''
nb_pos=$(wc -l < !{region_tag}.pileup)
if [ $nb_pos -gt 0 ]; then
if [ $nb_pos -gt 0 ]; then
# split and convert pileup file
pileup2baseindel.pl -i !{region_tag}.pileup !{indel_par}
# rename the output (the converter call files sample{i}.txt)
i=1
for cur_bam in !{bam}
do
for cur_bam in !{bam}
do
if [ "!{sample_names}" == "FILE" ]; then
# use bam file name as sample name
bam_file_name="${cur_bam%.*}"
Expand All @@ -250,60 +244,56 @@ process mpileup2table {
SM=$(samtools view -H $cur_bam | grep @RG | head -1 | sed "s/.*SM:\\([^\\t]*\\).*/\\1/" | tr -d '[:space:]')
fi
printf "sample$i\\t$SM\\n" >> names.txt
i=$((i+1))
i=$((i+1))
done
fi
'''
}

// perform regression in R
process R_regression {
storeDir { params.out_folder+'/VCF/' }
tag { region_tag }

publishDir params.out_folder+'/PDF/', mode: 'move', pattern: "*[ATCG-].pdf"

tag { region_tag }

input:
set val(region_tag), file(table_file), file('names.txt') from table
file fasta_ref
file fasta_ref_fai
file fasta_ref_gzi
file fasta_ref_gzi

output:
file "${region_tag}.vcf" into vcf
file '*.pdf' into PDF

shell:
'''
# create a dummy empty pdf to avoid an error in the process when no variant is found
# create a dummy empty pdf to avoid an error in the process when no variant is found
touch !{region_tag}_empty.pdf
needlestack.r --out_file=!{region_tag}.vcf --fasta_ref=!{fasta_ref} --GQ_threshold=!{params.min_qval} --min_coverage=!{params.min_dp} --min_reads=!{params.min_ao} --SB_type=!{params.sb_type} --SB_threshold_SNV=!{params.sb_snv} --SB_threshold_indel=!{params.sb_indel} --output_all_SNVs=!{params.all_SNVs} --do_plots=!{!params.no_plots}
'''
}
//PDF.flatten().filter { it.size() == 0 }.subscribe { it.delete() }

// merge all vcf files in one big file
// merge all vcf files in one big file
process collect_vcf_result {

storeDir { params.out_folder }
publishDir params.out_folder, mode: 'move'

input:
file '*.vcf' from vcf.toList()
file fasta_ref_fai

output:
file 'all_variants.vcf' into big_vcf

shell:
empty_files = file(params.out_folder+'/VCF/*_empty.pdf')
for( def file : empty_files ) {
file.delete()
}
'''
nb_vcf=$(find . -maxdepth 1 -name '*vcf' | wc -l)
if [ $nb_vcf -gt 1 ]; then
vcfoverlay *.vcf > all_variants.vcf
else
else
cp *.vcf all_variants.vcf
fi
# Add contigs in the VCF header
Expand Down