diff --git a/bin/bam_stats.sh b/bin/bam_stats.sh new file mode 100755 index 0000000..e21ea06 --- /dev/null +++ b/bin/bam_stats.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +FILE=${1} +#LIST=$2 + +#STAT=$(basename $(basename ${LIST}) .txt) +echo "Filename"$'\t'"Total"$'\t'"mapped_reads"$'\t'"unmapped_reads"$'\t'"proportion_mapped"$'\t'"proportion_unmapped" > ${FILE}.mappedstats.txt + + +mapped_reads=$(samtools stats ${FILE}.sorted.bam | grep ^SN | awk 'NR == 7 {print $4}') +unmapped_reads=$(samtools stats ${FILE}.sorted.bam | grep ^SN | awk 'NR == 9 {print $4}') +total=$(samtools stats ${FILE}.sorted.bam | grep ^SN | awk 'NR == 1 {print $5}') +proportion_mapped=$(awk "BEGIN {printf \"%.2f\n\", $mapped_reads*100/$total}") +proportion_unmapped=$(awk "BEGIN {printf \"%.2f\n\", $unmapped_reads*100/$total}") +printf "%s\t%s\t%s\t%s\t%s\t%s\n" $FILE $total $mapped_reads $unmapped_reads $proportion_mapped $proportion_unmapped >> ${FILE}.mappedstats.txt +# remove bam file +#rm -rf /MIGE/01_DATA/02_MAPPING/${FILE}*.{bam,bai} diff --git a/clean.nf b/clean.nf index 5daf743..a818254 100755 --- a/clean.nf +++ b/clean.nf @@ -181,10 +181,9 @@ lib_pairedness = params.input_type == 'illumina' ? 'paired' : 'single' include { prepare_contamination } from './workflows/prepare_contamination_wf' addParams( tool: tool ) include { check_own as prepare_keep } from './modules/prepare_contamination' - include { clean } from './workflows/clean_wf' addParams( tool: tool, lib_pairedness: lib_pairedness ) include { keep } from './workflows/keep_wf' addParams( tool: tool, lib_pairedness: lib_pairedness ) - +include { summarize } from './workflows/summarize_wf' include { qc } from './workflows/qc_wf' /************************** @@ -196,7 +195,12 @@ workflow { contamination = prepare_contamination.out clean(input_ch, contamination, nanoControlBedChannel, 'map-to-remove') - + + if (!params.bbduk) { + bam_sort = clean.out.sort_bam_ch + summarize(bam_sort) + } + if (params.keep){ prepare_keep(keepFastaChannel) keep_fasta = prepare_keep.out diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index f9d8cd5..f285b8a 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -18,29 +18,29 @@ process download_host { """ case $host in hsa) - wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz + wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz ;; mmu) - wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz + wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz ;; cli) - wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -O host-temp.fa.gz + wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -o host-temp.fa.gz ;; csa) - wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -O host-temp.fa.gz + wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -o host-temp.fa.gz ;; gga) - wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -O host-temp.fa.gz + wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -o host-temp.fa.gz ;; eco) - wget 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -O host-temp.fa.gz + wget 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -o host-temp.fa.gz ;; sc2) - wget 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -O host-temp.fa + wget 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -O host-temp.fa || curl 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -o host-temp.fa gzip host-temp.fa ;; t2t) - wget 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -O host-temp.fa.gz + wget 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -O host-temp.fa.gz || curl 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -o host-temp.fa.gz ;; *) echo "Unknown host ($host)." diff --git a/modules/summarize.nf b/modules/summarize.nf new file mode 100644 index 0000000..21793a3 --- /dev/null +++ b/modules/summarize.nf @@ -0,0 +1,83 @@ +process BAM_STATISTICS { + + label 'minimap2' + + publishDir ( + + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.mappedstats.txt", + enabled: !params.no_intermediate, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) + + input: + tuple val(name), val(type), path(bam) + + + output: + path("*.mappedstats.txt") + + + script: + """ + bam_stats.sh ${name} + + """ + + stub: + """ + touch ${bam.baseName}.mappedstats.txt + """ +} + + +process COMBINE_BAM_STATISTICS { + + label 'minimap2' + + publishDir ( + + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "combined_bam_stats.txt", + enabled: !params.no_intermediate, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) + + tag { 'combine bam statistics files'} + + + input: + path(bam_statistics_files) + + + output: + path("combined_bam_stats.txt") + + + script: + """ + BAM_STATISTICS_FILES=(${bam_statistics_files}) + + for index in \${!BAM_STATISTICS_FILES[@]}; do + BAM_STATISTICS_FILE=\${BAM_STATISTICS_FILES[\$index]} + + # add header line if first file + if [[ \$index -eq 0 ]]; then + echo "\$(head -1 \${BAM_STATISTICS_FILE})" >> combined_bam_stats.txt + fi + echo "\$(awk 'FNR==2 {print}' \${BAM_STATISTICS_FILE})" >> combined_bam_stats.txt + done + + """ + + stub: + """ + touch combined_bam_stats.txt + """ +} diff --git a/workflows/clean_wf.nf b/workflows/clean_wf.nf index d18eae9..904ce06 100644 --- a/workflows/clean_wf.nf +++ b/workflows/clean_wf.nf @@ -22,8 +22,9 @@ workflow clean { flagstats = Channel.empty() out_reads = bbduk.out.cleaned_reads.concat(bbduk.out.contaminated_reads) bams_bai = Channel.empty() - } else { - + sort_bam_ch = Channel.empty() + } + else { if ( params.bwa ) { bwa_index(contamination) bwa(input, bwa_index.out, contamination) | sort_bam | index_bam | ( idxstats_from_bam & flagstats_from_bam ) @@ -32,7 +33,6 @@ workflow clean { minimap2(input, contamination) | sort_bam | index_bam | ( idxstats_from_bam & flagstats_from_bam ) split_bam(minimap2.out.bam) } - contamination_bam = split_bam.out.mapped cleaned_bam = split_bam.out.unmapped if ( params.control && 'dcs' in params.control.split(',') && params.dcs_strict ) { @@ -57,6 +57,7 @@ workflow clean { idxstats = idxstats_from_bam.out flagstats = flagstats_from_bam.out out_reads = fastq_from_bam.out + sort_bam_ch = sort_bam.out } emit: @@ -65,4 +66,5 @@ workflow clean { flagstats out_reads bams_bai + sort_bam_ch } diff --git a/workflows/summarize_wf.nf b/workflows/summarize_wf.nf new file mode 100644 index 0000000..d050144 --- /dev/null +++ b/workflows/summarize_wf.nf @@ -0,0 +1,23 @@ +include { BAM_STATISTICS; COMBINE_BAM_STATISTICS } from '../modules/summarize' + +workflow summarize { + + take: + sorted_bam + + main: + BAM_STATISTICS(sorted_bam) + + collected_bam_statistics_ch = BAM_STATISTICS.out.collect( sort: {a, b -> a[0].getBaseName() <=> b[0].getBaseName()} ) + + + COMBINE_BAM_STATISTICS(collected_bam_statistics_ch) + + // define output + bamstats = BAM_STATISTICS.out + summarystats = COMBINE_BAM_STATISTICS.out + + emit: + bamstats + summarystats +}