From 16bbe34e0358f9e57a490d71f21e26ffcff5c541 Mon Sep 17 00:00:00 2001 From: ayoraind Date: Fri, 6 Sep 2024 13:32:03 +0200 Subject: [PATCH 1/7] had to change wget to curl due to the error "Unable to establish SSL connection" from the default pipeline --- modules/prepare_contamination.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index f9d8cd5..c11acbd 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -18,29 +18,29 @@ process download_host { """ case $host in hsa) - wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz + curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz ;; mmu) - wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz + curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz ;; cli) - wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -O host-temp.fa.gz + curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -o host-temp.fa.gz ;; csa) - wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -O host-temp.fa.gz + curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -o host-temp.fa.gz ;; gga) - wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -O host-temp.fa.gz + curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -o host-temp.fa.gz ;; eco) - wget 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -O host-temp.fa.gz + curl 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -o host-temp.fa.gz ;; sc2) - wget 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -O host-temp.fa + curl 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -o host-temp.fa gzip host-temp.fa ;; t2t) - wget 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -O host-temp.fa.gz + curl 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -o host-temp.fa.gz ;; *) echo "Unknown host ($host)." From 2586da5ec3994a24e6cc5ec706faee559484f919 Mon Sep 17 00:00:00 2001 From: ayoraind Date: Wed, 11 Sep 2024 11:07:30 +0200 Subject: [PATCH 2/7] wget or curl instead of just curl --- modules/prepare_contamination.nf | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/modules/prepare_contamination.nf b/modules/prepare_contamination.nf index c11acbd..f285b8a 100644 --- a/modules/prepare_contamination.nf +++ b/modules/prepare_contamination.nf @@ -18,29 +18,29 @@ process download_host { """ case $host in hsa) - curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz + wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz ;; mmu) - curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz + wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/mus_musculus/dna/Mus_musculus.GRCm38.dna.primary_assembly.fa.gz' -o host-temp.fa.gz ;; cli) - curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -o host-temp.fa.gz + wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/337/935/GCF_000337935.1_Cliv_1.0/GCF_000337935.1_Cliv_1.0_genomic.fna.gz' -o host-temp.fa.gz ;; csa) - curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -o host-temp.fa.gz + wget 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/409/795/GCF_000409795.2_Chlorocebus_sabeus_1.1/GCF_000409795.2_Chlorocebus_sabeus_1.1_genomic.fna.gz' -o host-temp.fa.gz ;; gga) - curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -o host-temp.fa.gz + wget 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensembl.org/pub/release-99/fasta/gallus_gallus/dna/Gallus_gallus.GRCg6a.dna.toplevel.fa.gz' -o host-temp.fa.gz ;; eco) - curl 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -o host-temp.fa.gz + wget 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -O host-temp.fa.gz || curl 'ftp://ftp.ensemblgenomes.org/pub/release-45/bacteria//fasta/bacteria_90_collection/escherichia_coli_k_12/dna/Escherichia_coli_k_12.ASM80076v1.dna.toplevel.fa.gz' -o host-temp.fa.gz ;; sc2) - curl 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -o host-temp.fa + wget 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -O host-temp.fa || curl 'https://www.ebi.ac.uk/ena/browser/api/fasta/MN908947.3?download=true' -o host-temp.fa gzip host-temp.fa ;; t2t) - curl 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -o host-temp.fa.gz + wget 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -O host-temp.fa.gz || curl 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/009/914/755/GCA_009914755.4_T2T-CHM13v2.0/GCA_009914755.4_T2T-CHM13v2.0_genomic.fna.gz' -o host-temp.fa.gz ;; *) echo "Unknown host ($host)." From 1b5c09dfe3367ef5dbd311aef6e97eee600f68eb Mon Sep 17 00:00:00 2001 From: ayoraind Date: Wed, 11 Sep 2024 14:21:17 +0200 Subject: [PATCH 3/7] implemented the 'summarize workflow' --- clean.nf | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/clean.nf b/clean.nf index 395b29e..5c0dab7 100755 --- a/clean.nf +++ b/clean.nf @@ -184,7 +184,7 @@ include { check_own as prepare_keep } from './modules/prepare_contamination' include { clean } from './workflows/clean_wf' addParams( tool: tool, lib_pairedness: lib_pairedness ) include { keep } from './workflows/keep_wf' addParams( tool: tool, lib_pairedness: lib_pairedness ) - +include { summarize } from './workflows/summarize_wf' include { qc } from './workflows/qc_wf' /************************** @@ -196,7 +196,15 @@ workflow { contamination = prepare_contamination.out clean(input_ch, contamination, nanoControlBedChannel, 'map-to-remove') + + bam_sort = clean.out.sort_bam_ch + + if (!params.bbduk) { + + summarize(bam_sort) + } + if (params.keep){ prepare_keep(keepFastaChannel) keep_fasta = prepare_keep.out From 59e84b2cf5d5943d183dac243b94593842cdcced Mon Sep 17 00:00:00 2001 From: ayoraind Date: Wed, 11 Sep 2024 14:22:12 +0200 Subject: [PATCH 4/7] created an output channel for the sort_bam process --- workflows/clean_wf.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflows/clean_wf.nf b/workflows/clean_wf.nf index b61ed5b..6d6c234 100644 --- a/workflows/clean_wf.nf +++ b/workflows/clean_wf.nf @@ -24,7 +24,7 @@ workflow clean { } else { minimap2(input, contamination) | sort_bam | index_bam | ( idxstats_from_bam & flagstats_from_bam ) - + split_bam(minimap2.out.bam) contamination_bam = split_bam.out.mapped cleaned_bam = split_bam.out.unmapped @@ -50,6 +50,7 @@ workflow clean { idxstats = idxstats_from_bam.out flagstats = flagstats_from_bam.out out_reads = fastq_from_bam.out + sort_bam_ch = sort_bam.out } emit: @@ -58,4 +59,5 @@ workflow clean { flagstats out_reads bams_bai + sort_bam_ch } From d652d0073704a6e2801d2e81cfa149f7a506251d Mon Sep 17 00:00:00 2001 From: ayoraind Date: Wed, 11 Sep 2024 14:23:24 +0200 Subject: [PATCH 5/7] added a bash script to generate a summary file from sort_bam output --- bin/bam_stats.sh | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100755 bin/bam_stats.sh diff --git a/bin/bam_stats.sh b/bin/bam_stats.sh new file mode 100755 index 0000000..e21ea06 --- /dev/null +++ b/bin/bam_stats.sh @@ -0,0 +1,17 @@ +#!/bin/bash + +FILE=${1} +#LIST=$2 + +#STAT=$(basename $(basename ${LIST}) .txt) +echo "Filename"$'\t'"Total"$'\t'"mapped_reads"$'\t'"unmapped_reads"$'\t'"proportion_mapped"$'\t'"proportion_unmapped" > ${FILE}.mappedstats.txt + + +mapped_reads=$(samtools stats ${FILE}.sorted.bam | grep ^SN | awk 'NR == 7 {print $4}') +unmapped_reads=$(samtools stats ${FILE}.sorted.bam | grep ^SN | awk 'NR == 9 {print $4}') +total=$(samtools stats ${FILE}.sorted.bam | grep ^SN | awk 'NR == 1 {print $5}') +proportion_mapped=$(awk "BEGIN {printf \"%.2f\n\", $mapped_reads*100/$total}") +proportion_unmapped=$(awk "BEGIN {printf \"%.2f\n\", $unmapped_reads*100/$total}") +printf "%s\t%s\t%s\t%s\t%s\t%s\n" $FILE $total $mapped_reads $unmapped_reads $proportion_mapped $proportion_unmapped >> ${FILE}.mappedstats.txt +# remove bam file +#rm -rf /MIGE/01_DATA/02_MAPPING/${FILE}*.{bam,bai} From c5d53a2f37bd91942fe9b907b0fd70bf0599fca3 Mon Sep 17 00:00:00 2001 From: ayoraind Date: Wed, 11 Sep 2024 14:24:49 +0200 Subject: [PATCH 6/7] created new module and added two processes for creating summary file --- modules/summarize.nf | 83 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 modules/summarize.nf diff --git a/modules/summarize.nf b/modules/summarize.nf new file mode 100644 index 0000000..21793a3 --- /dev/null +++ b/modules/summarize.nf @@ -0,0 +1,83 @@ +process BAM_STATISTICS { + + label 'minimap2' + + publishDir ( + + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "*.mappedstats.txt", + enabled: !params.no_intermediate, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) + + input: + tuple val(name), val(type), path(bam) + + + output: + path("*.mappedstats.txt") + + + script: + """ + bam_stats.sh ${name} + + """ + + stub: + """ + touch ${bam.baseName}.mappedstats.txt + """ +} + + +process COMBINE_BAM_STATISTICS { + + label 'minimap2' + + publishDir ( + + path: "${params.output}/intermediate", + mode: params.publish_dir_mode, + pattern: "combined_bam_stats.txt", + enabled: !params.no_intermediate, + saveAs: { fn -> + fn.startsWith("keep_") ? "map-to-keep/${fn.replaceAll(~'^keep_', '')}" : "map-to-remove/${fn}" + } + ) + + tag { 'combine bam statistics files'} + + + input: + path(bam_statistics_files) + + + output: + path("combined_bam_stats.txt") + + + script: + """ + BAM_STATISTICS_FILES=(${bam_statistics_files}) + + for index in \${!BAM_STATISTICS_FILES[@]}; do + BAM_STATISTICS_FILE=\${BAM_STATISTICS_FILES[\$index]} + + # add header line if first file + if [[ \$index -eq 0 ]]; then + echo "\$(head -1 \${BAM_STATISTICS_FILE})" >> combined_bam_stats.txt + fi + echo "\$(awk 'FNR==2 {print}' \${BAM_STATISTICS_FILE})" >> combined_bam_stats.txt + done + + """ + + stub: + """ + touch combined_bam_stats.txt + """ +} From 54a4450463ad89e2a5e44aacd261153ac0af0134 Mon Sep 17 00:00:00 2001 From: ayoraind Date: Wed, 11 Sep 2024 14:25:41 +0200 Subject: [PATCH 7/7] created a new workflow to implement the summary --- workflows/summarize_wf.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 workflows/summarize_wf.nf diff --git a/workflows/summarize_wf.nf b/workflows/summarize_wf.nf new file mode 100644 index 0000000..d050144 --- /dev/null +++ b/workflows/summarize_wf.nf @@ -0,0 +1,23 @@ +include { BAM_STATISTICS; COMBINE_BAM_STATISTICS } from '../modules/summarize' + +workflow summarize { + + take: + sorted_bam + + main: + BAM_STATISTICS(sorted_bam) + + collected_bam_statistics_ch = BAM_STATISTICS.out.collect( sort: {a, b -> a[0].getBaseName() <=> b[0].getBaseName()} ) + + + COMBINE_BAM_STATISTICS(collected_bam_statistics_ch) + + // define output + bamstats = BAM_STATISTICS.out + summarystats = COMBINE_BAM_STATISTICS.out + + emit: + bamstats + summarystats +}