diff --git a/README.md b/README.md index f4441373..8c5e97a4 100644 --- a/README.md +++ b/README.md @@ -283,58 +283,58 @@ _Click to show details_ Public Tools Used by the Workflow All tools are built with public easybuild configuration files, available [here](https://github.com/easybuilders/easybuild).
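For reference, each pinned version can typically be rebuilt from a matching easyconfig and exposed as an environment module, roughly like this (the easyconfig and module names are illustrative — use the one matching the pinned version and toolchain):

```bash
# Build the tool plus any missing dependencies, then load it as a module
eb --robot BCFtools-1.10.2-GCC-8.3.0.eb
module load BCFtools/1.10.2-GCC-8.3.0
```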
-*Last Updated Jan 29th, 2021* +*Last Updated July 29th, 2021* | Tool | Version Implemented | Current Version | Dependency and Notes | EasyBuild | | :---: | :---: | :---: | :--- | :---: | -| [bcftools](https://github.com/samtools/bcftools/releases) | 1.10.2 | **1.11** | | Yes | +| [bcftools](https://github.com/samtools/bcftools/releases) | 1.10.2 | **1.13** | | Yes | | [bedtools](https://github.com/arq5x/bedtools2/releases) | 2.29.0 | **2.30.0** | delly-filter, addmatchRNA, vardict, vcfmerger2 | Yes | -| [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) | 2.3.5.1 | **2.4.2** | star-fusion | Yes | +| [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) | 2.3.5.1 | **2.4.4** | star-fusion | Yes | | [bwa](https://github.com/lh3/bwa/releases) | 0.7.17 | 0.7.17 | | Yes | -| [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count) | 3.1.0 | **5.0** | | Yes | -| [cuda-toolkit](https://developer.nvidia.com/cuda-downloads) | 10.1.243 | **11.2** | | No | -| [deepvariant](https://github.com/google/deepvariant/releases) | 0.10.0 | **1.1.0** | singularity container | Yes | +| [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count) | 6.0 | 6.0 | | Yes | +| [cuda-toolkit](https://developer.nvidia.com/cuda-downloads) | 10.1.243 | **11.4** | | No | +| [deepvariant](https://github.com/google/deepvariant/releases) | 0.10.0 | **1.2.0** | singularity container | Yes | | [delly](https://github.com/dellytools/delly/releases) | 0.7.6 | **0.8.7** | staying with 0.7.6 for compatibility reasons | Yes | | [dle](https://github.com/tgen/Discordant_Loci_Extractor/releases) | 0.1.5 | 0.1.5 | private repo | No | -| [freebayes](https://github.com/ekg/freebayes/releases) | 1.3.1 | **1.3.4** | 1.3.2 ensures python3 compatibility | Yes | -| [gatk](https://github.com/broadinstitute/gatk//releases) | 4.1.8.0 | **4.1.9.0** | | Yes | -| [gmap-gsnp](http://research-pub.gene.com/gmap/) | 2019-09-12 | **2020-12-17** | star-fusion | Yes | -| [gridss](https://github.com/PapenfussLab/gridss/releases) | 2.6.3 | **2.10.2** | | | +| [freebayes](https://github.com/ekg/freebayes/releases) | 1.3.1 | **1.3.4** | | Yes | +| [gatk](https://github.com/broadinstitute/gatk//releases) | 4.1.8.0 | **4.2.0.0** | | Yes | +| [gmap-gsnp](http://research-pub.gene.com/gmap/) | 2019-09-12 | **2021-07-23** | star-fusion | Yes | +| [gridss](https://github.com/PapenfussLab/gridss/releases) | 2.6.3 | **2.12.0** | | | | [hmmcopyutils](https://github.com/shahcompbio/hmmcopy_utils) | 1.0 | 1.0 | no official release | Yes | | [htseq](https://github.com/htseq/htseq/releases) | 0.12.3 | **0.13.5** | | | -| [htslib](https://github.com/samtools/htslib/releases) | 1.10.2 | **1.11** | star-fusion(bgzip) | Yes | +| [htslib](https://github.com/samtools/htslib/releases) | 1.10.2 | **1.13** | star-fusion(bgzip) | Yes | | [ichor](https://github.com/broadinstitute/ichorCNA/releases) | 0.2.0 | 0.2.0 | package in R/3.6.1-phoenix module | Yes? 
| | [jellyfish](https://github.com/gmarcais/Jellyfish/releases) | 2.3.0 | 2.3.0 | star-fusion | Yes | | [lancet](https://github.com/nygenome/lancet/releases) | 1.1.0 | 1.1.0 | | Yes | | [lumosVar2](https://github.com/tgen/lumosVar2/releases) | 1.1 | 1.1 | | Yes | | [manta](https://github.com/Illumina/manta/releases) | 1.6.0 | 1.6.0 | | Yes | | [manta_tgen](https://github.com/tgen/manta/releases) | 1.6.0 | 1.6.0 | modified for internal use | No | -| [multiQC](https://github.com/ewels/MultiQC/releases) | 1.9 | 1.9 | python3 pip | Yes | -| [octopus](https://github.com/luntergroup/octopus/releases) | 0.6.3-beta | **0.7.0** | | Yes | -| [openmpi](https://github.com/open-mpi/ompi/releases) | 3.1.3 | **4.1.0** | delly | No | +| [multiQC](https://github.com/ewels/MultiQC/releases) | 1.9 | **1.11**| python3 pip | Yes | +| [octopus](https://github.com/luntergroup/octopus/releases) | 0.6.3-beta | **0.7.4** | | Yes | +| [openmpi](https://github.com/open-mpi/ompi/releases) | 3.1.3 | **4.1.1** | delly | No | | [pairoscope](https://github.com/genome/pairoscope/releases) | 0.4.2 | 0.4.2 | | Yes | -| [perl](https://github.com/Perl/perl5/releases) | 5.28.1 | **5.33.6** | star-fusion | Yes | +| [perl](https://github.com/Perl/perl5/releases) | 5.28.1 | **5.35.2** | star-fusion | Yes | | [phaser](https://github.com/secastel/phaser/tree/master/phaser) | 1.1.1 | 1.1.1 | vcfmerger2 | Yes | | [python2](https://www.python.org/downloads/) | 2.7.15 | **2.7.18** | | Yes | -| [python3](https://www.python.org/downloads/) | 3.7.2 | **3.9.1** | star-fusion, vcfmerger2 | Yes | -| [R](https://www.r-project.org/) | 3.6.1 | **4.0.3** | gatk cnv, varDict, vcfmerger2 | Yes | -| [salmon](https://github.com/COMBINE-lab/salmon/releases) | 1.2.1 | **1.4.0** | self, star-fusion | Yes | +| [python3](https://www.python.org/downloads/) | 3.7.2 | **3.9.6** | star-fusion, vcfmerger2 | Yes | +| [R](https://www.r-project.org/) | 3.6.1 | **4.1.0** | gatk cnv, varDict, vcfmerger2 | Yes | +| [salmon](https://github.com/COMBINE-lab/salmon/releases) | 1.2.1 | **1.5.2** | self, star-fusion | Yes | | [sambamba](https://github.com/biod/sambamba/releases) | 0.7.0 | **0.8.0** | | | | [samblaster](https://github.com/GregoryFaust/samblaster/releases) | 0.1.24 | **0.1.26** | | | -| [samtools](https://github.com/samtools/samtools/releases) | 1.10 | **1.11** | | Yes | -| [singularity](https://github.com/sylabs/singularity/releases) | 3.5.2 | **3.7.1** | deepvariant | Yes, release-candidate | +| [samtools](https://github.com/samtools/samtools/releases) | 1.10 | **1.13** | | Yes | +| [singularity](https://github.com/sylabs/singularity/releases) | 3.5.2 | **3.8.1** | deepvariant | Yes | | [snpEff](https://sourceforge.net/projects/snpeff/files/) | 4.3t | **4.5covid19** | covid related release? 
| Yes | | [snpSniffer](https://github.com/tgen/snpSniffer/releases) | 7.0.0 | 7.0.0 | | No | -| [star](https://github.com/alexdobin/STAR/releases) | 2.7.5a | **2.7.7a** | self, star-fusion | Yes | -| [star-fusion](https://github.com/STAR-Fusion/STAR-Fusion/releases) | 1.8.1 | **1.9.1** | | Yes | +| [star](https://github.com/alexdobin/STAR/releases) | 2.7.5a | **2.7.9a** | self, star-fusion | Yes | +| [star-fusion](https://github.com/STAR-Fusion/STAR-Fusion/releases) | 1.8.1 | **1.10.1** | | Yes | | [strelka](https://github.com/Illumina/strelka/releases) | 2.9.10 | 2.9.10 | | Yes | -| [subread](https://sourceforge.net/projects/subread/) | 2.0.0 | **2.0.1** | part of subread package | Yes | -| [tgen_mutation_burden](https://github.com/tgen/tgen_mutation_burden/releases) | 1.2.1 | 1.2.1 | | No | +| [subread](https://sourceforge.net/projects/subread/) | 2.0.0 | **2.0.3** | part of subread package | Yes | +| [tgen_mutation_burden](https://github.com/tgen/tgen_mutation_burden/releases) | 1.2.3 | 1.2.3 | | No | | [transParser](https://github.com/tgen/transParser/releases) | 1.0.1 | 1.0.1 | | No | -| [trinityrnaseq](https://github.com/trinityrnaseq/trinityrnaseq/releases) | 2.8.6 | **2.11.0** | star-fusion | Yes | +| [trinityrnaseq](https://github.com/trinityrnaseq/trinityrnaseq/releases) | 2.8.6 | **2.12.0** | star-fusion | Yes | | [vardictJava](https://github.com/AstraZeneca-NGS/VarDictJava/releases) | 1.7.0 | **1.8.2** | | Yes | | [vcfmerger2](https://github.com/tgen/vcfMerger2/releases) | 0.8.7 | 0.8.7 | | Yes | -| [vep](https://github.com/Ensembl/ensembl-vep/releases) | 98.3 | **102.0** | | Yes | +| [vep](https://github.com/Ensembl/ensembl-vep/releases) | 98.3 | **104.3** | | Yes | | [verifybamid2](https://github.com/Griffan/VerifyBamID/releases) | 1.0.6 | **2.0.1** | | Yes | | [vt](https://github.com/atks/vt/releases) | 0_57721 | 0_57721 | | Yes | diff --git a/modules/constitutional/deepvariant.jst b/modules/constitutional/deepvariant.jst index 4cc214c0..92462203 100755 --- a/modules/constitutional/deepvariant.jst +++ b/modules/constitutional/deepvariant.jst @@ -1,4 +1,5 @@ {% from 'modules/annotation/main.jst' import annotate_vcfs with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {%- macro deepvariant(sample, aligner='bwa', taskPrefix='Genome') %} @@ -152,4 +153,7 @@ bcftools index --tbi --force "{{ pass_vcf }}" {{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'deepvariant', 'constitutional', 'snp_indel_caller') }} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} + {% endmacro %} diff --git a/modules/constitutional/expansion_hunter.jst b/modules/constitutional/expansion_hunter.jst new file mode 100644 index 00000000..e3d29dc7 --- /dev/null +++ b/modules/constitutional/expansion_hunter.jst @@ -0,0 +1,74 @@ + +{% macro expansion_hunter(sample, aligner='bwa') %} +{% set bam %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %} +{% set temp_dir %}temp/{{ sample.gltype }}/constitutional_variant_calls/expansion_hunter/{{ sample.name }}{% endset %} +{% set results_dir %}{{ sample.gltype }}/constitutional_variant_calls/expansion_hunter/{{ sample.name }}{% endset %} + +- name: expansion_hunter_{{ aligner }}_{{ sample.name }}_{{ aligner }} + tags: [{{ sample.gltype }}, repeats, expansion, SRT, {{ sample.name }}] + input: {{ bam }} + output: + - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz + - {{ results_dir }}/{{ sample.name 
}}.expansion_hunter.all.vcf.gz.tbi + walltime: "24:00:00" + cpus: 1 + mem: 2G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.expansion_hunter.module }} + module load {{ constants.tools.bcftools.module }} + + rm -r {{ temp_dir }} || true + mkdir -p {{ temp_dir }} + mkdir -p {{ results_dir }} + + {# The variant catalog is in the install path, we grab the path here #} + {# ExpansionHunter is in a /bin directory, we want to go one level higher #} + eh_path=$(dirname $(dirname `which ExpansionHunter`)) + + ExpansionHunter \ + --reads {{ bam }} \ + --reference {{ constants.coyote.reference_fasta }} \ + --variant-catalog ${eh_path}/variant_catalog/grch38/variant_catalog.json \ + --output-prefix {{ temp_dir }}/{{ sample.name }}.expansion_hunter + + bcftools view \ + --output-type z \ + --output-file {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz \ + {{ temp_dir }}/{{ sample.name }}.expansion_hunter.vcf + + bcftools index --tbi --force {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz + + +- name: expansion_hunter_filter_variants_{{ sample.name }}_{{ aligner }} + tags: [{{ sample.gltype }}, constitutional, snp_indel_caller, expansion_hunter, {{ sample.name }}] + input: + - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz + - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz.tbi + output: + - {{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz + - {{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz.tbi + cpus: 1 + mem: 2G + walltime: "12:00:00" + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.bcftools.module }} + + {# Then filter out the PASS variants to a separate file #} + bcftools filter \ + --output-type z \ + --include 'FILTER == "PASS"' \ + {% if sample.gltype == 'exome' %} + --targets-file "{{ sample.capture_kit.extended_bed }}" \ + {% endif %} + "{{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz" \ + > "{{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz" + + bcftools index --tbi --force "{{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz" + +{% endmacro %} diff --git a/modules/constitutional/freebayes.jst b/modules/constitutional/freebayes.jst index 96000649..8f94044b 100755 --- a/modules/constitutional/freebayes.jst +++ b/modules/constitutional/freebayes.jst @@ -1,5 +1,6 @@ {% from 'modules/annotation/main.jst' import annotate_vcfs with context %} {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {%- macro freebayes(sample, aligner='bwa', taskPrefix='Genome') %} @@ -109,4 +110,7 @@ {{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'freebayes', 'constitutional', 'snp_indel_caller') }} -{% endmacro %} \ No newline at end of file +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} + +{% endmacro %} diff --git a/modules/constitutional/gatk_cnv.jst b/modules/constitutional/gatk_cnv.jst new file mode 100644 index 00000000..de1b9fc8 --- /dev/null +++ b/modules/constitutional/gatk_cnv.jst @@ -0,0 +1,341 @@ +{% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'modules/annotation/annot_seg.jst' import annot_seg with context %} +# The macros found in this template are used to generate the gatk4 constitutional cnv workflow.
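For orientation, the macro below chains the standard GATK4 CNV commands against a precomputed panel of normals; a minimal sketch with placeholder file names (the real tasks below add interval/SNP selection, the heterozygous-site filter, and the ModelSegments tuning parameters):

```bash
# Bin read counts over the PoN intervals
gatk CollectReadCounts \
    --input sample.bwa.bam \
    --intervals cnv.preprocessed.interval_list \
    --interval-merging-rule OVERLAPPING_ONLY \
    --output sample.counts.hdf5

# Collect allele counts at known SNP sites
gatk CollectAllelicCounts \
    --input sample.bwa.bam \
    --intervals cnv.snps.interval_list \
    --reference reference.fa \
    --output sample.allelicCounts.tsv

# Denoise against the panel of normals
gatk DenoiseReadCounts \
    --input sample.counts.hdf5 \
    --count-panel-of-normals cnv.pon.hdf5 \
    --standardized-copy-ratios sample.standardizedCR.tsv \
    --denoised-copy-ratios sample.denoisedCR.tsv

# Segment copy ratios and allele fractions, then call segments
gatk ModelSegments \
    --denoised-copy-ratios sample.denoisedCR.tsv \
    --allelic-counts sample.allelicCounts.tsv \
    --output . \
    --output-prefix sample

gatk CallCopyRatioSegments \
    --input sample.cr.seg \
    --output sample.called.seg
```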
+ +{% macro gatk_cnv_constitutional(sample, normal, aligner='bwa') %} +{% set normal_temp %}temp/tumor_only/control_data_files/{{ normal.assayCode }}{% endset %} +{% set cnvIntervals %}{{ normal_temp }}/{{ normal.gatkCnvPonIntervals | basename }}{% endset %} +{% set cnvPon %}{{ normal_temp }}/{{ normal.gatkCnvPon | basename }}{% endset %} +{% set sample_bam %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %} +{% set cnvSnps %}{{ normal_temp }}/{{ normal.gatkCnvSnps | basename }}{% endset %} +{% set temp_dir %}temp/{{ sample.gltype }}/constitutional_copy_number/gatk/{{ sample.name }}_{{ aligner }}{% endset %} +{% set results_dir %}{{ sample.gltype }}/constitutional_copy_number/gatk/{{ sample.name }}{% endset %} + +{% if wesAssayToPair is defined and normal.gatkExomeCnvSnps is defined %} + {% set exomeCnvSnps %}{{ normal_temp }}/{{ normal.gatkExomeCnvSnps | basename }}{% endset %} + + {% set exome = {} %} + {% do exome.update( (dataFiles|selectattr('assayCode', 'eq', wesAssayToPair)|selectattr('rgsm', 'eq', sample.rgsm)|first|default({'no': 'exome'}))) %} + + {% if exome.sampleMergeKey is defined %} + {% set sample_exome_bam %}exome/alignment/{{ aligner }}/{{ exome.sampleMergeKey }}/{{ exome.sampleMergeKey }}.{{ aligner }}.bam{% endset %} + {% elif exome.sampleName is defined %} + {% set sample_exome_bam %}exome/alignment/{{ aligner }}/{{ exome.sampleName }}/{{ exome.sampleName }}.{{ aligner }}.bam{% endset %} + {% endif %} +{% endif %} + +{# Step 1 #} +- name: constitutional_gatk_call_cnv_step1_{{ sample.name }}_{{ aligner }} + tags: [{{ sample.gltype }}, constitutional, cna_caller, gatk, {{ sample.name }}] + reset: prepare_tumor_only_{{ normal.name }} + input: + - {{ sample_bam }} + - {{ cnvPon }} + - {{ cnvIntervals }} + {% if sample_exome_bam is defined %} + - {{ sample_exome_bam }} + - {{ exomeCnvSnps }} + {% else %} + - {{ cnvSnps }} + {% endif %} + output: + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.standardizedCR.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.normal.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelBegin.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelBegin.cr.param + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelBegin.af.param + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.cr.param + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.af.param + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.cr.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.cr.igv.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.af.igv.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.called.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.called.igv.seg + {% if sample_exome_bam is defined %} + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts_filt.tsv + {% endif %} + cpus: 1 + mem: 16G + walltime: "24:00:00" + cmd: | + set -eu + set -o pipefail + + {# Load modules #} + module load {{ constants.tools.gatk.module }} + + {# Make working and temp directories #} + mkdir -p "{{ temp_dir }}/temp" + mkdir -p "{{ temp_dir }}/results/plots" + mkdir -p "{{ results_dir }}" + + {# CollectReadCounts for the tumor using the intervals 
provided to the config that matches the PON #} + gatk --java-options "-Xmx12g" CollectReadCounts \ + --tmp-dir {{ temp_dir }}/temp/ \ + --input {{ sample_bam }} \ + --intervals {{ cnvIntervals }} \ + --interval-merging-rule OVERLAPPING_ONLY \ + --read-filter FirstOfPairReadFilter \ + --output {{ temp_dir }}/{{ sample.name }}.{{ aligner }}.counts.hdf5 + + {# CollectAllelicCounts for the tumor using the snps provided to the config #} + gatk --java-options "-Xmx12g" CollectAllelicCounts \ + --tmp-dir {{ temp_dir }}/temp/ \ + {% if sample_exome_bam is defined %} + --input {{ sample_exome_bam }} \ + --intervals {{ exomeCnvSnps }} \ + {% else %} + --input {{ sample_bam }} \ + --intervals {{ cnvSnps }} \ + {% endif %} + --reference {{ constants.coyote.reference_fasta }} \ + --output {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv + + {% if sample_exome_bam is defined %} + awk -F'\t' '$1 ~ /^@/ { print $0 ; next } ; + $1 == "CONTIG" { print $0 ; next } ; + $3 != 0 && $4 != 0 { + DP = $3+$4 ; + RATIO = $4/DP ; + if ( DP > 99 && RATIO >= 0.4643519 && RATIO <= 0.5043519) { print $0 } + }' {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv > {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts_filt.tsv + {% endif %} + + {# Denoise the tumor using the panel of normals provided to in the configuration #} + gatk --java-options "-Xmx12g" DenoiseReadCounts \ + --tmp-dir {{ temp_dir }}/temp/ \ + --input {{ temp_dir }}/{{ sample.name }}.{{ aligner }}.counts.hdf5 \ + --count-panel-of-normals {{ cnvPon }} \ + --standardized-copy-ratios {{ results_dir }}/{{ sample.name }}.{{ aligner }}.standardizedCR.tsv \ + --denoised-copy-ratios {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv + + gatk --java-options "-Xmx12g" ModelSegments \ + --tmp-dir {{ temp_dir }}/temp/ \ + --number-of-smoothing-iterations-per-fit 1 \ + --number-of-changepoints-penalty-factor 0.05 \ + --kernel-variance-allele-fraction 0.0 \ + --kernel-scaling-allele-fraction 0.0 \ + --smoothing-credible-interval-threshold-copy-ratio 3.5 \ + --smoothing-credible-interval-threshold-allele-fraction 3.5 \ + --window-size 4 \ + --window-size 8 \ + --window-size 16 \ + --window-size 32 \ + --window-size 64 \ + --window-size 128 \ + --window-size 256 \ + --denoised-copy-ratios {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv \ + {% if sample_exome_bam is defined %} + --allelic-counts {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts_filt.tsv \ + {% else %} + --allelic-counts {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv \ + {% endif %} + --output {{ results_dir }} \ + --output-prefix {{ sample.name }}.{{ aligner }} + + gatk --java-options "-Xmx12g" CallCopyRatioSegments \ + --tmp-dir {{ temp_dir }}/temp/ \ + --input {{ results_dir }}/{{ sample.name }}.{{ aligner }}.cr.seg \ + --output {{ results_dir }}/{{ sample.name }}.{{ aligner }}.called.seg + +{# Step 2 #} +- name: constitutional_gatk_call_cnv_step2_{{ sample.name }}_{{ aligner }} + tags: [{{ sample.gltype }}, constitutional, cna_caller, gatk, {{ sample.name }}] + reset: constitutional_gatk_call_cnv_step1_{{ sample.name }}_{{ aligner }} + input: + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.tsv + output: + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.genderCorrected.tsv + - {{ results_dir }}/{{ sample.name }}.{{ aligner 
}}.modelFinal.genderCorrected.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.re_centered.cr.igv.seg + - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.dLRs.tsv + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}.hets.density.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_cna_withhets.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_cna.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_baf.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr1.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr2.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr3.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr4.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr5.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr6.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr7.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr8.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr9.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr10.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr11.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr12.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr13.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr14.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr15.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr16.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr17.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr18.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr19.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr20.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr21.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr22.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chrX.png + - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chrY.png + cpus: 1 + mem: 16G + walltime: "24:00:00" + cmd: | + set -eu + set -o pipefail + + {# Load modules #} + module load {{ constants.tools.R.module }} + module load {{ constants.tools.python_3_7_2.module }} + + {# Make working and temp directories #} + mkdir -p "{{ temp_dir }}" + + {# Get the sex results of the normal. If undetermined then use the sex from the config if known. else default to female. 
#} + SEX="Female" + + touch {{ temp_dir }}/results/plots/{{ sample.name }}.{{ aligner }}_chrY.png + cp {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.denoisedCR.genderCorrected.tsv + cp {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.seg {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg + contig_names_string="chr1CONTIG_DELIMITERchr2CONTIG_DELIMITERchr3CONTIG_DELIMITERchr4CONTIG_DELIMITERchr5CONTIG_DELIMITERchr6CONTIG_DELIMITERchr7CONTIG_DELIMITERchr8CONTIG_DELIMITERchr9CONTIG_DELIMITERchr10CONTIG_DELIMITERchr11CONTIG_DELIMITERchr12CONTIG_DELIMITERchr13CONTIG_DELIMITERchr14CONTIG_DELIMITERchr15CONTIG_DELIMITERchr16CONTIG_DELIMITERchr17CONTIG_DELIMITERchr18CONTIG_DELIMITERchr19CONTIG_DELIMITERchr20CONTIG_DELIMITERchr21CONTIG_DELIMITERchr22CONTIG_DELIMITERchrX" + contig_lengths_string="248956422CONTIG_DELIMITER242193529CONTIG_DELIMITER198295559CONTIG_DELIMITER190214555CONTIG_DELIMITER181538259CONTIG_DELIMITER170805979CONTIG_DELIMITER159345973CONTIG_DELIMITER145138636CONTIG_DELIMITER138394717CONTIG_DELIMITER133797422CONTIG_DELIMITER135086622CONTIG_DELIMITER133275309CONTIG_DELIMITER114364328CONTIG_DELIMITER107043718CONTIG_DELIMITER101991189CONTIG_DELIMITER90338345CONTIG_DELIMITER83257441CONTIG_DELIMITER80373285CONTIG_DELIMITER58617616CONTIG_DELIMITER64444167CONTIG_DELIMITER46709983CONTIG_DELIMITER50818468CONTIG_DELIMITER156040895" + + + for CHR in {1..24} + do + if [[ ${CHR} -eq "23" ]] + then + CHR="chrX" + elif [[ ${CHR} -eq "24" ]] + then + CHR="chrY" + else + CHR="chr${CHR}" + fi + + declare START_${CHR}=$(gawk -F'\t' -v CHROM=${CHR} '$1 == CHROM' {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg | sort -k2,2n | awk -F'\t' 'NR == 1 { print $2 }') + export START_${CHR} + declare STOP_${CHR}=$(gawk -F'\t' -v CHROM=${CHR} '$1 == CHROM' {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg | sort -k3,3nr | awk -F'\t' 'NR == 1 { print $3 }') + export STOP_${CHR} + done + + for CHR in {1..24} + do + # Get centromere start and stop + if [[ $CHR -eq "23" ]] + then + CHR="chrX" + elif [[ $CHR -eq "24" ]] + then + CHR="chrY" + else + CHR="chr${CHR}" + fi + + eval "START=\${START_${CHR}}" + eval "STOP=\${STOP_${CHR}}" + + START_C=$(gawk -v CHR=$CHR '$1==CHR { print $2 }' {{ constants.coyote.centromere }}) + STOP_C=$(gawk -v CHR=$CHR '$1==CHR { print $3 }' {{ constants.coyote.centromere }}) + + if [[ ${START} -ge ${START_C} ]] + then + START=${STOP_C} + fi + + export CHR + export START + export START_C + export STOP + export STOP_C + + echo -e "${CHR}\t${START}\t${STOP}" + echo -e "${CHR}\t${START_C}\t${STOP_C}" + + LINES=$(wc -l < "{{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg") + HEADER=$(grep -c "@" "{{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg" || :) + + gawk -F'\t' -v HEADER="${HEADER}" -v CHROM="${CHR}" -v START="${START}" -v STOP="${STOP}" -v LINES="${LINES}" -v STARTC="${START_C}" -v STOPC="${STOP_C}" 'BEGIN { INC=1 } ; + # Skip GATK header + $0 ~ /^@/ { next } ; + # Print the header + NR == HEADER + 1 { print $0 ; next } ; + # Remove segments that fall within the centromere + $2 == CHROM && $3 >= STARTC && $4 <= STOPC { next } ; + # Store the second line in the seg file and set the INC to 2 if we are working on the chromosome + NR == HEADER + 2 && $1 == CHROM { C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; 
C9=$9 ; C10=$10 ; C11=$11 ; INC=2 ; next } ; + NR == HEADER + 2 && $1 != CHROM { C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ; + ## Last line of seg file + NR == LINES && $1 == CHROM && INC == 1 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,START,STOP,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ; + NR == LINES && $1 == CHROM && INC == 2 { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,STOP,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ; + NR == LINES && $1 == CHROM && INC == 3 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,STOP,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ; + NR == LINES && $1 != CHROM && INC == 2 { OFS = "\t" ; print C1,START,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ; + NR == LINES && $1 != CHROM && INC == 3 { OFS = "\t" ; print C1,C2,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ; + NR == LINES && $1 != CHROM { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ; + ### Segment is the current chromosome we are working on + ## First segment in the CHROM, print previous segment variables + NR != LINES && $1 == CHROM && INC == 1 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=2 ; next } ; + ## Second segment for CHROM, Tests and print is for the First segment + # If the previous segments start is >= the centromere stop + NR != LINES && $1 == CHROM && INC == 2 && C2 >= STOPC { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ; + NR != LINES && $1 == CHROM && INC == 2 && C2 < STARTC && C3 < STOPC && $2 > STARTC && $3 > STOPC { OFS = "\t" ; print C1,START,STARTC,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=STOPC ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ; + NR != LINES && $1 == CHROM && INC == 2 && C2 < STARTC && C3 < STOPC && $2 < STARTC { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ; + NR != LINES && $1 == CHROM && INC == 2 && C3 > STOPC { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ; + # Third segment for CHROM + NR != LINES && $1 == CHROM && INC == 3 && C2 < STARTC && C3 < STOPC && $2 > STARTC && $3 > STOPC { OFS = "\t" ; print C1,C2,STARTC,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=STOPC ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ; + NR != LINES && $1 == CHROM && INC == 3 && C2 < STARTC && C3 < STOPC && $2 < STARTC { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ; + NR != LINES && $1 == CHROM && INC == 3 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ; + ## Segment is on a Chromosome that is NOT our current CHROM + NR != LINES && $1 != CHROM && INC == 2 { OFS = "\t" ; print C1,START,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; 
C11=$11 ; INC=1 ; next } ; + NR != LINES && $1 != CHROM && INC == 3 { OFS = "\t" ; print C1,C2,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=1 ; next } ; + NR != LINES && $1 != CHROM && INC == 1 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next }' {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg > {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg.temp + + mv {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg.temp {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg + done + + python ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.cna_seg_extend }} {{ constants.coyote.centromere }} {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg + + Rscript ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.plotCNVplus_Rscript }} \ + --point_size 2 \ + --hetDPfilter 1 \ + --hetAFlow 0.45 \ + --hetAFhigh 0.55 \ + --hetMAFposteriorOffset 0.01 \ + --lowerCNvalidatePeakOffset 0.125 \ + --UpperCNvalidatePeakOffset 0.125 \ + --lowerCNcenteringPeakOffset 0.125 \ + --UpperCNcenteringPeakOffset 0.125 \ + --sample_name={{ sample.name }} \ + --output_prefix={{ sample.name }}.{{ aligner }} \ + --plots_directory={{ temp_dir }}/results/plots \ + --re_center_CNA=TRUE \ + --re_centered_seg_directory={{ temp_dir }}/results \ + --contig_names_string=${contig_names_string} \ + --contig_lengths_string=${contig_lengths_string} \ + --denoised_copy_ratios_file={{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.denoisedCR.genderCorrected.tsv \ + --allelic_counts_file={{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.tsv \ + --modeled_segments_file={{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg + + {# If the task has already been run then we need to remove the plots directory. 
#} + if [ -d {{ results_dir }}/plots ] + then + rm -rf {{ results_dir }}/plots + fi + + mv {{ temp_dir }}/results/* {{ results_dir }}/ + + {% set final_seg %}{{ results_dir }}/{{ sample.name }}.{{ aligner }}.re_centered.cr.igv.seg{% endset %} + {{- annot_seg(final_seg, sample.name) }} + + {% set task %}constitutional_gatk_call_cnv_step2_{{ sample.name }}_{{ aligner }}{% endset %} + {% set directory %}{{ temp_dir }}{% endset %} + {{- remove_files(directory,none,task) }} + +{% endmacro %} diff --git a/modules/constitutional/gatk_genotypegvcf.jst b/modules/constitutional/gatk_genotypegvcf.jst index ed37f949..dcd6aa41 100755 --- a/modules/constitutional/gatk_genotypegvcf.jst +++ b/modules/constitutional/gatk_genotypegvcf.jst @@ -1,5 +1,6 @@ {% from 'modules/annotation/main.jst' import annotate_vcfs with context %} {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {%- macro genotypegvcf(sample, aligner='bwa', taskPrefix='Genome') %} @@ -136,12 +137,12 @@ --resource "{{ resource }}" \ {% endfor %} --variant "{{ all_vcf }}" \ - --output "{{ temp_dir }}/{{ sample.name }}.hc.fvt.vcf.gz" \ - + --output "{{ temp_dir }}/{{ sample.name }}.hc.fvt.vcf.gz" + {# Then filter out the passing variants to a separate file #} bcftools filter \ --output-type z \ - --include 'FILTER == "."' \ + --include 'FILTER == "PASS"' \ {% if sample.gltype == 'exome' %} --targets-file "{{ sample.capture_kit.extended_bed }}" \ {% endif %} @@ -152,5 +153,7 @@ {{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'hc', 'constitutional', 'genotype_hc_gvcf') }} -{% endmacro %} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} +{% endmacro %} diff --git a/modules/constitutional/main.jst b/modules/constitutional/main.jst index e7917979..d17f5c05 100755 --- a/modules/constitutional/main.jst +++ b/modules/constitutional/main.jst @@ -7,12 +7,29 @@ {% from 'modules/constitutional/deepvariant.jst' import deepvariant with context %} {% from 'modules/constitutional/octopus.jst' import octopus_constitutional with context %} {% from 'modules/constitutional/ichorcna.jst' import ichorcna with context %} +{% from 'modules/constitutional/gatk_cnv.jst' import gatk_cnv_constitutional with context %} +{% from 'modules/constitutional/expansion_hunter.jst' import expansion_hunter with context %} # Constitutional tools generally operate on a single DNA samples and should not run -# on "tumor" samples with the exception of ichor that cna be run on tumor or constitutional. +# on "tumor" samples with the exception of ichor that can be run on tumor or constitutional. 
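Stepping back to the gatk_genotypegvcf.jst hunk above: after FilterVariantTranches, passing records carry FILTER=PASS rather than ".", which is why the pass-only subset now selects on 'FILTER == "PASS"'. A quick way to sanity-check the resulting files (file names hypothetical):

```bash
# Tally FILTER values in the CNN-filtered VCF; passing records should be PASS
bcftools query -f '%FILTER\n' sample.hc.fvt.vcf.gz | sort | uniq -c

# The pass-only VCF should then contain no non-PASS records (expect 0)
bcftools view -H -e 'FILTER=="PASS"' sample.hc.pass.vcf.gz | wc -l
```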
{%- macro constitutional_variant_calling(samples) %} +{% set normSamples = {} %} + +{% if controlDataFiles is defined %} +{% for file in controlDataFiles %} + + {% set name %}{{ study }}_{{ file.assayCode }}{% endset %} + {% do file.update({'name': name}) %} + + {% if name not in normSamples %} + {% do normSamples.update({name: {}}) %} + {% do normSamples[name].update(file) %} + {% endif %} +{% endfor %} +{% endif %} + {%- for sample in samples.values() if sample.gltype in ('exome', 'genome') %} {% if sample.gltype in 'exome' %} @@ -22,6 +39,11 @@ {% if tasks.Genome_constitutional_cna_caller_ichor|default(true) %} {{- ichorcna(sample, aligner='bwa', taskPrefix=taskPrefix) }} {% endif %} + {# MODULE_DISABLED: Human specific - variant catalog does not exist for canine + {% if tasks[taskPrefix+"_constitutional_structural_caller_expansion_hunter"]|default(true) %} + {{- expansion_hunter(sample, aligner='bwa') }} + {% endif %} + #} {% endif %} {%- if sample.subGroup|lower != 'tumor' %} @@ -49,6 +71,11 @@ {%- if tasks[taskPrefix+"_constitutional_snp_indel_caller_octopus"]|default(false) %} {{- octopus_constitutional(sample, aligner='bwa', taskPrefix=taskPrefix) }} {% endif %} + {%- if tasks[taskPrefix+"_constitutional_cna_caller_gatk"]|default(true) %} + {% for normal in normSamples.values() if normal.assayCode == sample.assayCode and normal.gatkCnvPon is defined %} + {{- gatk_cnv_constitutional(sample, normal, aligner='bwa') }} + {% endfor %} + {% endif %} {% endif %} {% endfor %} diff --git a/modules/constitutional/octopus.jst b/modules/constitutional/octopus.jst index 2ab06148..c3c0b12f 100644 --- a/modules/constitutional/octopus.jst +++ b/modules/constitutional/octopus.jst @@ -1,5 +1,6 @@ {% from 'modules/annotation/main.jst' import annotate_vcfs with context %} {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {%- macro octopus_constitutional(sample, aligner='bwa', taskPrefix='Genome') %} {%- set bam %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %} @@ -127,4 +128,7 @@ {{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'octopus', 'constitutional', 'snp_indel_caller') }} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} + {% endmacro -%} diff --git a/modules/constitutional/strelka2.jst b/modules/constitutional/strelka2.jst index d8dc33a8..b7317ada 100755 --- a/modules/constitutional/strelka2.jst +++ b/modules/constitutional/strelka2.jst @@ -1,5 +1,6 @@ {% from 'modules/annotation/main.jst' import annotate_vcfs with context %} {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {%- macro strelka2_constitutional(sample, aligner='bwa', taskPrefix='Genome') %} @@ -138,4 +139,7 @@ {{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'strelka2', 'constitutional', 'snp_indel_caller' ) }} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} + {% endmacro %} diff --git a/modules/dna_alignment/bowtie2_samtools.jst b/modules/dna_alignment/bowtie2_samtools.jst index 035c7ada..27c38f85 100644 --- a/modules/dna_alignment/bowtie2_samtools.jst +++ b/modules/dna_alignment/bowtie2_samtools.jst @@ -35,6 +35,14 @@ set -eu set -o pipefail + {% if fastq.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ 
constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. + #} rm -r "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" || true mkdir -p "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" @@ -249,8 +257,8 @@ {# Extracting ecoli alignments #} samtools view \ - -b - -o "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli.bam" + -b \ + -o "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli.bam" \ "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2.bam" \ "U00096.3" @@ -265,7 +273,11 @@ {# Calculating spike and scale for genomecov #} SPIKE=$(wc -l < "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli.bed") - SCALE=$(echo "10000 / $SPIKE" | bc -l) + if [ $SPIKE -gt 0 ]; then + SCALE=$(echo "10000 / $SPIKE" | bc -l) + else + SCALE=$(echo "10000") + fi TAB=$'\t' cat < "temp/{{ sample.gltype }}/alignment/bowtie2/callRegions.bed" @@ -277,11 +289,10 @@ bedtools genomecov \ -bg \ -scale $SCALE \ - -i "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2.bed" \ - -g "temp/{{ sample.gltype }}/alignment/bowtie2/callRegions.bed" \ + -i "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.no_decoy.bowtie2.bed" \ + -g "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/callRegions.bed" \ > "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli_norm.bedgraph" {% endif %} {% endmacro %} - diff --git a/modules/dna_alignment/bwa_mem2_samtools.jst b/modules/dna_alignment/bwa_mem2_samtools.jst new file mode 100644 index 00000000..7bedaa8f --- /dev/null +++ b/modules/dna_alignment/bwa_mem2_samtools.jst @@ -0,0 +1,312 @@ +# Aligns fastqs for a sample using BWA MEM2. Samples may have multiple read +# groups. +# +# ? ? +# ? -----fastqs--> temp/.bwa.bam +# ? ? +# + +# This alignment command prefix is shared by all modules using bwa +{% from 'utilities/read_group_line.jst' import read_group_line with context %} +{% from 'utilities/remove_files.jst' import remove_files with context %} + +# This macro splits large fastqs into chunks prior to aligning. +# If fastq is less than reads_per_chunk (48000000) then one chunk is made. 
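A worked example of the chunk math inside the macro below, assuming (as the template's divide-by-two implies) that numberOfReads counts R1 and R2 together and that each read spans four FASTQ lines — the numbers here are hypothetical:

```bash
# numberOfReads = 192,000,000 (R1 + R2 combined) -> 96,000,000 reads per fastq
# n_chunks = ceil(96,000,000 / 48,000,000) = 2
# n_lines  = 48,000,000 * 4 = 192,000,000 FASTQ lines per chunk
zcat temp/fastqs/EXAMPLE_R1.fastq.gz \
  | split --numeric-suffixes --suffix-length 2 --lines 192000000 - chunks/
ls chunks/ | wc -l   # expect 2 chunks: 00 and 01
```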
+{% macro bwa_mem2_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) %} + +{% set temp_dir %}temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}{% endset %} +{% set results_dir %}{{ sample.gltype }}/alignment/bwa/{{ sample.name }}{% endset %} + +{% for rgid, rg in sample.read_groups.items() %} +{% set r1fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R1')|first %} +{% set r2fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R2')|first %} + +{# Comment about the math, using the assumed legacy illumina interpretation, it is a TGen modification #} +{% set n_lines = (reads_per_chunk * 4)|int %} +{% set n_chunks = (r1fastq.numberOfReads / 2 / reads_per_chunk)|round(0, method='ceil')|int %} +{% if n_chunks > 99 %}{{ raise('ValueError', 'Too many chunks!') }}{% endif %} + +{% for fastq in [r1fastq, r2fastq] %} +- name: split_fastq_{{ fastq.basename | replace(".", "_") }} + tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, split, {{ sample.name }}] + reset: predecessors + input: + - temp/fastqs/{{ fastq.basename }} + cpus: 1 + walltime: "4:00:00" + cmd: | + set -eu + set -o pipefail + + {% if fastq.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. + #} + rm -r "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/" || true + mkdir -p "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/" + + zcat "temp/fastqs/{{ fastq.basename }}" |\ + split \ + --numeric-suffixes \ + --suffix-length 2 \ + --lines {{ n_lines }} \ + - \ + "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/" + + N_CREATED=$(ls "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/" | wc -l) + + if [[ ${N_CREATED} -ne {{ n_chunks }} ]] + then + echo "Chunks created does not match expected value" + exit 1 + fi + +{% endfor %} + +{% for i in range(n_chunks) %} +{% set chunk_suffix = '%02d' % i %} +- name: chunked_bwa_mem2_samtools_fixmate_{{ sample.name }}_{{ rgid }}_{{ chunk_suffix }} + tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}] + reset: predecessors + after: + - split_fastq_{{ r1fastq.basename | replace(".", "_") }} + - split_fastq_{{ r2fastq.basename | replace(".", "_") }} + output: {{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam + walltime: "48:00:00" + cpus: 10 + mem: 32G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.bwa_mem2.module }} + module load {{ constants.tools.samtools.module }} + + {# + # If this task was interrupted previously, temp files may exist + # that will cause errors with samtools sort. Here, we purge any + # existing temp files before making the directory again. 
+ #} + rm -r "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}_st_sort_temp/" || true + mkdir -p "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}_st_sort_temp/" + + {# No long options available for the following: + bwa mem + -v INT Verbosity: 3 = message (default = 3) + -Y Use soft clipping for supplementary alignments + -K INT Process INT input bases in each batch regardless of nThreads (for reproducibility) + -t INT Number of threads to use + -R STR Read group header line such as '@RG\tID:foo\tSM:bar' [null] + + samtools fixmate + -m Add mate score tag, REQUIRED for samtools markdup + - Input from stdin + - Output to stdout + + samtools sort + -l INT Set compression level, from 0 (uncompressed) to 9 (best) + -m INT Set maximum memory per thread; suffix K/M/G recognized [768M] + -T PREFIX Write temporary files to PREFIX.nnnn.bam + - Input from stdin + -o FILE Write final output to FILE rather than standard output + #} + bwa-mem2 mem \ + -v 3 \ + -Y \ + -K 100000000 \ + -t 9 \ + -R "{{ read_group_line(rg, format='bwa') }}" \ + "{{ constants.coyote.bwa_mem2_index }}" \ + "{{ temp_dir }}/{{ rgid }}/{{ r1fastq.basename }}/{{ chunk_suffix }}" \ + "{{ temp_dir }}/{{ rgid }}/{{ r2fastq.basename }}/{{ chunk_suffix }}" |\ + samtools fixmate \ + --threads 1 \ + -m \ + - \ + - |\ + samtools sort \ + -T "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}_st_sort_temp/{{ chunk_suffix }}" \ + -l 2 \ + -m 768M \ + --threads 9 \ + --output-fmt BAM \ + -o "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam" \ + - + +{% endfor %} + +- name: chunked_samtools_merge_rgid_bams_{{ sample.name }}_{{ rgid }} + tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}] + reset: predecessors + input: + {% for i in range(n_chunks) %} + {% set chunk_suffix = '%02d' % i %} + - {{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam + {% endfor %} + output: {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam + walltime: "24:00:00" + cpus: 8 + mem: 8G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.samtools.module }} + + {# No long options available for the following: + -c Combine @RG headers with colliding IDs [alter IDs to be distinct] + -f Overwrite the output BAM if exist + -l INT Compression level, from 0 to 9 [-1] + #} + samtools merge \ + --threads 8 \ + -c \ + -f \ + -l 6 \ + "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam" \ + {% for i in range(n_chunks) %} + {% set chunk_suffix = '%02d' % i %} + {% if not loop.last %} + "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam" \ + {% else %} + "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam" + {% endif %} + {% endfor %} + + {# Cleanup the tempfiles + # removes fastq chunks and intermediate chunk bam files + #} + {% if not debug %} + rm -r "{{ temp_dir }}/{{ rgid }}/" + {% endif %} + + +- name: samtools_markdup_{{ sample.name }}_{{ rgid }}_bwa + tags: [{{ sample.gltype }}, alignment, mark_duplicates, samtools, {{ sample.name }}] + reset: predecessors + methods: Duplicate reads for {{ sample.name }} were marked with + {{ constants.tools.samtools.verbose }} markdup. 
+ input: {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam + output: + - {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam + - {{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt + walltime: "48:00:00" + cpus: 4 + mem: 16G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.samtools.module }} + + {# + # If this task was interrupted previously, temp files may exist + # that will cause errors with samtools markdup. Here, we purge any + # existing temp files before making the directory again. + #} + rm -r "{{ temp_dir }}/markdup_temp/{{ sample.name }}_{{ rgid }}" || true + rm "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt" || true + mkdir -p "{{ temp_dir }}/markdup_temp/{{ sample.name }}_{{ rgid }}" + mkdir -p "{{ temp_dir }}/stats/" + + {# No long options available for the following: + -d Optical distance + -S Mark supplemenary alignments of duplicates as duplicates (slower) + -f Write stats to named file. Implies -s (report stats) + -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp + 2> Stats are output to stderr which is redirected to ".bwa.bam.markdup.txt" + #} + samtools markdup \ + -d {{ opt_dup_distance }} \ + -S \ + -f "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt" \ + --threads 4 \ + --write-index \ + -T "{{ temp_dir }}/markdup_temp/{{ sample.name }}_{{ rgid }}" \ + "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam" \ + "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam" + +{% endfor %} + +- name: samtools_markdup_merge_rg_bams_{{ sample.name }} + tags: [{{ sample.gltype }}, alignment, mark_duplicates, samtools, {{ sample.name }}] + reset: predecessors + methods: Duplicate reads for {{ sample.name }} were marked with + {{ constants.tools.samtools.verbose }} markdup. + input: + {% for rgid in sample.read_groups %} + - {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam + - {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt + {% endfor %} + output: + - {{ temp_dir }}/{{ sample.name }}.bwa.md.bam + - {{ results_dir }}/stats/{{ sample.name }}.bwa.bam.samtools.markdup.txt + walltime: "48:00:00" + cpus: 8 + mem: 16G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.samtools.module }} + + mkdir -p "{{ results_dir }}/stats/" + + {# No long options available for the following: + -c Combine @RG headers with colliding IDs [alter IDs to be distinct] + -f Overwrite the output BAM if exist + -l INT Compression level, from 0 to 9 [-1] + #} + samtools merge \ + --threads 8 \ + -c \ + -f \ + -l 6 \ + "{{ temp_dir }}/{{ sample.name }}.bwa.md.bam" \ + {% for rgid in sample.read_groups %} + {% if not loop.last %} + "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam" \ + {% else %} + "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam" + {% endif %} + {% endfor %} + + {# Merging markdup stat files #} + {% set first_sample_rgid = (sample.read_groups.values()|first).rgid|default('') %} + + {# + Pasting the files together and stripping string down to integer values + then bc sum them together respectively. 
+ #} + paste \ + {% for rgid in sample.read_groups %} + "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt" + {%- endfor %} + | head -n -1 | tail -n +2 | sed 's/\t[a-zA-Z :_]\+/+/g' | awk '{ print $NF }' | bc \ + > {{ temp_dir }}/stats/{{ sample.name }}.bwa.bam.samtools_markdup_merge.txt + + paste \ + {% for rgid in sample.read_groups %} + "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt" + {%- endfor %} + | tail -n1 | sed 's/\t[a-zA-Z :_]\+/\ /g' | cut -d' ' -f2- |\ + awk '{m=$1;for(i=1;i<=NF;i++)if($i>m)m=$i;print m}' \ + >> {{ temp_dir }}/stats/{{ sample.name }}.bwa.bam.samtools_markdup_merge.txt + + + {# Combining the summed values with their naming #} + tail -n +2 {{ temp_dir }}/stats/{{ sample.name }}_{{ first_sample_rgid }}.bwa.bam.samtools.markdup.txt |\ + cut -d':' -f1 |\ + paste - {{ temp_dir }}/stats/{{ sample.name }}.bwa.bam.samtools_markdup_merge.txt |\ + sed 's/\t/: /g' \ + > {{ results_dir }}/stats/{{ sample.name }}.bwa.bam.samtools.markdup.txt + + {# insert COMMAND: into the top of the merged file #} + header=$(head -n 1 {{ temp_dir }}/stats/{{ sample.name }}_{{ first_sample_rgid }}.bwa.bam.samtools.markdup.txt) + sed -i "1i ${header}" {{ results_dir }}/stats/{{ sample.name }}.bwa.bam.samtools.markdup.txt + +{% endmacro %} diff --git a/modules/dna_alignment/bwa_mem_gatk_picard.jst b/modules/dna_alignment/bwa_mem_gatk_picard.jst index 79ff436f..d6debd50 100755 --- a/modules/dna_alignment/bwa_mem_gatk_picard.jst +++ b/modules/dna_alignment/bwa_mem_gatk_picard.jst @@ -34,6 +34,14 @@ set -eu set -o pipefail + {% if fastq.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. + #} rm -r "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" || true mkdir -p "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" @@ -60,7 +68,7 @@ - name: chunked_bwa_mem_samtools_view_{{ sample.name }}_{{ rgid }}_{{ chunk_suffix }} tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}] reset: predecessors - after: + after: - split_fastq_{{ r1fastq.basename | replace(".", "_") }} - split_fastq_{{ r2fastq.basename | replace(".", "_") }} walltime: "48:00:00" @@ -163,7 +171,7 @@ --MAX_RECORDS_IN_RAM 300000 \ --INPUT "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md-uns.bam" \ --OUTPUT "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md.bam" - + {# Cleanup the tempfiles #} {% if not debug %} rm "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md-uns.bam" diff --git a/modules/dna_alignment/bwa_mem_samtools.jst b/modules/dna_alignment/bwa_mem_samtools.jst index 47356b1f..00f3cf44 100755 --- a/modules/dna_alignment/bwa_mem_samtools.jst +++ b/modules/dna_alignment/bwa_mem_samtools.jst @@ -34,6 +34,14 @@ set -eu set -o pipefail + {% if fastq.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. 
+ #} rm -r "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" || true mkdir -p "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" @@ -60,7 +68,7 @@ - name: chunked_bwa_mem_samtools_fixmate_{{ sample.name }}_{{ rgid }}_{{ chunk_suffix }} tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}] reset: predecessors - after: + after: - split_fastq_{{ r1fastq.basename | replace(".", "_") }} - split_fastq_{{ r2fastq.basename | replace(".", "_") }} walltime: "48:00:00" @@ -219,4 +227,3 @@ "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md.bam" {% endmacro %} - diff --git a/modules/dna_alignment/main.jst b/modules/dna_alignment/main.jst index 8f07b290..faa8a722 100755 --- a/modules/dna_alignment/main.jst +++ b/modules/dna_alignment/main.jst @@ -1,6 +1,8 @@ # DNA Alignment with BWA MEM # Take DNA fastq data files to aligned bams with qc data {% from 'modules/dna_alignment/bwa_mem_samtools.jst' import bwa_mem_samtools_chunked with context %} +{% from 'modules/dna_alignment/bwa_mem2_samtools.jst' import bwa_mem2_samtools_chunked with context %} +{% from 'modules/dna_alignment/pb_fq2bam.jst' import fq2bam with context %} {% from 'modules/dna_alignment/bwa_mem_gatk_picard.jst' import bwa_mem_gatk_picard_chunked with context %} {% from 'modules/dna_alignment/bwa_mem_samblaster_sambamba.jst' import bwa_mem_blaster_bamba with context %} {% from 'modules/dna_alignment/gatk_baserecalibration.jst' import baserecalibration, nobaserecalibration with context %} @@ -23,14 +25,19 @@ {% set opt_dup_distance = 100 %} {% endif %} - {% if alignment_style == 'tgen' %} - {{- bwa_mem_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) }} - {% elif alignment_style == 'broad' %} - {{- bwa_mem_gatk_picard_chunked(sample, reads_per_chunk, opt_dup_distance) }} - {% elif alignment_style == 'ashion' %} - {{- bwa_mem_blaster_bamba(sample) }} + {# PCRfree will not have any pcr cycles #} + {% if sample.pcrCycles is defined and sample.pcrCycles == 0 %} + {{- bwa_mem2_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) }} {% else %} - {{ raise('Unknown dnaAlignmentStyle: {}'.format(dnaAlignmentStyle)) }} + {% if alignment_style == 'tgen' %} + {{- bwa_mem_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) }} + {% elif alignment_style == 'broad' %} + {{- bwa_mem_gatk_picard_chunked(sample, reads_per_chunk, opt_dup_distance) }} + {% elif alignment_style == 'ashion' %} + {{- bwa_mem_blaster_bamba(sample) }} + {% else %} + {{ raise('Unknown dnaAlignmentStyle: {}'.format(dnaAlignmentStyle)) }} + {% endif %} {% endif %} {% if sample.gltype == 'genome' %} diff --git a/modules/dna_alignment/pb_fq2bam.jst b/modules/dna_alignment/pb_fq2bam.jst new file mode 100644 index 00000000..7997a6de --- /dev/null +++ b/modules/dna_alignment/pb_fq2bam.jst @@ -0,0 +1,49 @@ +# Aligns fastqs from a sample using Parabricks fq2BAM. This aligns all samples +# in on large run. No need to split or merge. 
It will also run +# mark duplicatese and BQSR if needed + +# This alignment command prefix is shared by all modules using bwa +{% from 'utilities/read_group_line.jst' import read_group_line with context %} +{% from 'utilities/remove_files.jst' import remove_files with context %} + +{% macro fq2bam(sample, opt_dup_distance) %} + {% set bam %}{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.bam{% endset %} + +- name: fq2bam_{{sample.name}} + tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}] + reset: predecessors + input: + {% for rgid, rg in sample.read_groups.items() %} + {% set r1fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R1')|first %} + {% set r2fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R2')|first %} + - temp/fastqs/{{ r1fastq.basename }} + - temp/fastqs/{{ r2fastq.basename }} + {% endfor %} + output: temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.bam + walltime: "24:00:00" + sbatch_args: ['-p', 'gpu', '--exclusive'] + cmd: | + set -eu + set -o pipefail + + rm -r "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/markdup_temp/" || true + mkdir -p "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/markdup_temp/" + mkdir -p "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}" + mkdir -p "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/stats/" + + module load {{ constants.tools.parabricks.module }} + + pbrun fq2bam \ + --tmp-dir "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/markdup_temp/" \ + --ref "{{ constants.coyote.bwa_index }}" \ + --bwa-options "-Y -K 100000000" \ + {% for rgid, rg in sample.read_groups.items() %} + {% set r1fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R1')|first %} + {% set r2fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R2')|first %} + --in-fq "temp/fastqs/{{ r1fastq.basename }}" "temp/fastqs/{{ r2fastq.basename }}" "{{ read_group_line(rg, format='bwa') }}" \ + {% endfor %} + --out-duplicate-metrics "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/stats/{{ sample.name }}.mdmetrics.txt" \ + --optical-duplicate-pixel-distance {{ opt_dup_distance }} \ + --out-bam "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.bam" + +{% endmacro %} diff --git a/modules/metrics/main.jst b/modules/metrics/main.jst index d86cc1ae..b0726d05 100644 --- a/modules/metrics/main.jst +++ b/modules/metrics/main.jst @@ -1,5 +1,7 @@ {%- from 'modules/metrics/mutation_burden.jst' import mutation_burden with context %} {%- from 'modules/metrics/msisensor_pro.jst' import msisensor_pro with context %} +{%- from 'modules/metrics/sigprofiler.jst' import sigprofiler with context %} +{%- from 'modules/metrics/tucon.jst' import tucon with context %} {%- macro collect_somatic_metrics(pair, normal_bam, tumor_bam, results_dir, taskPrefix, aligner, variant_caller) %} @@ -40,29 +42,48 @@ {% set flags.vep = true %} {% endif %} -{% if tasks[taskPrefix+"_somatic_sample_metric_tgen_mutation_burden"]|default(true) %} - {% if flags.bcftools %} - {# If pair.normal.pathToBam is defined then we have a tumor_only pair #} - {% if pair.normal.pathToBam is defined %} - {% set final_vcf_prefix %}{{ vcf_prefix }}.db.flt{% endset %} - {% else %} - {% set final_vcf_prefix %}{{ vcf_prefix }}.db{% endset %} - {% endif %} +{% if flags.bcftools %} + {# If pair.normal.pathToBam is defined then we have a tumor_only pair #} + {% if pair.normal.pathToBam is defined %} + {% set final_vcf_prefix %}{{ vcf_prefix }}.db.flt{% endset %} {% else %} - {% set 
final_vcf_prefix %}{{ vcf_prefix }}{% endset %} + {% set final_vcf_prefix %}{{ vcf_prefix }}.db{% endset %} {% endif %} - {% if flags.vep %} +{% else %} + {% set final_vcf_prefix %}{{ vcf_prefix }}{% endset %} +{% endif %} +{% if flags.vep %} + {% if tasks[taskPrefix+"_somatic_sample_metric_tgen_mutation_burden"]|default(true) %} {% set final_vcf %}{{ final_vcf_prefix }}.vep.pick.vcf.gz{% endset %} {{- mutation_burden(pair, normal_bam, tumor_bam, final_vcf, variant_caller, aligner, 'vep') }} {% endif %} - {% if flags.snpeff %} + {% if tasks[taskPrefix+"_somatic_sample_metric_tucon"]|default(true) %} + {% if tasks[taskPrefix+"_somatic_annotate_vcfs_bcftools_topmed"]|default(true) %} + {% set final_vcf %}{{ final_vcf_prefix }}.vep.full.vcf.gz{% endset %} + {{- tucon(pair, final_vcf, 'vep') }} + {% endif %} + {% endif %} +{% endif %} +{% if flags.snpeff %} + {% if tasks[taskPrefix+"_somatic_sample_metric_tgen_mutation_burden"]|default(true) %} {% set final_vcf %}{{ final_vcf_prefix }}.snpeff.can.vcf.gz{% endset %} {{- mutation_burden(pair, normal_bam, tumor_bam, final_vcf, variant_caller, aligner, 'snpeff') }} {% endif %} + {% if tasks[taskPrefix+"_somatic_sample_metric_tucon"]|default(true) %} + {% if tasks[taskPrefix+"_somatic_annotate_vcfs_bcftools_topmed"]|default(true) %} + {% set final_vcf %}{{ final_vcf_prefix }}.snpeff.full.vcf.gz{% endset %} + {{- tucon(pair, final_vcf, 'snpeff') }} + {% endif %} + {% endif %} {% endif %} {# {% if tasks[taskPrefix+"_somatic_sample_metric_msisensor_pro"]|default(true) %} {{- msisensor_pro(pair, normal_bam, tumor_bam, aligner) }} {% endif %} #} + +{% if tasks[taskPrefix+"_somatic_sample_metric_sigprofiler"]|default(true) %} + {{- sigprofiler(pair, vcf_prefix, variant_caller, aligner) }} +{% endif %} + {% endmacro %} diff --git a/modules/metrics/mutation_burden.jst b/modules/metrics/mutation_burden.jst index b2c48d6e..bbc9e47a 100644 --- a/modules/metrics/mutation_burden.jst +++ b/modules/metrics/mutation_burden.jst @@ -8,6 +8,14 @@ {% set mutation_burden_output %}{{ results_dir }}/{{ pair.name }}.{{ annotate_flag }}.mutation_burden.txt{% endset %} {% set mutation_burden_json %}{{ results_dir }}/{{ pair.name }}.{{ annotate_flag }}.mutation_burden.json{% endset %} +{% if pair.callers is defined and pair.callers|length > 1 %} + {% if pair.callers | length > 3 %} + {% set CC_filter %}INFO/CC>={{ pair.callers | length - 2 }}{% endset %} + {% else %} + {% set CC_filter %}INFO/CC>={{ pair.callers | length - 1 }}{% endset %} + {% endif %} +{% endif %} + - name: tgen_mutation_burden_{{ variant_caller }}_{{ pair.name }}_{{ aligner }}_{{ annotate_flag }} tags: input: @@ -53,7 +61,7 @@ {% endif %} bcftools filter \ - --include 'INFO/CC>=3 && ({{ ns_list|join(' || ') }})' \ + --include '{{ CC_filter }} && ({{ ns_list|join(' || ') }})' \ --output {{ mutation_burden_temp }}/{{ input_vcf | basename }} \ --output-type z \ {{ input_vcf }} diff --git a/modules/metrics/sigprofiler.jst b/modules/metrics/sigprofiler.jst new file mode 100644 index 00000000..0b182efa --- /dev/null +++ b/modules/metrics/sigprofiler.jst @@ -0,0 +1,101 @@ +{% from 'utilities/variant_filtering.jst' import filter_variants with context %} + +{% macro sigprofiler(pair, vcf_prefix, variant_caller, aligner) %} + +{% set temp_dir %}temp/{{ pair.gltype }}/metrics/sigprofiler/{{ pair.name }}{% endset %} +{% set temp_vcf_dir %}{{ temp_dir }}/vcfs{% endset %} +{% set results_dir %}{{ pair.gltype }}/metrics/sigprofiler/{{ pair.name }}{% endset %} +{% set all_vcf %}{{ vcf_prefix }}.vcf.gz{% endset %} +{% set 
filt_vcf %}{{ temp_vcf_dir }}/{{ vcf_prefix|basename }}.pass.vcf{% endset %} + +{% set task %}sigprofiler_{{ variant_caller }}_{{ pair.name }}_{{ aligner }}{% endset %} +{{- filter_variants(pair, all_vcf, temp_vcf_dir, filt_vcf, task) }} + +- name: sigprofiler_{{ variant_caller }}_{{ pair.name }}_{{ aligner }} + tags: + input: + - {{ filt_vcf }} + output: + - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activities_refit.txt + - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activity_Plots_refit.pdf + - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_TMB_plot_refit.pdf + - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt + - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS96_De-Novo_Signatures.txt + - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS_96_plots_SBS96_De-Novo.pdf + - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activities_refit.txt + - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activity_Plots_refit.pdf + - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_TMB_plot_refit.pdf + - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt + - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID83_De-Novo_Signatures.txt + - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID_83_plots_ID83_De-Novo.pdf + - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activities_refit.txt + - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activity_Plots_refit.pdf + - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_TMB_plot_refit.pdf + - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt + - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS78_De-Novo_Signatures.txt + - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS_78_plots_DBS78_De-Novo.pdf + - {{ results_dir }}/extraneous_results.tar + walltime: "24:00:00" + cpus: 10 + mem: 8G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.sigprofiler.module }} + module load {{ constants.tools.bcftools.module }} + + {# Remove previously generated matrix #} + rm -r {{ temp_dir }}/vcfs/*/ || true + + {# Should already exist #} + mkdir -p {{ temp_dir }}/vcfs + mkdir -p {{ results_dir }} + + python3 ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.sigprofiler }} \ + --threads 10 \ + --vcfpath {{ temp_dir }}/vcfs \ + --output {{ temp_dir }} \ + --project {{ pair.name }} \ + --genome dog \ + --extract_only + + {# It's possible for no signatures to be found, so we need an if statement to check for output #} + if [ "$(ls -A {{ temp_dir }}/vcfs/output)" ]; then + {# Remove true temp files #} + rm -r {{ temp_dir }}/JOB_METADATA.txt {{ temp_dir }}/Seeds.txt + + {# Prepare the known useful output #} + {# rsync -R copies the relative path after the /./ section #} + if [ -d "{{ temp_dir }}/SBS96/Suggested_Solution" ]; then + rsync -R --remove-source-files {{ temp_dir 
}}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activities_refit.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activity_Plots_refit.pdf {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_TMB_plot_refit.pdf {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS96_De-Novo_Signatures.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS_96_plots_SBS96_De-Novo.pdf {{ results_dir }} + fi + if [ -d "{{ temp_dir }}/ID83/Suggested_Solution" ]; then + rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activities_refit.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activity_Plots_refit.pdf {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_TMB_plot_refit.pdf {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID83_De-Novo_Signatures.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID_83_plots_ID83_De-Novo.pdf {{ results_dir }} + fi + if [ -d "{{ temp_dir }}/DBS78/Suggested_Solution" ]; then + rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activities_refit.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activity_Plots_refit.pdf {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_TMB_plot_refit.pdf {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS78_De-Novo_Signatures.txt {{ results_dir }} + rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS_78_plots_DBS78_De-Novo.pdf {{ results_dir }} + fi + + {# tar the files that are not known to be immediately important #} + tar -cvf {{ results_dir }}/extraneous_results.tar {{ temp_dir }}/* + else + echo "No signatures found" + fi + +{% endmacro %} diff --git a/modules/metrics/thred.jst b/modules/metrics/thred.jst new file mode 100644 index 00000000..54f4a1f4 --- /dev/null +++ b/modules/metrics/thred.jst @@ -0,0 +1,53 @@ +{% macro thred(sample_or_pair, input_seg) %} + +{% set results_dir %}{{ sample_or_pair.gltype }}/metrics/thred/{{ sample_or_pair.name }}{% endset %} +{% set temp_dir %}temp/{{ sample_or_pair.gltype 
}}/metrics/thred/{{ sample_or_pair.name }}{% endset %} + +- name: thred_{{ sample_or_pair.name }} + tags: [{{ sample_or_pair.gltype }}, HRD, {{ sample_or_pair.name }}] + input: + - {{ input_seg }} + output: + - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_scores.txt + - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_flt_segments.txt + - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_ori_segments.txt + - {{ results_dir }}/{{ sample_or_pair.name }}_excluded90_hrd_excluded_segments.txt + - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_captured_genome_territory.txt + - {{ results_dir }}/{{ sample_or_pair.name }}_original_segments_karyoplot_1.png + - {{ results_dir }}/{{ sample_or_pair.name }}_original_segments_karyoplot_2.png + - {{ results_dir }}/{{ sample_or_pair.name }}_segments_filtered_karyoplot_1.png + - {{ results_dir }}/{{ sample_or_pair.name }}_segments_filtered_karyoplot_2.png + - {{ results_dir }}/{{ sample_or_pair.name }}_segments_excluded_karyoplot_1.png + - {{ results_dir }}/{{ sample_or_pair.name }}_segments_excluded_karyoplot_2.png + cpus: 1 + mem: 2G + walltime: "4:00:00" + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.python_3_7_2.module }} + module load {{ constants.tools.thred.module }} + + mkdir -p {{ results_dir }} + + {# Purge any existing run files prior to starting #} + rm -r "{{ temp_dir }}" || true + mkdir -p "{{ temp_dir }}" + + sed 's/$/\tcentromere/' {{ constants.coyote.centromere }} \ + > {{ temp_dir }}/genomic_regions.bed + + tHReD.py \ + --genomic-regions {{ temp_dir }}/genomic_regions.bed \ + --seg {{ input_seg }} \ + --sample {{ sample_or_pair.name }} \ + --outfile {{ sample_or_pair.name }} \ + --dir-out {{ results_dir }} \ + --th-log2r -0.1613 \ + --minsize 1000000 \ + --th-pct-overlapping 0.90 \ + --plots \ + --exclude-contigs "X" + +{% endmacro %} diff --git a/modules/metrics/tucon.jst b/modules/metrics/tucon.jst new file mode 100644 index 00000000..4e1c5e4e --- /dev/null +++ b/modules/metrics/tucon.jst @@ -0,0 +1,37 @@ +{% from 'utilities/variant_filtering.jst' import filter_variants with context %} + +{% macro tucon(pair, input_vcf, annotate_flag) %} + +{% set temp_dir %}temp/{{ pair.gltype }}/metrics/tucon/{{ pair.name }}{% endset %} +{% set results_dir %}{{ pair.gltype }}/metrics/tucon/{{ pair.name }}{% endset %} +{% set tucon_output %}{{ results_dir }}/{{ pair.name }}_{{ annotate_flag }}_tucon.tsv{% endset %} +{% set filtered_vcf %}{{ temp_dir }}/{{ pair.name }}_{{ annotate_flag }}.flt.vcf.gz{% endset %} + +{% set task %}tucon_{{ pair.name }}_{{ annotate_flag }}{% endset %} +{{- filter_variants(pair, input_vcf, temp_dir, filtered_vcf, task) }} + +- name: tucon_{{ pair.name }}_{{ annotate_flag }} + tags: [tucon, {{ annotate_flag }}] + input: + - {{ filtered_vcf }} + output: + - {{ tucon_output }} + walltime: "12:00:00" + cpus: 1 + mem: 2G + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.bcftools.module }} + + mkdir -p {{ results_dir }} + + echo -e "VCF\tMUTATION_COUNT\tTOPMED_COUNT" > {{ tucon_output }} + + VCF_BASE=$(basename {{ filtered_vcf }}) + MUT_COUNT=$(bcftools view -H {{ filtered_vcf }} | wc -l) + EVA_COUNT=$(bcftools view -H -i 'INFO/GCA_2285.2=1' {{ filtered_vcf }} | wc -l) + echo -e "${VCF_BASE}\t${MUT_COUNT}\t${EVA_COUNT}" >> {{ tucon_output }} + +{% endmacro %} diff --git a/modules/qc/stats2lims.jst b/modules/qc/stats2lims.jst index 8651309a..74836a3a 100644 --- a/modules/qc/stats2lims.jst +++ b/modules/qc/stats2lims.jst @@ -3,7 +3,7 @@ # to upload to LIMS via REST api integration. 
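The consensus-count threshold introduced in `mutation_burden.jst` above replaces the hard-coded `INFO/CC>=3` with a value derived from how many callers contributed to the merged VCF (the template only sets it when more than one caller is present). A small worked example of that arithmetic, outside the template:

```python
# Worked example of the CC_filter arithmetic from mutation_burden.jst:
# with more than three callers, require support from all but two of them;
# otherwise require support from all but one.
def cc_threshold(n_callers: int) -> int:
    return n_callers - 2 if n_callers > 3 else n_callers - 1

assert cc_threshold(5) == 3  # e.g. lancet, mutect2, octopus, strelka2, vardict
assert cc_threshold(4) == 2
assert cc_threshold(3) == 2
assert cc_threshold(2) == 1
```

So the usual five-caller merged VCFs keep the previous `CC>=3` behaviour, while pairs with fewer contributing callers are no longer filtered away entirely.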
{% macro stats2lims(tag1, tag2, task, input_file, file_type) %} -{% if submissionSource == "TGenLIMS" %} +{% if submissionSource == "TGenLIMS" and file_type != "samtools_idxstats" %} - name: stats2lims_{{ task }}_{{ file_type }} tags: [{{ tag1 }}, quality_control, stats, stats2lims, {{ tag2 }}] @@ -28,4 +28,4 @@ {{ study }} {% endif %} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/modules/rna/salmon.jst b/modules/rna/salmon.jst index 29d88815..85c8003f 100755 --- a/modules/rna/salmon.jst +++ b/modules/rna/salmon.jst @@ -9,10 +9,10 @@ {% set r2fqlist = [] %} {% for rgid, rg in sample.read_groups.items() %} {% for fq in rg.data_files|selectattr('fastqCode', 'eq', 'R1') %} - {% do r1fqlist.append('"temp/fastqs/' + (fq.basename) + '"') %} + {% do r1fqlist.append(fq) %} {% endfor %} {% for fq in rg.data_files|selectattr('fastqCode', 'eq', 'R2') %} - {% do r2fqlist.append('"temp/fastqs/' + (fq.basename) + '"') %} + {% do r2fqlist.append(fq) %} {% endfor %} {% endfor %} @@ -24,10 +24,10 @@ reset: predecessors input: {% for fq in r1fqlist %} - - {{ fq }} + - temp/fastqs/{{ fq.basename }} {% endfor %} {% for fq in r2fqlist %} - - {{ fq }} + - temp/fastqs/{{ fq.basename }} {% endfor %} output: - {{ results_dir }}/{{ sample.name }}.transcripts.sf @@ -41,6 +41,14 @@ set -o pipefail module load {{ constants.tools.salmon.module }} + {% if r1fqlist[0].fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. + #} mkdir -p "{{ temp_dir }}" mkdir -p "{{ results_dir }}" @@ -52,9 +60,9 @@ --libType "{{ constants.coyote.strandedness_options[sample.strandedness].salmon }}" \ --index "{{ constants.coyote.salmon_index }}" \ --geneMap "{{ constants.coyote.gtf }}" \ - -1 {{ r1fqlist|join(' ') }} \ + -1 {% for fq in r1fqlist %}"temp/fastqs/{{ fq.basename }}"{% if not loop.last%} {% endif %}{% endfor %} \ {% if r2fqlist %} - -2 {{ r2fqlist|join(' ') }} \ + -2 {% for fq in r2fqlist %}"temp/fastqs/{{ fq.basename }}"{% if not loop.last%} {% endif %}{% endfor %} \ {% endif %} --output "{{ temp_dir }}" @@ -87,4 +95,4 @@ {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/modules/rna/star_fusion.jst b/modules/rna/star_fusion.jst index 5fb63e56..a9cda339 100755 --- a/modules/rna/star_fusion.jst +++ b/modules/rna/star_fusion.jst @@ -40,6 +40,14 @@ set -eu set -o pipefail + {% if r1fqlist[0].fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. + #} module load {{ constants.tools.star_fusion.module }} rm -r {{ temp_dir }} || true @@ -115,6 +123,14 @@ mkdir -p {{ temp_dir }}/_starF_checkpoints mkdir -p {{ temp_dir }}/star-fusion.preliminary + {% if r1fqlist[0].fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. 
+ #} module load {{ constants.tools.samtools.module }} module load {{ constants.tools.star_fusion.module }} sf_path=$(dirname `which STAR-Fusion`) diff --git a/modules/rna/star_quant.jst b/modules/rna/star_quant.jst index 039c68cb..3e952243 100755 --- a/modules/rna/star_quant.jst +++ b/modules/rna/star_quant.jst @@ -66,6 +66,14 @@ set -o pipefail module load {{ constants.tools.star.module }} + {% if r1fqlist[0].fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + + {% endif %} + {# + This comment is here for protect render spacing, do not remove. + #} mkdir -p "{{ temp_dir }}" mkdir -p "{{ results_dir }}/stats" mkdir -p "{{ star_finalcountsdir }}" @@ -338,7 +346,8 @@ which appears to be working now after some code changes, keeping this here in ca {% set task %}markduplicates_star_gatk_{{ sample.name }}{% endset %} {% do task_list.append(task) %} - {{- remove_files(temp_dir,none,task_list) }} + {% set task_name %}{{ sample.name }}_star_quant{% endset %} + {{- remove_files(temp_dir, none, task_list, task_name) }} {% endmacro %} {% macro htseq(sample) %} @@ -464,8 +473,8 @@ which appears to be working now after some code changes, keeping this here in ca --alignments \ --paired-end \ --no-bam-output \ - --strandedness "{{ constants.phoenix.strandedness_options[sample.strandedness].rsem }}" \ + --strandedness "{{ constants.coyote.strandedness_options[sample.strandedness].rsem }}" \ {{ star_bam_transcript }} \ - {{ constants.phoenix.rsem_index }} \ + {{ constants.coyote.rsem_index }} \ "{{ sample.gltype }}/quant/rsem/{{ sample.name }}/{{ sample.name }}" {% endmacro %} diff --git a/modules/single_cell/cellranger_count.jst b/modules/single_cell/cellranger_count.jst index d2793548..f9eac291 100755 --- a/modules/single_cell/cellranger_count.jst +++ b/modules/single_cell/cellranger_count.jst @@ -21,10 +21,6 @@ {% set directory %}{{ cellranger_out }}/analysis{% endset %} {% set samples_list = [] %} - {% for rgpu, _ in files|groupby('rgpu') %} - {% set sample_string %}{{ sample.name }}_{{ rgpu[:-2] }}{% endset %} - {% do samples_list.append(sample_string) %} - {% endfor %} - name: cellranger_count_{{ sample.name }} tags: [{{ sample.gltype }}, single_cell, cellranger_count, {{ sample.name }}] @@ -54,14 +50,21 @@ module load {{ constants.tools.cellranger.module }} module load {{ constants.tools.samtools.module }} + {% if sample.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + {% endif %} rm -r "{{ temp_dir }}" || true mkdir -p "{{ temp_dir }}" mkdir -p "{{ outdir }}" mkdir -p "{{ alignment_outdir }}/stats" - + {% for fq in files %} - ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename }}" + {% set fq_name %}{{ fq.name }}_{{ fq.rgpu[:-2] }}_{{ fq.rgbc }}{% endset %} + {% set fq_new %}{{ fq_name }}_S1{% endset %} + {% do samples_list.append(fq_name) %} + ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename | replace(fq_name, fq_new) }}" {% endfor %} {# @@ -72,6 +75,7 @@ --localcores 20 \ --localmem 160 \ --chemistry "{{ constants.coyote.scrna_chemistry_options[sample.assayCode].chemistry_name }}" \ + --expect-cells {{ sample.expectedCells | default(3000) }} \ --id "{{ sample.name }}" \ --fastqs . 
\ --sample "{{ samples_list|join(',') }}" \ diff --git a/modules/single_cell/cellranger_vdj.jst b/modules/single_cell/cellranger_vdj.jst index 5bfbfbec..4c205095 100644 --- a/modules/single_cell/cellranger_vdj.jst +++ b/modules/single_cell/cellranger_vdj.jst @@ -16,10 +16,6 @@ {% set json %}{{ outdir }}/stats/{{ sample.name }}.cellranger_vdj.bam.metrics_summary.json{% endset %} {% set samples_list = [] %} -{% for rgpu, _ in files|groupby('rgpu') %} - {% set sample_string %}{{ sample.name }}_{{ rgpu[:-2] }}{% endset %} - {% do samples_list.append(sample_string) %} -{% endfor %} - name: cellranger_vdj_{{ sample.name }} tags: [{{ sample.gltype }}, single_cell, cellranger_vdj, {{ sample.name }}] @@ -47,18 +43,21 @@ - {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta - {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta.fai - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.csv - - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.json - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam.bai - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta.fai - - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fastq - {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig_annotations.csv - {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fasta - {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fastq - {{ metrics_output }} - {{ outdir }}/{{ sample.name }}.cellranger_vdj.vloupe - {{ outdir }}/stats/{{ sample.name }}.cellranger_vdj.bam.web_summary.html + - {{ outdir }}/{{ sample.name }}.cellranger_vdj.airr_rearrangement.tsv + - {{ outdir }}/{{ sample.name }}.cellranger_vdj.vdj_contig_info.pb + - {{ outdir }}/{{ sample.name }}.cellranger_vdj.donor_regions.fa + - {{ outdir }}/{{ sample.name }}.cellranger_vdj.regions.fa + - {{ outdir }}/{{ sample.name }}.cellranger_vdj.reference.json cpus: 20 mem: 80G walltime: "48:00:00" @@ -67,13 +66,20 @@ set -o pipefail module load {{ constants.tools.cellranger.module }} + {% if sample.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + {% endif %} rm -r "{{ temp_dir }}" || true mkdir -p "{{ temp_dir }}" mkdir -p "{{ outdir }}/stats" {% for fq in files %} - ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename }}" + {% set fq_name %}{{ fq.name }}_{{ fq.rgpu[:-2] }}_{{ fq.rgbc }}{% endset %} + {% set fq_new %}{{ fq_name }}_S1{% endset %} + {% do samples_list.append(fq_name) %} + ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename | replace(fq_name, fq_new) }}" {% endfor %} {# @@ -103,18 +109,21 @@ mv {{ cellranger_out }}/concat_ref.fasta {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta mv {{ cellranger_out }}/concat_ref.fasta.fai {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta.fai mv {{ cellranger_out }}/consensus_annotations.csv {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.csv - mv {{ cellranger_out }}/consensus_annotations.json {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.json mv {{ cellranger_out }}/consensus.bam {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam mv {{ cellranger_out }}/consensus.bam.bai {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam.bai mv {{ cellranger_out }}/consensus.fasta {{ outdir 
}}/{{ sample.name }}.cellranger_vdj.consensus.fasta mv {{ cellranger_out }}/consensus.fasta.fai {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta.fai - mv {{ cellranger_out }}/consensus.fastq {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fastq mv {{ cellranger_out }}/filtered_contig_annotations.csv {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig_annotations.csv mv {{ cellranger_out }}/filtered_contig.fasta {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fasta mv {{ cellranger_out }}/filtered_contig.fastq {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fastq mv {{ cellranger_out }}/metrics_summary.csv {{ metrics_output }} mv {{ cellranger_out }}/vloupe.vloupe {{ outdir }}/{{ sample.name }}.cellranger_vdj.vloupe mv {{ cellranger_out }}/web_summary.html {{ outdir }}/stats/{{ sample.name }}.cellranger_vdj.bam.web_summary.html + mv {{ cellranger_out }}/airr_rearrangement.tsv {{ outdir }}/{{ sample.name }}.cellranger_vdj.airr_rearrangement.tsv + mv {{ cellranger_out }}/vdj_contig_info.pb {{ outdir }}/{{ sample.name }}.cellranger_vdj.vdj_contig_info.pb + mv {{ cellranger_out }}/vdj_reference/fasta/donor_regions.fa {{ outdir }}/{{ sample.name }}.cellranger_vdj.donor_regions.fa + mv {{ cellranger_out }}/vdj_reference/fasta/regions.fa {{ outdir }}/{{ sample.name }}.cellranger_vdj.regions.fa + mv {{ cellranger_out }}/vdj_reference/reference.json {{ outdir }}/{{ sample.name }}.cellranger_vdj.reference.json {{- stats2json(sample.gltype, sample.name, task, metrics_output, json, "cellranger_vdj_metrics", sample_name=sample.name, library_name=sample.rglb) }} diff --git a/modules/single_cell/main.jst b/modules/single_cell/main.jst index b6697e95..acad01f8 100755 --- a/modules/single_cell/main.jst +++ b/modules/single_cell/main.jst @@ -15,6 +15,10 @@ {% set rnaStrandType=file.rnaStrandType|default('unstranded')|lower %} {% set rnaStrandDirection=file.rnaStrandDirection|default('notapplicable')|lower %} {% set strandedness %}{{ readOrientation }}-{{ rnaStrandType }}-{{ rnaStrandDirection }}{% endset %} + {% if file.quantity is defined and file.quantitySource is defined and file.quantitySource|lower == 'cells' %} + {% set expectedCells=file.quantity %} + {% do file.update({'expectedCells': expectedCells}) %} + {% endif %} {% do file.update({'strandedness': strandedness}) %} {% endfor %} diff --git a/modules/single_cell/starsolo.jst b/modules/single_cell/starsolo.jst index a56e6d1a..cafcc7a8 100755 --- a/modules/single_cell/starsolo.jst +++ b/modules/single_cell/starsolo.jst @@ -53,6 +53,10 @@ module load {{ constants.tools.star.module }} module load {{ constants.tools.samtools.module }} + {% if sample.fileType == "fasterq" %} + export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}" + module load {{ constants.tools.petagene.module }} + {% endif %} rm -r "{{ temp_dir }}" || true mkdir -p "{{ temp_dir }}" diff --git a/modules/somatic/gatk_cnv.jst b/modules/somatic/gatk_cnv.jst index 24e376eb..70d4bcf2 100644 --- a/modules/somatic/gatk_cnv.jst +++ b/modules/somatic/gatk_cnv.jst @@ -1,5 +1,6 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} {% from 'modules/annotation/annot_seg.jst' import annot_seg with context %} +{% from 'modules/metrics/thred.jst' import thred with context %} # The macros found in this template are used to generate the gatk4 somatic cnv workflow. 
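The single-cell changes above thread an expected cell count from the sample metadata into `cellranger count`: a `quantity` is only treated as a cell count when `quantitySource` says `cells`, and the count task otherwise falls back to 3000. A minimal sketch of that decision, with field names mirroring the file dictionaries the templates iterate over:

```python
# Sketch of the expectedCells plumbing: only trust file.quantity when its
# quantitySource is 'cells'; cellranger_count applies default(3000) otherwise.
def expected_cells(file_record: dict, default: int = 3000) -> int:
    quantity = file_record.get("quantity")
    source = str(file_record.get("quantitySource", "")).lower()
    if quantity is not None and source == "cells":
        return quantity
    return default

assert expected_cells({"quantity": 8000, "quantitySource": "cells"}) == 8000
assert expected_cells({"quantity": 8000, "quantitySource": "ng"}) == 3000
assert expected_cells({}) == 3000
```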
{% macro gatk_cnv(pair, aligner='bwa') %} @@ -716,6 +717,7 @@ {% set final_seg %}{{ results_dir }}/{{ pair.name }}.{{ aligner }}.re_centered.cr.igv.seg{% endset %} {{- annot_seg(final_seg, pair.name) }} + {{- thred(pair, final_seg)}} {% set task %}gatk_call_cnv_step4_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} diff --git a/modules/somatic/lancet.jst b/modules/somatic/lancet.jst index 0fe0681a..9cc9d8ae 100755 --- a/modules/somatic/lancet.jst +++ b/modules/somatic/lancet.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro lancet(pair, aligner='bwa') %} {% do pair.callers.append('lancet') %} @@ -137,4 +138,7 @@ bcftools index --tbi --force "{{ pass_vcf }}" +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} + {% endmacro %} diff --git a/modules/somatic/mutect2.jst b/modules/somatic/mutect2.jst index 62d404e9..605aa49d 100755 --- a/modules/somatic/mutect2.jst +++ b/modules/somatic/mutect2.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro mutect2(pair, aligner='bwa') %} {% do pair.callers.append('mutect2') %} @@ -339,8 +340,10 @@ bcftools index --tbi --force "{{ pass_vcf }}" - {% set task %}mutect2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %} + {% set task %}mutect2_filter_calls_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} + {{- vcf_stats(pass_vcf, results_dir) }} + {{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/somatic/octopus.jst b/modules/somatic/octopus.jst index 5275b5d6..f1b78a76 100644 --- a/modules/somatic/octopus.jst +++ b/modules/somatic/octopus.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro octopus_somatic(pair, aligner='bwa') %} {% do pair.callers.append('octopus') %} @@ -27,7 +28,7 @@ set -o pipefail module load {{ constants.tools.octopus.module }} - + mkdir -p "{{ temp_dir }}" {# Write out the regions in this batch to a bed file #} @@ -37,7 +38,7 @@ {{ interval.contig }}${TAB}{{ interval.start - 1 }}${TAB}{{ interval.stop }} {% endfor %} EOF - + {# Somatic calling with octopus #} octopus \ --caller cancer \ @@ -58,7 +59,7 @@ --temp-directory-prefix "{{ loop.index }}" \ --bamout "{{ loop.index }}.realigned.bam" \ --output "{{ loop.index }}.octopus.vcf" - + {% endfor %} @@ -128,12 +129,13 @@ {% endif %} "{{ all_vcf }}" \ > "{{ pass_vcf }}" - + bcftools index --tbi --force "{{ pass_vcf }}" - {% set task %}octopus_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %} + {% set task %}octopus_merge_chunks_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} - + {{- vcf_stats(pass_vcf, results_dir) }} + {{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/somatic/strelka2.jst b/modules/somatic/strelka2.jst index 44a71900..563f8f16 100755 --- a/modules/somatic/strelka2.jst +++ b/modules/somatic/strelka2.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro strelka2_somatic(pair, aligner='bwa') %} {% do 
pair.callers.append('strelka2') %} @@ -12,8 +13,8 @@ - name: strelka2_{{ pair.name }}_{{ aligner }} tags: [{{ pair.gltype }}, somatic, snp_indel_caller, strelka2, {{ pair.name }}] - methods: > - Somatic variants for {{ pair.name }} ({{ aligner }}) were called with + methods: > + Somatic variants for {{ pair.name }} ({{ aligner }}) were called with {{ constants.tools.strelka.verbose }}. input: - {{ normal_bam }} @@ -166,6 +167,7 @@ {% set task %}strelka2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} - + {{- vcf_stats(pass_vcf, results_dir) }} + {{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/somatic/vardict.jst b/modules/somatic/vardict.jst index 77eeb3b4..580858b0 100755 --- a/modules/somatic/vardict.jst +++ b/modules/somatic/vardict.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro vardict_somatic(pair, aligner='bwa') %} {% do pair.callers.append('vardict') %} @@ -257,5 +258,7 @@ bcftools index --tbi --force "{{ pass_vcf }}" -{% endmacro %} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} +{% endmacro %} diff --git a/modules/somatic/vcfmerger2.jst b/modules/somatic/vcfmerger2.jst index 0526ab3c..0d97d6f3 100755 --- a/modules/somatic/vcfmerger2.jst +++ b/modules/somatic/vcfmerger2.jst @@ -3,6 +3,7 @@ {% from 'modules/somatic/rna_variant_check.jst' import add_rna_header_to_vcf with context %} {% from 'modules/somatic/rna_variant_check.jst' import add_matched_rna with context %} {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro vcfmerger2(pair, aligner='bwa', taskPrefix='Genome') %} @@ -155,5 +156,6 @@ {{- add_rna_header_to_vcf(merged_vcf_gz, output_vcf, pair, aligner , temp_dir) }} {% endif %} {{- annotate_vcfs(pair, temp_dir, results_dir, output_vcf, taskPrefix, aligner, 'merged', 'somatic', 'snp_indel_caller') }} + {{- vcf_stats(output_vcf, results_dir) }} {{- collect_somatic_metrics(pair, normal_bam, tumor_bam, results_dir, taskPrefix, aligner, 'merged') }} {% endmacro %} diff --git a/modules/tumor_only/deepvariant.jst b/modules/tumor_only/deepvariant.jst index 3f32e777..b12106ce 100755 --- a/modules/tumor_only/deepvariant.jst +++ b/modules/tumor_only/deepvariant.jst @@ -1,4 +1,5 @@ {% from 'modules/annotation/main.jst' import annotate_vcfs with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {%- macro deepvariant_tumor_only(sample, aligner='bwa', taskPrefix='Genome') %} @@ -153,5 +154,7 @@ {# Skipping annotate vcfs step until requested #} {# annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'deepvariant', 'constitutional', 'snp_indel_caller') #} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/tumor_only/lancet.jst b/modules/tumor_only/lancet.jst index 96fba4d0..a51b949d 100755 --- a/modules/tumor_only/lancet.jst +++ b/modules/tumor_only/lancet.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro lancet_tumor_only(pair, aligner='bwa') %} {% do pair.callers.append('lancet') %} @@ -139,4 +140,7 @@ bcftools index --tbi --force "{{ pass_vcf }}" +{{- vcf_stats(pass_vcf, results_dir) 
}} +{{- vcf_stats(all_vcf, results_dir) }} + {% endmacro %} diff --git a/modules/tumor_only/mm_igtx_calling.jst b/modules/tumor_only/mm_igtx_calling.jst index a5c969cb..dda43167 100644 --- a/modules/tumor_only/mm_igtx_calling.jst +++ b/modules/tumor_only/mm_igtx_calling.jst @@ -5,13 +5,6 @@ {% set tumor_bam %}{{ tumor.gltype }}/alignment/{{ aligner }}/{{ tumor.name }}/{{ tumor.name }}.{{ aligner }}.bam{% endset %} {% set results_dir %}{{ tumor.gltype }}/tumor_only_structural_calls/mm_igtx_pairoscope/{{ tumor.name }}{% endset %} -{% if controlDataFiles is defined %} -{% set toAssayCodes = [] %} -{% for file in controlDataFiles %} - {% do toAssayCodes.append(file.assayCode) %} -{% endfor %} -{% endif %} - - name: mm_igtx_pairoscope_{{ tumor.name }}_{{ aligner }} tags: [{{ tumor.gltype }}, structural_caller, pairoscope, {{ tumor.name }}] tags: [pairoscope, mm_igtx_calling, {{ tumor.gltype }}] @@ -63,18 +56,14 @@ echo $'Specimen\tChrA\tPositionA\tChrB\tPositionB' > {{ results_dir }}/{{ tumor.name }}.{{ aligner }}_pairoscope_igtx_discordantTable.txt fi - {% if controlDataFiles is defined and tumor.assayCode in toAssayCodes %} - python ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.pairoscope_mm_igtx_calling_tumor_only_script }} \ - {% else %} python ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.pairoscope_mm_igtx_calling_script }} \ - {% endif %} --input_file {{ results_dir }}/{{ tumor.name }}.{{ aligner }}_pairoscope_igtx_discordantTable.txt \ --specimen {{ tumor.rgsm }} \ --output_file {{ results_dir }}/{{ tumor.name }}.{{ aligner }}_pairoscope_igtx_calls.txt \ --window 2000 \ --window_min 100 \ - {% if controlDataFiles is defined and tumor.assayCode in toAssayCodes %} - --call_requirement 5 + {% if pairoscope_call_requirement is defined %} + --call_requirement {{ pairoscope_call_requirement }} {% else %} --call_requirement 4 {% endif %} @@ -188,6 +177,8 @@ bwa mem \ -t 4 \ + -Y \ + -K 100000000 \ -o {{ temp_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/trinity/trinbwafile.sam \ {{ constants.coyote.bwa_index }} \ {{ temp_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/trinity/Trinity.fasta @@ -205,6 +196,8 @@ bwa index {{ results_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/Trinity.fasta bwa mem \ -t 4 \ + -Y \ + -K 100000000 \ -o {{ temp_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/trinity/ReadstoContigs.sam \ {{ results_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/Trinity.fasta \ ${leftfq} ${rightfq} @@ -538,4 +531,4 @@ -c 2 \ -s {{ sample.name }} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/modules/tumor_only/mutect2.jst b/modules/tumor_only/mutect2.jst index 45f2004b..1c0f9659 100755 --- a/modules/tumor_only/mutect2.jst +++ b/modules/tumor_only/mutect2.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro mutect2_tumor_only(pair, aligner='bwa') %} {% do pair.callers.append('mutect2') %} @@ -351,5 +352,7 @@ {% set task %}tumor_only_mutect2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} + {{- vcf_stats(pass_vcf, results_dir) }} + {{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/tumor_only/octopus.jst b/modules/tumor_only/octopus.jst index b61a472e..93865bbc 100644 --- a/modules/tumor_only/octopus.jst +++ b/modules/tumor_only/octopus.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import 
remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro octopus_tumor_only(pair, aligner='bwa') %} {% do pair.callers.append('octopus') %} @@ -167,6 +168,7 @@ {% set task %}tumor_only_octopus_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} - + {{- vcf_stats(pass_vcf, results_dir) }} + {{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/tumor_only/rna_variant_check.jst b/modules/tumor_only/rna_variant_check.jst index 48befbb3..1b391f8b 100644 --- a/modules/tumor_only/rna_variant_check.jst +++ b/modules/tumor_only/rna_variant_check.jst @@ -61,7 +61,7 @@ - {{ output_vcf }} - {{ output_vcf }}.csi - {{ output_vcf }}.tbi - walltime: "8:00:00" + walltime: "24:00:00" cpus: 2 mem: 4G cmd: | @@ -258,4 +258,4 @@ --force \ {{ output_vcf }} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/modules/tumor_only/strelka2.jst b/modules/tumor_only/strelka2.jst index 9551b731..694860ff 100755 --- a/modules/tumor_only/strelka2.jst +++ b/modules/tumor_only/strelka2.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro strelka2_tumor_only(pair, aligner='bwa') %} {% do pair.callers.append('strelka2') %} @@ -168,6 +169,7 @@ {% set task %}tumor_only_strelka2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %} {% set directory %}{{ temp_dir }}{% endset %} {{- remove_files(directory,none,task) }} - + {{- vcf_stats(pass_vcf, results_dir) }} + {{- vcf_stats(all_vcf, results_dir) }} {% endmacro %} diff --git a/modules/tumor_only/vardict.jst b/modules/tumor_only/vardict.jst index 8e39bfb7..ad8c5d7c 100755 --- a/modules/tumor_only/vardict.jst +++ b/modules/tumor_only/vardict.jst @@ -1,4 +1,5 @@ {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro vardict_tumor_only(pair, aligner='bwa') %} {% do pair.callers.append('vardict') %} @@ -263,5 +264,7 @@ bcftools index --tbi --force "{{ pass_vcf }}" -{% endmacro %} +{{- vcf_stats(pass_vcf, results_dir) }} +{{- vcf_stats(all_vcf, results_dir) }} +{% endmacro %} diff --git a/modules/tumor_only/vcfmerger2.jst b/modules/tumor_only/vcfmerger2.jst index 827f01e1..4b08b3ed 100755 --- a/modules/tumor_only/vcfmerger2.jst +++ b/modules/tumor_only/vcfmerger2.jst @@ -3,6 +3,7 @@ {% from 'modules/tumor_only/rna_variant_check.jst' import add_rna_header_to_vcf with context %} {% from 'modules/tumor_only/rna_variant_check.jst' import add_matched_rna with context %} {% from 'utilities/remove_files.jst' import remove_files with context %} +{% from 'utilities/vcf_stats.jst' import vcf_stats with context %} {% macro vcfmerger2_tumor_only(pair, aligner='bwa', taskPrefix='Genome') %} @@ -165,5 +166,6 @@ {{- add_rna_header_to_vcf(merged_vcf_gz, output_vcf, pair, aligner , temp_dir) }} {% endif %} {{- annotate_vcfs(pair, temp_dir, results_dir, output_vcf, taskPrefix, aligner, 'merged', 'tumor_only', 'snp_indel_caller') }} + {{- vcf_stats(output_vcf, results_dir) }} {{- collect_somatic_metrics(pair, normal_bam, tumor_bam, results_dir, taskPrefix, aligner, 'merged') }} {% endmacro %} diff --git a/pipeline.yaml b/pipeline.yaml index 3be119f7..133acf97 100644 --- a/pipeline.yaml +++ b/pipeline.yaml @@ -2,7 +2,7 @@ __pipeline__: name: coyote main: main.jst description: Canine (canfam3.1) genomics suite 
- version: v1.1.2 + version: v1.2.1 constants: tools: bedtools: @@ -37,10 +37,14 @@ constants: citation: > Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler Transform. Bioinformatics, 25:1754-60. [PMID: 19451168] + bwa_mem2: + module: bwa-mem2/2.2.1 + version: "2.2.1" + verbose: Burrows-Wheeler Aligner v2.2.1 cellranger: - module: cellranger/3.1.0 - version: "3.1.0" - verbose: 10X CellRanger v3.1.0 + module: cellranger/6.0.1 + version: "6.0.1" + verbose: 10X CellRanger v6.0.1 website: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger citation: > TODO @@ -73,6 +77,14 @@ constants: module: DLE/0.1.5 version: "0.1.5" verbose: Discordant Loci Extractor v0.1.5 + expansion_hunter: + module: ExpansionHunter/4.0.2 + version: "4.0.2" + verbose: Illumina ExpansionHunter v4.0.2 + citation: > + Egor Dolzhenko, Viraj Deshpande, Felix Schlesinger, Peter Krusche, Roman Petrovski, and others, + ExpansionHunter: A sequence-graph based tool to analyze variation in short tandem repeat regions, + Bioinformatics 2019 freebayes: module: freebayes/1.3.1-foss-2019a version: "1.3.1" @@ -126,7 +138,7 @@ constants: HTSeq — A Python framework to work with high-throughput sequencing data Bioinformatics (2014), in print, online at doi:10.1093/bioinformatics/btu638 ichorcna: - module: R/3.6.1-phoenix + module: ichorCNA/0.2.0-coyote launcher: required_scripts/runIchorCNA_47ce8db.R version: "0.2.0" verbose: ichorCNA v0.2.0 @@ -209,6 +221,10 @@ constants: website: https://github.com/genome/pairoscope citation: > TBD + petagene: + module: petagene/1.3.0p7 + version: 1.3.0p7 + PetaLinkMode: "+fastq" phaser: module: phASER/1.1.1-python-2.7.13 version: 1.1.1 @@ -239,6 +255,14 @@ constants: citation: > R Core Team (2017). R: A language and environment for statistical computing. R Foundation for Statistical Computing, Vienna, Austria. URL https://www.R-project.org/. + rsem: + module: RSEM/1.3.3 + version: "1.3.3" + verbose: rsem v1.3.3 + website: https://github.com/deweylab/RSEM + citation: > + Li, B., Dewey, C.N. RSEM: accurate transcript quantification from RNA-Seq data with or without a reference + genome. BMC Bioinformatics 12, 323 (2011). https://doi.org/10.1186/1471-2105-12-323 salmon: module: Salmon/1.2.1-gompi-2019a version: "1.2.1" @@ -268,6 +292,10 @@ constants: module: sequenza/3.0.0 version: "3.0.0" verbose: Sequenza v3.0.0 + sigprofiler: + module: SigProfiler/1.1 + version: "1.1" + verbose: SigProfiler v1.1 singularity: module: singularity/3.5.2-phoenix version: "3.5.2" @@ -326,9 +354,13 @@ constants: Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning sequence reads to genomic features. 
Bioinformatics, 30(7):923-30, 2014 tgen_mutation_burden: - module: MutationBurden/1.2.1 - version: "1.2.1" - verbose: TGen Mutation Burden Tool v1.2.1 + module: MutationBurden/1.2.3 + version: "1.2.3" + verbose: TGen Mutation Burden Tool v1.2.3 + thred: + module: tHReD/1.1.0 + version: "1.1.0" + verbose: tHReD v1.1.0 transparser: module: transParser/1.0.1 version: "1.0.1" @@ -398,44 +430,45 @@ constants: delly_svtop_delly_sv_annotation_parellel_script: svtop.delly.sv_annotation.parallel_8820499.py manta_prepare_sv_vcf: manta_prepare_sv_vcf_f94bcc1.py manta_gene_annotation_parallel: manta_sv_annotation_parallel_8820499.py - pairoscope_mm_igtx_calling_script: mm_igtx_pairoscope_calling_b38_f3010c3.py - pairoscope_mm_igtx_calling_tumor_only_script: mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py + pairoscope_mm_igtx_calling_script: mm_igtx_pairoscope_calling_b38_356362b.py cna_seg_extend: seg_extend_229b8c7.py plotCNVplus_Rscript: plotCNVplus_4d89cb4.R plotSamStats_Rscript: summarize_samstats_8c45d63.R - summarize_mm_igtx_Rscript: summarize_Ig_4b93aee.R - process_assembled_bam: Process_Assembled_BAM_eb25fca.py + summarize_mm_igtx_Rscript: summarize_Ig_875a823.R + process_assembled_bam: Process_Assembled_BAM_277eed7.py stats2json: samStats2json_3a90a2f.py stats2lims: uploadStats2Lims_1ace81f.py - annotSeg_script: annotSeg_8820499.pl - cellranger_reference: /home/tgenref/canis_familiaris/canfam3.1/tool_specific_resources/cellranger/refdata-cellranger-hg19-1.2.0 - cellranger_vdj_reference: /home/tgenref/canis_familiaris/canfam3.1/tool_specific_resources/cellranger/refdata-cellranger-hg19-1.2.0 + sigprofiler: sigprofiler_d78cc9e.py + annotSeg_script: annotSeg_7102f1c.pl + cellranger_reference: /home/tgenref/homo_sapiens/grch38_hg38/hg38tgen/gene_model/ensembl_v98/tool_resources/cellranger_6.0.0/hg38tgen_ensembl_v98 + cellranger_vdj_reference: /home/tgenref/homo_sapiens/grch38_hg38/tool_specific_resources/cellranger/refdata-cellranger-vdj-GRCh38-alts-ensembl-5.0.0 scrna_chemistry_options: X3SCR: chemistry_name: SC3Pv1 umi_length: 10 - cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-april-2014_rc.txt + cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-april-2014_rc.txt XCSCR: chemistry_name: SC3Pv2 umi_length: 10 - cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt + cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-august-2016.txt X3SC3: chemistry_name: SC3Pv3 umi_length: 12 - cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/3M-february-2018.txt.gz + cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/3M-february-2018.txt X5SCR: chemistry_name: SC5P-R2 umi_length: 10 - cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt + cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-august-2016.txt unknown: chemistry_name: auto umi_lenth: 10 - cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt + cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-august-2016.txt 
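A sketch of how the `scrna_chemistry_options` table above is consumed by the single-cell tasks: the sample's assay code selects an entry, and the chemistry name plus barcode whitelist are pulled from it. Treating the `unknown` (auto-detect) entry as a fallback for unlisted assay codes is an assumption made here for illustration, not something the templates spell out.

```python
# Hypothetical lookup against the scrna_chemistry_options block; the fallback
# to the 'unknown' entry for unlisted assay codes is an assumption.
def scrna_chemistry(assay_code, options):
    entry = options.get(assay_code, options["unknown"])
    return entry["chemistry_name"], entry["cell_barcode_whitelist_file"]
```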
gatk_known_sites: - /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/public_databases/eva/GCA_000002285.2_current_ids_renamed.vcf.gz gatk_cnn_resources: - /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/public_databases/eva/GCA_000002285.2_current_ids_renamed.vcf.gz bwa_index: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/tool_resources/bwa_0.7.17/Canis_familiaris.CanFam3.1.dna.toplevel.fa + bwa_mem2_index: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/tool_resources/bwa_2.2.1/Canis_familiaris.CanFam3.1.dna.toplevel.fa gtf: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/Canis_familiaris.CanFam3.1.98.gtf ref_flat: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/Canis_familiaris.CanFam3.1.98.refFlat.txt ribo_locations: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/Canis_familiaris.CanFam3.1.98.ribo.interval_list @@ -453,18 +486,21 @@ constants: starfusion_index: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/tool_resources/starFusion_ensembl_v98/starFusion_Resources/ctat_genome_lib_build_dir strandedness_options: inward-unstranded-notapplicable: + rsem: "none" salmon: "IU" htseq: "no" featurecounts: "0" tophat: "-fr-unstranded" collectrnaseqmetrics: "NONE" inward-stranded-forward: + rsem: "forward" salmon: "ISF" htseq: "yes" featurecounts: "1" tophat: "-fr-secondstrand" collectrnaseqmetrics: "FIRST_READ_TRANSCRIPTION_STRAND" inward-stranded-reverse: + rsem: "reverse" salmon: "ISR" htseq: "reverse" featurecounts: "2" diff --git a/required_scripts/Process_Assembled_BAM_eb25fca.py b/required_scripts/Process_Assembled_BAM_277eed7.py similarity index 71% rename from required_scripts/Process_Assembled_BAM_eb25fca.py rename to required_scripts/Process_Assembled_BAM_277eed7.py index e0ab0b3c..c63102de 100644 --- a/required_scripts/Process_Assembled_BAM_eb25fca.py +++ b/required_scripts/Process_Assembled_BAM_277eed7.py @@ -696,261 +696,157 @@ def check_gene_call(table, gene, nreads, min_con_len): for t_elem in res_gene: emp_list.append(t_elem) return emp_list - # other cases - table_by_gene = table[(table.Gene_1 == gene)] - table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)] - # print(table_by_gene) - for row in table_by_gene.index: - print("In Table") - print(table_by_gene.at[row, 'name']) - - if (table_by_gene.at[row, 'fragments_at_junc_1'] >= table_by_gene.at[row, 'fragments_at_junc_2']): - count_gene = table_by_gene.at[row, 'fragments_at_junc_1'] - else: - count_gene = table_by_gene.at[row, 'fragments_at_junc_2'] - - contig_length = table_by_gene.at[row, 'Contig_length_1'] - - if (table_by_gene.at[row, 'Contig_reversed2']): - ig_bp2_tmp = table_by_gene.at[row, 'pos_2_end'] - else: - ig_bp2_tmp = table_by_gene.at[row, 'pos_2_start'] - - if (('Gene_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)): - if ('IG' in str(table_by_gene.at[row, 'Gene_3'])): - # if(table_by_gene.at[row,'Contig_reversed3']): - # ig_bp3_tmp = table_by_gene.at[row,'pos_3_end'] - # else: - ig_breakpoint3 = get_genomic_bp(table_by_gene.at[row, 'cigar_3'], table_by_gene.at[row, 'aligned_length_3'], - table_by_gene.at[row, 'Contig_reversed3'], table_by_gene.at[row, 'pos_3_start']) - ig_breakpoint2 = get_genomic_bp(table_by_gene.at[row, 'cigar_2'], table_by_gene.at[row, 'aligned_length_2'], - table_by_gene.at[row, 'Contig_reversed2'], table_by_gene.at[row, 'pos_2_start']) - ig_breakpoint = str(int(ig_breakpoint2)) + ";" + str(int(ig_breakpoint3)) - # 
ig_bp3_tmp = table_by_gene.at[row,'pos_3_start'] - # ig_gene_cigar = table_by_gene.at[row,'cigar_3'] - # matchlen = table_by_gene.at[row,'aligned_length_3'] - # ig_breakpoint = get_genomic_bp(ig_gene_cigar, matchlen, table_by_gene.at[row,'Contig_reversed3'],ig_bp3_tmp) - # str(int(ig_bp2_tmp))+";"+str(int(ig_bp3_tmp)) - else: - ig_bp2_tmp = table_by_gene.at[row, 'pos_2_start'] - ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_2'], table_by_gene.at[row, 'aligned_length_2'], - table_by_gene.at[row, 'Contig_reversed2'], ig_bp2_tmp) - # ig_breakpoint = table_by_gene.at[row,'pos_2_start'] - else: - ig_breakpoint = table_by_gene.at[row, 'pos_2_start'] - - if (('cigar_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)): - if ('IG' in str(table_by_gene.at[row, 'Gene_3'])): - ig_cigar = table_by_gene.at[row, 'cigar_2'] + ";" + table_by_gene.at[row, 'cigar_3'] - else: - ig_cigar = table_by_gene.at[row, 'cigar_2'] - else: - ig_cigar = table_by_gene.at[row, 'cigar_2'] - - pos_start = table_by_gene.at[row, 'pos_1_start'] - gene_cigar = table_by_gene.at[row, 'cigar_1'] - matchlen = table_by_gene.at[row, 'aligned_length_1'] - breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, 'Contig_reversed1'], pos_start) - - gene_tmp = table_by_gene.at[row, 'Gene_2'] - source = ig_dict[gene_tmp] - gene_overlap = table_by_gene.at[row, 'aligned_length_1'] - ig_overlap = table_by_gene.at[row, 'aligned_length_2'] - - if (table_by_gene.at[row, 'aligned_length_1'] >= min_con_len and table_by_gene.at[ - row, 'aligned_length_2'] >= min_con_len and count_gene >= nreads): - call = 1 - print("Tx pass") - if (table_by_gene.at[row, 'strand_1'] == 'Pos'): - print("Found pos") - pos_strand_der = 1 - pos_strand_list = ( - pos_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint, - count_gene) - elif (table_by_gene.at[row, 'strand_1'] == 'Neg'): - neg_strand_der = 1 - print("found neg") - neg_strand_list = ( - neg_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint, - count_gene) - # Case:2 Gene 2 is Target Gene - # This is the case when IG is Gene 1 and Target Gene is gene2 - if ('Gene_2' in table): - table_by_gene = table[(table.Gene_2 == gene)] - table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)] - else: - table_by_gene = pd.DataFrame() + #updated code + #start with quey gene + gg = 'Gene_'+str(1) - # print(table_by_gene) + table_by_gene = table #_c[(table_c.Gene_1 == gene)] + table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)] + #loop all rows for row in table_by_gene.index: - print("In Table") + print("In Table Gene1 is target") print(table_by_gene.at[row, 'name']) - - if (table_by_gene.at[row, 'fragments_at_junc_2'] >= table_by_gene.at[row, 'fragments_at_junc_1']): - count_gene = table_by_gene.at[row, 'fragments_at_junc_2'] - else: - count_gene = table_by_gene.at[row, 'fragments_at_junc_1'] - - contig_length = table_by_gene.at[row, 'Contig_length_2'] - - if (table_by_gene.at[row, 'Contig_reversed1']): - ig_bp1_tmp = table_by_gene.at[row, 'pos_1_start'] + table_by_gene.at[row, 'aligned_length_1'] - else: - ig_bp1_tmp = table_by_gene.at[row, 'pos_1_start'] + table_by_gene.at[row, 'aligned_length_1'] - - # this case is highly unlikely as bam is sorted by posn. 
- if (('Gene_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)): - - ig_bp3_tmp = table_by_gene.at[row, 'pos_3_start'] - - if ('IG' in str(table_by_gene.at[row, 'Gene_3'])): - ig_bp3_tmp = table_by_gene.at[row, 'pos_3_start'] - ig_breakpoint1 = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'], - table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start']) - ig_breakpoint3 = get_genomic_bp(table_by_gene.at[row, 'cigar_3'], table_by_gene.at[row, 'aligned_length_3'], - table_by_gene.at[row, 'Contig_reversed3'], ig_bp3_tmp) - ig_breakpoint = str(int(ig_breakpoint1)) + ";" + str(int(ig_breakpoint3)) - else: - ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'], - table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start']) - else: - ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'], - table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start']) - - if (('cigar_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)): - if ('IG' in str(table_by_gene.at[row, 'Gene_3'])): - ig_cigar = table_by_gene.at[row, 'cigar_1'] + ";" + table_by_gene.at[row, 'cigar_3'] - else: - ig_cigar = table_by_gene.at[row, 'cigar_1'] - else: - ig_cigar = table_by_gene.at[row, 'cigar_1'] - - # if(table_by_gene.at[row,'Contig_reversed2']): - # breakpoint = table_by_gene.at[row,'pos_2_end'] + table_by_gene.at[row,'aligned_length_2'] - # else: - # breakpoint = table_by_gene.at[row,'pos_2_start'] + table_by_gene.at[row,'aligned_length_2'] - - gene_cigar = table_by_gene.at[row, 'cigar_2'] - pos_start = table_by_gene.at[row, 'pos_2_start'] - matchlen = table_by_gene.at[row, 'aligned_length_2'] - breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, 'Contig_reversed2'], pos_start) - - gene_tmp = table_by_gene.at[row, 'Gene_1'] - source = ig_dict[gene_tmp] - gene_overlap = table_by_gene.at[row, 'aligned_length_2'] - ig_overlap = table_by_gene.at[row, 'aligned_length_1'] - - if (table_by_gene.at[row, 'aligned_length_2'] >= min_con_len and table_by_gene.at[ - row, 'aligned_length_1'] >= min_con_len and count_gene >= nreads): + print(table_by_gene.at[row, gg]) + + total_aligned_length = 0 + ig_breakpoint = '' + ig_aligned_length = 0 + gene_strand ='' + all_gene_cigar = '' + all_ig_cigar = '' + gene_overlap = 0 + ig_overlap = 0 + + #get max splits for contigs + max_contig_splits = 1 + for col in table_by_gene.columns: + #print(col) + #assuming max 10 splits per contigs which is highly unlikely + for tmp_idx in range(1,10): + tmp_gene = 'Gene_' + str(tmp_idx) + if(tmp_gene == col): + if(tmp_idx >= max_contig_splits): + max_contig_splits = tmp_idx + #for loop range add 1 + max_contig_splits = max_contig_splits + 1 + + print("We have total contig splits "+str(max_contig_splits)) + #loop through all columns of each row + for gindex in range(1,max_contig_splits): + gg = 'Gene_'+str(gindex) + + print("Gene is "+gg+" "+str(gindex)) + gindex2 = gindex + + #set indices + pos_col = 'pos_'+str(gindex)+'_start' + cigar_col= 'cigar_'+str(gindex) + align_col = 'aligned_length_'+str(gindex) + contig_flag_col = 'Contig_reversed'+str(gindex) + frag_col = 'fragments_at_junc_'+str(gindex) + contig_col = 'Contig_length_'+str(gindex) + strand_col = 'strand_'+str(gindex) + + if((table_by_gene.at[row,gg]==gene)): + + #get values for target + print(pos_col+cigar_col) + pos_start = table_by_gene.at[row, pos_col] + gene_cigar 
= table_by_gene.at[row, cigar_col] + if(all_gene_cigar ==''): + all_gene_cigar = table_by_gene.at[row, cigar_col] + else: + all_gene_cigar = all_gene_cigar + table_by_gene.at[row, cigar_col] + matchlen = table_by_gene.at[row, align_col] + count_gene = table_by_gene.at[row, frag_col] + contig_length = table_by_gene.at[row, contig_col] + if(matchlen > gene_overlap): + gene_overlap = matchlen + total_aligned_length = total_aligned_length + matchlen + breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, contig_flag_col], pos_start) + gene_strand = table_by_gene.at[row, strand_col] + #print(" bb="+str(breakpoint)+" "+str(matchlen)+" "+gene_cigar+" "+str(pos_start)) + + #if gg is IG + elif('IG' in str(table_by_gene.at[row,gg])): + #print("Found IG at "+gg) + ig_breakpoint = get_genomic_bp(table_by_gene.at[row, cigar_col], table_by_gene.at[row, align_col], + table_by_gene.at[row, contig_flag_col], table_by_gene.at[row, pos_col]) + gene_tmp = table_by_gene.at[row,gg] + if(table_by_gene.at[row, align_col] > ig_overlap): + ig_overlap = table_by_gene.at[row, align_col] + source = ig_dict[gene_tmp] + if(table_by_gene.at[row, align_col] > ig_aligned_length): + ig_aligned_length = table_by_gene.at[row, align_col] + + ig_cigar = table_by_gene.at[row, cigar_col] + if(all_ig_cigar ==''): + all_ig_cigar = table_by_gene.at[row, cigar_col] + else: + all_ig_cigar = all_ig_cigar + table_by_gene.at[row, cigar_col] + + #check all alignments for the contig + while(gindex2 < max_contig_splits-1): + gindex2= gindex2 + 1 + gg2 = 'Gene_'+str(gindex2) + #set indices + pos_col2 = 'pos_'+str(gindex2)+'_start' + cigar_col2 = 'cigar_'+str(gindex2) + align_col2 = 'aligned_length_'+str(gindex2) + contig_flag_col2 = 'Contig_reversed'+str(gindex2) + frag_col2 = 'fragments_at_junc_'+str(gindex2) + + #if gene is same as target gene + if((table_by_gene.at[row,gg] == table_by_gene.at[row,gg2]) and (table_by_gene.at[row,gg]==gene)): + print("Found double match"+gg2) + align_col2 = 'aligned_length_'+str(gindex2) + total_aligned_length = total_aligned_length + table_by_gene.at[row, align_col2] + gene_cigar = gene_cigar+';'+table_by_gene.at[row, cigar_col2] + if(table_by_gene.at[row, frag_col2] > count_gene): + count_gene = table_by_gene.at[row, frag_col2] + if( table_by_gene.at[row, align_col2] > gene_overlap): + gene_overlap = table_by_gene.at[row, align_col2] + + #multiple alignments of IG + if(('IG' in str(table_by_gene.at[row,gg2])) and (table_by_gene.at[row,gg]==table_by_gene.at[row,gg2])): + print("Found IG at "+gg2) + ig_breakpoint2 = get_genomic_bp(table_by_gene.at[row, cigar_col2], table_by_gene.at[row, align_col2], + table_by_gene.at[row, contig_flag_col2], table_by_gene.at[row, pos_col2]) + if(table_by_gene.at[row, align_col2] > ig_aligned_length): + ig_aligned_length = table_by_gene.at[row, align_col2] + ig_breakpoint = str(int(ig_breakpoint))+';'+str(int(ig_breakpoint2)) + ig_cigar = ig_cigar+';'+table_by_gene.at[row, cigar_col2] + #test only since we are missing fastq + print("************************\n"+str(total_aligned_length) + " "+str(ig_aligned_length)+"*****************\n") + #gene overlap is max alignment and total is total region aligned + #switching total to gene to filter by max.
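# --- Illustrative aside (not part of the patch): the added block above walks dynamically
# numbered columns (Gene_1, cigar_1, aligned_length_1, ..., Gene_N, ...) for one assembled
# contig and keeps the best gene/IG alignment per split. The standalone sketch below only
# mirrors that column-scanning pattern; the function name, column names, and the example
# row are assumptions made for illustration, not the pipeline's actual code.
import re
import pandas as pd

def scan_contig_splits(row: pd.Series, target_gene: str):
    """Collect per-split stats for a target gene from Gene_N / aligned_length_N columns."""
    # infer how many splits this row actually carries instead of assuming a fixed count
    split_ids = sorted(int(m.group(1)) for c in row.index if (m := re.fullmatch(r"Gene_(\d+)", c)))
    best_overlap, cigars = 0, []
    for i in split_ids:
        if row[f"Gene_{i}"] != target_gene:
            continue
        cigars.append(str(row[f"cigar_{i}"]))
        best_overlap = max(best_overlap, int(row[f"aligned_length_{i}"]))
    return best_overlap, ";".join(cigars)

# tiny usage example with a made-up row
example = pd.Series({"Gene_1": "IGH", "cigar_1": "60M40S", "aligned_length_1": 60,
                     "Gene_2": "MYC", "cigar_2": "40S60M", "aligned_length_2": 60})
print(scan_contig_splits(example, "MYC"))  # -> (60, '40S60M')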
+ #we keep both in case we would like to get total length in later revisions + total_aligned_length = gene_overlap + #ig_overlap = ig_aligned_length + if (total_aligned_length >= min_con_len and ig_aligned_length >= min_con_len and count_gene >= nreads): call = 1 print("Tx pass") - if (table_by_gene.at[row, 'strand_2'] == 'Pos'): + if (gene_strand == 'Pos'): print("Found pos") pos_strand_der = 1 pos_strand_list = ( - pos_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint, + pos_strand_der, contig_length, gene_overlap, all_gene_cigar, ig_overlap, all_ig_cigar, breakpoint, ig_breakpoint, count_gene) - elif (table_by_gene.at[row, 'strand_2'] == 'Neg'): + elif (gene_strand == 'Neg'): neg_strand_der = 1 print("found neg") neg_strand_list = ( - neg_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint, + neg_strand_der, contig_length, gene_overlap, all_gene_cigar, ig_overlap, all_ig_cigar, breakpoint, ig_breakpoint, count_gene) - ######## - # Case 3: Gene_3 is target gene when 1 and 2 match to IG and 3 is the gene. Case of IG multialignment - ######## - if ('Gene_3' in table): - table_by_gene = table[(table.Gene_3 == gene)] - table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)] - else: - table_by_gene = pd.DataFrame() - # print(table_by_gene) - for row in table_by_gene.index: - print("In Table") - print(table_by_gene.at[row, 'name']) - - if (table_by_gene.at[row, 'fragments_at_junc_3'] >= table_by_gene.at[row, 'fragments_at_junc_1']): - count_gene = table_by_gene.at[row, 'fragments_at_junc_3'] - else: - count_gene = table_by_gene.at[row, 'fragments_at_junc_1'] - - contig_length = table_by_gene.at[row, 'Contig_length_3'] - - if (table_by_gene.at[row, 'Contig_reversed1']): - ig_bp1_tmp = table_by_gene.at[row, 'pos_1_end'] - else: - ig_bp1_tmp = table_by_gene.at[row, 'pos_1_start'] - - if (('Gene_2' in table) and (table_by_gene.at[row, 'Gene_2'] != 0)): - if (table_by_gene.at[row, 'Contig_reversed2']): - ig_bp2_tmp = table_by_gene.at[row, 'pos_2_end'] - else: - ig_bp2_tmp = table_by_gene.at[row, 'pos_2_start'] - - if ('IG' in str(table_by_gene.at[row, 'Gene_2'])): - ig_breakpoint1 = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'], - table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start']) - ig_breakpoint2 = get_genomic_bp(table_by_gene.at[row, 'cigar_2'], table_by_gene.at[row, 'aligned_length_2'], - table_by_gene.at[row, 'Contig_reversed2'], table_by_gene.at[row, 'pos_2_start']) - ig_breakpoint = str(int(ig_breakpoint1)) + ";" + str(int(ig_breakpoint2)) - - # ig_breakpoint = str(int(ig_bp1_tmp))+";"+str(int(ig_bp2_tmp)) # str(int(table_by_gene.at[row,'pos_1_start']))+";"+str(int(table_by_gene.at[row,'pos_2_start'])) - else: - ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'], - table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start']) - # ig_breakpoint =ig_bp1_tmp #table_by_gene.at[row,'pos_1_start'] - else: - ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'], - table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start']) - # ig_breakpoint = table_by_gene.at[row,'pos_1_start'] - - if (('cigar_2' in table) and (table_by_gene.at[row, 'Gene_2'] != 0)): - if ('IG' in str(table_by_gene.at[row, 'Gene_2'])): - ig_cigar = table_by_gene.at[row, 'cigar_1'] + ";" + 
table_by_gene.at[row, 'cigar_2'] - else: - ig_cigar = table_by_gene.at[row, 'cigar_1'] - else: - ig_cigar = table_by_gene.at[row, 'cigar_1'] - - # if(table_by_gene.at[row,'Contig_reversed3']): - # breakpoint = table_by_gene.at[row,'pos_3_start'] + table_by_gene.at[row,'aligned_length_3'] - # else: - # breakpoint = table_by_gene.at[row,'pos_3_start'] +table_by_gene.at[row,'aligned_length_3'] - - gene_cigar = table_by_gene.at[row, 'cigar_3'] - pos_start = table_by_gene.at[row, 'pos_3_start'] - matchlen = table_by_gene.at[row, 'aligned_length_3'] - breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, 'Contig_reversed3'], pos_start) - - gene_tmp = table_by_gene.at[row, 'Gene_1'] - source = ig_dict[gene_tmp] - gene_overlap = table_by_gene.at[row, 'aligned_length_3'] - ig_overlap = table_by_gene.at[row, 'aligned_length_1'] - - if (table_by_gene.at[row, 'aligned_length_3'] >= min_con_len and table_by_gene.at[ - row, 'aligned_length_1'] >= min_con_len and count_gene >= nreads): - call = 1 - print("Tx pass") - if (table_by_gene.at[row, 'strand_3'] == 'Pos'): - print("Found pos") - pos_strand_der = 1 - pos_strand_list = ( - pos_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint, - count_gene) - elif (table_by_gene.at[row, 'strand_3'] == 'Neg'): - neg_strand_der = 1 - print("found neg") - neg_strand_list = ( - neg_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint, - count_gene) +##end of updated code print("call = " + str(call)) # Collapse into one list if (call == 1): - print(pos_strand_list) - print(neg_strand_list) + #print(pos_strand_list) + #print(neg_strand_list) res_gene = [call, source] for x in pos_strand_list: res_gene.append(x) @@ -1002,7 +898,7 @@ def gen_summ_table(filt_table, results_table, nreads, min_con_len, sample): results = ((sample,), nsd2_call, ccnd3_call, myc_call, mafa_call, ccnd1_call, ccnd2_call, maf_call, mafb_call) - print(mafb_call) + #print(mafb_call) con_results = [] con_results.append(sample) # con_results=con_results+nsd2_call+ccnd3_call+myc_call+mafa_call+ccnd1_call+ccnd2_call+maf_call+mafb_call @@ -1024,7 +920,7 @@ def gen_summ_table(filt_table, results_table, nreads, min_con_len, sample): con_results.append(x) print("full list") - print(con_results) + #print(con_results) translocationsTable = pd.DataFrame(columns=column_names) translocationsTable.loc[len(translocationsTable)] = con_results return translocationsTable @@ -1101,7 +997,7 @@ def getFragsatJunction_samtools(location, contig, bam): # generate summary table summ_table = gen_summ_table(filt_table, results_table, min_reads, window_size, sample_name) -print(summ_table) +#print(summ_table) out_file_summ = out_path + "/DEX_IgTx_GA_Summary.txt" summ_table.to_csv(out_file_summ, sep="\t", index=False, na_rep=0, float_format='%.0f') print("Test Done") diff --git a/required_scripts/README.md b/required_scripts/README.md index d41d8272..c72fb693 100644 --- a/required_scripts/README.md +++ b/required_scripts/README.md @@ -27,16 +27,17 @@ $ cp plotCNVplus.R /path/to/phoenix/required_scripts/plotCNVplus_4d89cb4.R ## Script Source Locations [addRC_to_Delly_VCF_f4d178e.py](https://github.com/tgen/jetstream_resources/commit/f4d178e2b8982ff49025d42cb7c18d7b12053f42) -[annotSeg_8820499.pl](https://github.com/tgen/jetstream_resources/commit/8820499e113a387fee98044112951fa534ad6f8e) -[Process_Assembled_BAM_eb25fca.py](https://github.com/tgen/GaMMiT/commit/eb25fca1769e56048439efe80479759e164433cf) 
+[annotSeg_7102f1c.pl](https://github.com/tgen/jetstream_resources/commit/7102f1c4fbdabe8aa16d714aa0cfb8787209df6f) +[Process_Assembled_BAM_277eed7.py](https://github.com/tgen/GaMMiT/commit/277eed728712fa8e636858055ecbf1be270cc114) [manta_prepare_sv_vcf_f94bcc1.py](https://github.com/tgen/jetstream_resources/commit/f94bcc13c826f7d5a4088347e305ffcb49ae6a8e) [manta_sv_annotation_parallel_8820499.py](https://github.com/tgen/jetstream_resources/commit/8820499e113a387fee98044112951fa534ad6f8e) -[mm_igtx_pairoscope_calling_b38_f3010c3.py](https://github.com/tgen/mm_IgTx_Calling/commit/f3010c358970f4c25cefddc824636f60a19842e1) +[mm_igtx_pairoscope_calling_b38_356362b.py](https://github.com/tgen/mm_IgTx_Calling/commit/356362b03f13181f2762ab468f9b4f222439ea69) [plotCNVplus_4d89cb4.R](https://github.com/tgen/plotCNVplus/commit/4d89cb4d8f35e48b916d660c82c52b8725ade16f) [runIchorCNA_47ce8db.R](https://github.com/broadinstitute/ichorCNA/commit/47ce8db4d81ada2d3ce09280661d1240f3dcd530#diff-79cb887cc56cef135b77c5b7a725975c) [samStats2json_3a90a2f.py](https://github.com/tgen/samStats2json/commit/3a90a2fefd8fc60a5ebd391dca6702fae419f32f) +[sigprofiler_d78cc9e.py](https://github.com/tgen/jetstream_resources/commit/d78cc9e243e2fbfe4e7adae91f1ce70015e38131) [seg_extend_229b8c7.py](https://github.com/tgen/jetstream_resources/commit/229b8c7641dd505789664aab88c1662d1f97e429) [summarize_samstats_8c45d63.R](https://github.com/tgen/plot_samstats/commit/8c45d63dbd7f5037d7bb658ac91647898bf7509f) -[summarize_Ig_4b93aee.R](https://github.com/tgen/jetstream_resources/commit/4b93aeea52ebc4721621a58a7520f676b4b97001) +[summarize_Ig_875a823.R](https://github.com/tgen/jetstream_resources/commit/875a823202ba698d7adc1f25db86290b67d19028) [svtop.delly.sv_annotation.parallel_8820499.py](https://github.com/tgen/jetstream_resources/commit/8820499e113a387fee98044112951fa534ad6f8e) [uploadStats2Lims_1ace81f.py](https://github.com/tgen/uploadStats2Lims/pull/2/commits/1ace81faaea5f894b9f618d86b1d2d9b8149cdc6) diff --git a/required_scripts/annotSeg_8820499.pl b/required_scripts/annotSeg_7102f1c.pl similarity index 86% rename from required_scripts/annotSeg_8820499.pl rename to required_scripts/annotSeg_7102f1c.pl index 56c938e2..139ff4bd 100755 --- a/required_scripts/annotSeg_8820499.pl +++ b/required_scripts/annotSeg_7102f1c.pl @@ -93,6 +93,7 @@ print OFILE "##INFO=\n"; print OFILE "##INFO=\n"; print OFILE "##INFO=\n"; +print OFILE "##INFO=\n"; print OFILE "##INFO=\n"; print OFILE "##INFO=\n"; print OFILE "##ALT=\n"; @@ -119,7 +120,10 @@ $qual=abs($temp[5]); - if ($temp[0] =~/^\"ID\"/) {next LOOP;} + # Added ^Sample here to continue legacy support of this script with newer outputs + # since this is not a recommended script, we used a solution that required minimal + # development time. 
+ if ($temp[0] =~/^\"ID\"/ || $temp[0] =~/^Sample/) {next LOOP;} $st=int($temp[2]/100); $en=int($temp[3]/100); @@ -146,18 +150,20 @@ } if (@gns){ - $vcfline="$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$temp[4];LOG2FC=$temp[5]"; + $svlen = $temp[3] - $temp[2]; + $vcfline="$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$svlen;MARKERS=$temp[4];LOG2FC=$temp[5]"; $genes=join(",",@gns); print OFILE "$vcfline;GENE=$genes\n"; } else{ - print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$temp[4];LOG2FC=$temp[5]\n"; + $svlen = $temp[3] - $temp[2]; + print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$svlen;MARKERS=$temp[4];LOG2FC=$temp[5]\n"; } }else{ - print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\t.\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$temp[4];LOG2FC=$temp[5]\n"; + $svlen = $temp[3] - $temp[2]; + print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\t.\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$svlen;MARKERS=$temp[4];LOG2FC=$temp[5]\n"; } } close(OFILE); close(FILE); - diff --git a/required_scripts/mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py b/required_scripts/mm_igtx_pairoscope_calling_b38_356362b.py old mode 100644 new mode 100755 similarity index 98% rename from required_scripts/mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py rename to required_scripts/mm_igtx_pairoscope_calling_b38_356362b.py index 4d185c50..932f0fe8 --- a/required_scripts/mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py +++ b/required_scripts/mm_igtx_pairoscope_calling_b38_356362b.py @@ -139,19 +139,22 @@ def call_translocations(sample, gene, gene_chr, order, window_start=0, window_en print("dreads.PositionA is: " + str(dreads.PositionA)) gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr14") & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end) - & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844745))] + & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844760)) + & ((dreads.PositionA <= 143243200) | (dreads.PositionA >= 143244000))] print(str(gene_igh.columns)) print(str(gene_igh)) count_gene_igh = len(gene_igh.index) gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr) & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end) - & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844745))] + & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844760)) + & ((dreads.PositionA <= 143243200) | (dreads.PositionA >= 143244000))] count_gene_igk = len(gene_igk.index) print(str(gene_igk.columns)) print(str(gene_igk)) gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22") & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end) - & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844745))] + & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844760)) + & ((dreads.PositionA <= 143243200) | (dreads.PositionA >= 143244000))] count_gene_igl = len(gene_igl.index) print(str(gene_igl.columns)) print(str(gene_igl)) diff --git a/required_scripts/mm_igtx_pairoscope_calling_b38_f3010c3.py b/required_scripts/mm_igtx_pairoscope_calling_b38_f3010c3.py deleted file mode 100755 index dd7d5046..00000000 --- 
a/required_scripts/mm_igtx_pairoscope_calling_b38_f3010c3.py +++ /dev/null @@ -1,386 +0,0 @@ -#!/usr/bin/env python3 - -# MYC blacklist region chr8:129571461-129571635 (b37) -# MYC blacklist region chr8:128559215-128559389 (b38) - -################################################################# -# Configure the enviroment -import pandas as pd -import argparse -from multiprocessing import Pool - -################################################################# - -parser = argparse.ArgumentParser(description='Process pairoscope discordant reads to make myeloma Ig translocation calls.') - -parser.add_argument('-i', '--input_file', - required=True, - metavar='File.tsv', - dest="input_file", - help='Discordant_read_table') -parser.add_argument('-s', '--specimen', - required=True, - help='Specimen Name, must match discordant read table') -parser.add_argument('-o', '--output_file', - required=True, - metavar='File.tsv', - help='Output filename') -parser.add_argument('-w', '--window', - default=2000, - type=int, - metavar='INT', - help='Genomic window size to query (recommend 2.5x insert size)') -parser.add_argument('-m', '--window_min', - default=100, - type=int, - metavar='INT', - help='Required discordant read island size (recommend 0.1x insert size)') -parser.add_argument('-c', '--call_requirement', - default=3, - type=int, - metavar='INT', - help='Required number of discordant read pairs meeting requirements to define a call (recommend >= 3)') - -args = parser.parse_args() - -################################################################# -# Capture Run Parameters - -window = args.window -call_requirement = args.call_requirement -window_min = args.window_min -sample = args.specimen - -################################################################# -# Read in the discordant reads table - -dreads = pd.read_csv(args.input_file, sep="\t") - -################################################################# -# Define Functions - -def call_translocations(sample, gene, gene_chr, order, window_start=0, window_end=300000000): - # initialize variables - gene_window = 0 - gene_maxwindowcount = 0 - gene_nextlargestwindowcount = 0 - gene_windowend = 0 - gene_maxwindowwidth = 0 - gene_maxwindowlocation = 0 - gene_nextwindowwidth = 0 - gene_nextwindowlocation = 0 - gene_ighsource = 0 - - # Debug - print('') - print('------------------------------') - print(sample) - print(gene) - print(gene_chr) - print(order) - - # Determine which immunoglobulin loci is the most likely to contain a translocation - # Watch ChrA and ChrB as they are in numeric order so dependign on Tx and igH region partner - # and target can switch columns - if order == "standard": - print('1 - In Standard Loop') - if gene != 'MYC': - print('1 - In MYC Sub-loop') - gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr14") - & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)] - - count_gene_igh = len(gene_igh.index) - - gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr) - & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)] - count_gene_igk = len(gene_igk.index) - gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22") - & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)] - count_gene_igl = len(gene_igl.index) - else: - print('1 - In Non-MYC Sub-loop') - gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & 
(dreads.ChrB == "chr14") - & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end) - & ((dreads.PositionA <= 128559215) | (dreads.PositionA >= 128559389))] - count_gene_igh = len(gene_igh.index) - gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr) - & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end) - & ((dreads.PositionB <= 128559215) | (dreads.PositionB >= 128559389))] - count_gene_igk = len(gene_igk.index) - gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22") - & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end) - & ((dreads.PositionA <= 128559215) | (dreads.PositionA >= 128559389))] - count_gene_igl = len(gene_igl.index) - print(count_gene_igh) - print(count_gene_igk) - print(count_gene_igl) - if count_gene_igh >= count_gene_igk and count_gene_igh >= count_gene_igl: - print('2-In IgH Loop') - # Suspect igH Translocation as igH count greater then igK and igL - # if all three have zeros or the same number we default to the igH) - table_gene = gene_igh - count_gene = len(table_gene.index) - gene_ighsource = 1 - elif count_gene_igk > count_gene_igh and count_gene_igk > count_gene_igl: - print('2-In IgK Loop') - # Suspect igK Translocation - # WARNiNG THE CHROMOSOME ORDER BECOMES UNEXPECTED, HOW TO FiX? - # Create Flipped Table for the kappa counts ( ChrA PositionA ChrB PositionB) - # Update the column headers - new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA'] - gene_igk.columns = new_cols - # Create new GENE table with the relabelled columns in the expected order - cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB'] - table_gene = gene_igk[cols] - count_gene = len(table_gene.index) - gene_ighsource = 2 - elif count_gene_igl > count_gene_igh and count_gene_igl > count_gene_igk: - print('2-In IgL Loop') - # Suspect igL Translocation - table_gene = gene_igl - count_gene = len(table_gene.index) - gene_ighsource = 3 - else: - # Error capture NOT AN EXPECTED EVENT - # Could be 0 in all three potentially, then what? - # What if two but not all three have the same number of counts? 
- print('2 - Argh - WHAT CAUSES THiS TO HAPPEN') - if count_gene_igh < 3 and count_gene_igk < 3 and count_gene_igl < 3: - print('DEFAULTiNG TO iGH COUNTS BECAUSE ALL LESS THAN 3') - table_gene = gene_igh - count_gene = len(table_gene.index) - gene_ighsource = 1 - elif count_gene_igh < count_gene_igk and count_gene_igh < count_gene_igl and count_gene_igk == count_gene_igl: - print('DEFAULTiNG TO iGH COUNTS BECAUSE IgK and IgL ARE EQUAL EVEN THOUGH LESS') - table_gene = gene_igh - count_gene = len(table_gene.index) - gene_ighsource = 1 - - elif order == "reverse": - print('1 - In Reverse Loop') - gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr14") & (dreads.ChrB == gene_chr) - & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)] - count_gene_igh = len(gene_igh.index) - gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr) - & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)] - count_gene_igk = len(gene_igk.index) - gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22") - & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)] - count_gene_igl = len(gene_igl.index) - - print(count_gene_igh) - print(count_gene_igk) - print(count_gene_igl) - - if count_gene_igh >= count_gene_igk and count_gene_igh >= count_gene_igl: - print('2-In IgH Loop') - # Suspect igH Translocation as igH count greater then igK and igL - # if all three have zeros or the same number we default to the igH - # Update the column headers - new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA'] - gene_igh.columns = new_cols - # Create new GENE table with the relabelled columns in the expected order - cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB'] - table_gene = gene_igh[cols] - count_gene = len(table_gene.index) - gene_ighsource = 1 - elif count_gene_igk > count_gene_igh and count_gene_igk > count_gene_igl: - print('2-In IgK Loop') - # Suspect igK Translocation - # WARNiNG THE CHROMOSOME ORDER BECOMES UNEXPECTED, HOW TO FiX? - # Create Flipped Table for the kappa counts ( ChrA PositionA ChrB PositionB) - # Update the column headers - new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA'] - gene_igk.columns = new_cols - # Create new GENE table with the relabelled columns in the expected order - cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB'] - table_gene = gene_igk[cols] - count_gene = len(table_gene.index) - gene_ighsource = 2 - elif count_gene_igl > count_gene_igh and count_gene_igl > count_gene_igk: - print('2-In IgL Loop') - # Suspect igL Translocation - table_gene = gene_igl - count_gene = len(table_gene.index) - gene_ighsource = 3 - else: - # Error capture NOT AN EXPECTED EVENT - # Could be 0 in all three potentially, then what? - # What if two but not all three have the same number of counts? 
- print('2 - Argh - WHAT CAUSES THiS TO HAPPEN') - if count_gene_igh < 3 and count_gene_igk < 3 and count_gene_igl < 3: - print('DEFAULTiNG TO iGH COUNTS BECAUSE ALL LESS THAN 3') - # Update the column headers - new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA'] - gene_igh.columns = new_cols - # Create new GENE table with the relabelled columns in the expected order - cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB'] - table_gene = gene_igh[cols] - count_gene = len(table_gene.index) - gene_ighsource = 1 - elif count_gene_igh < count_gene_igk and count_gene_igh < count_gene_igl and count_gene_igk == count_gene_igl: - print('DEFAULTiNG TO iGH COUNTS BECAUSE IgK and IgL ARE EQUAL EVEN THOUGH LESS') - # Update the column headers - new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA'] - gene_igh.columns = new_cols - # Create new GENE table with the relabelled columns in the expected order - cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB'] - table_gene = gene_igh[cols] - count_gene = len(table_gene.index) - gene_ighsource = 1 - # Do Stuff - if count_gene > 1: - # Sort the table by PositionA - gene_sorted = table_gene.sort_values(['PositionA'], ascending=True, inplace=False, kind='quicksort', na_position='last') - gene_top = gene_sorted.head(1) - gene_bottom = gene_sorted.tail(1) - gene_top_position = gene_top.iat[0, 2] - gene_bottom_position = gene_bottom.iat[0, 2] - gene_window = gene_bottom_position - gene_top_position - # There are likely some true breakpoints with two breakpoint clusters and these create large windows - # To account for this issue we need to count discordant reads per window - for row in gene_sorted.index: - # Get PositionA value, calculate +/- window, capture rows within the +/- window - position_current = gene_sorted.at[row, 'PositionA'] - position_negative = (position_current - window) - position_positive = (position_current + window) - position_table = gene_sorted[ - (gene_sorted.PositionA > position_negative) & (gene_sorted.PositionA < position_positive)] - position_count = len(position_table.index) - # Create Tracking Variables - # if the current window cound is greater than existing Max, update Max and Next accordingly - if position_count > gene_maxwindowcount: - gene_nextlargestwindowcount = gene_maxwindowcount - gene_maxwindowcount = position_count - # NEW FEATURE START - gene_nextwindowwidth = gene_maxwindowwidth - gene_nextwindowlocation = gene_maxwindowlocation - gene_max_window_top = position_table.head(1) - gene_max_window_bottom = position_table.tail(1) - gene_max_window_top_position = gene_max_window_top.iat[0, 2] - gene_max_window_bottom_position = gene_max_window_bottom.iat[0, 2] - gene_maxwindowwidth = gene_max_window_bottom_position - gene_max_window_top_position - gene_maxwindowlocation = int( (gene_max_window_bottom_position + gene_max_window_top_position) / 2 ) - # NEW FEATURE END - # if the two bundle counts will be greater than total count reset the NextLargest to 0 - if (gene_nextlargestwindowcount + gene_maxwindowcount) > count_gene: - gene_nextlargestwindowcount = 0 - # NEW FEATURE START - gene_nextwindowwidth = 0 - gene_nextwindowlocation = 0 - # NEW FEATURE END - gene_windowend = position_positive - # if the second bundle in order is greater than the current next but less than the max then capture current as a next - elif position_count > gene_nextlargestwindowcount and position_count <= gene_maxwindowcount and position_negative >= gene_windowend: - gene_nextlargestwindowcount = 
position_count - # NEW FEATURE START - # Reusing the Max calculation variables BUT only to calculate the Next window and location (KiSS) - gene_max_window_top = position_table.head(1) - gene_max_window_bottom = position_table.tail(1) - gene_max_window_top_position = gene_max_window_top.iat[0, 2] - gene_max_window_bottom_position = gene_max_window_bottom.iat[0, 2] - gene_nextwindowwidth = gene_max_window_bottom_position - gene_max_window_top_position - gene_nextwindowlocation = int( (gene_max_window_bottom_position + gene_max_window_top_position) / 2 ) - # NEW FEATURE END - - # Make result calls - if gene_maxwindowcount >= call_requirement and gene_maxwindowwidth >= window_min and gene_nextlargestwindowcount >= call_requirement and gene_nextwindowwidth >= window_min: - gene_call = 1 - gene_bundle_count = 2 - elif gene_maxwindowcount >= call_requirement and gene_maxwindowwidth >= window_min and ( - gene_nextlargestwindowcount < call_requirement or gene_nextwindowwidth < window_min): - gene_call = 1 - gene_bundle_count = 1 - # remove the width and location calculations from the 1sies and 2sies (below call requirement) - gene_nextwindowwidth = 0 - gene_nextwindowlocation = 0 - else: - gene_call = 0 - gene_bundle_count = 0 - gene_maxwindowwidth = 0 - # remove the width and location calculations from the 1sies and 2sies (below call requirement) - gene_maxwindowlocation = 0 - gene_nextwindowwidth = 0 - gene_nextwindowlocation = 0 - print('Window Count = ' + str(count_gene)) - return (count_gene, gene_window, gene_maxwindowcount, gene_maxwindowwidth, gene_maxwindowlocation, - gene_nextlargestwindowcount, gene_nextwindowwidth, gene_nextwindowlocation, gene_call, gene_ighsource, - gene_bundle_count) - - -################################################################# -# Make Column Names for Translocation Table -list_of_genes = ['NSD2', 'CCND3', 'MYC', 'MAFA', 'CCND1', 'CCND2', 'MAF', 'MAFB'] -list_of_features = ['COUNT', 'WINDOW', 'MAXWINDOWCOUNT', 'MAXWINDOWWIDTH', 'MAXWINDOWLOCATION', - 'NEXTWINDOWCOUNT', 'NEXTWINDOWWIDTH', 'NEXTWINDOWLOCATION', 'CALL', 'IGSOURCE', 'BUNDLECOUNT'] - -column_names = ['SAMPLE'] -# Make list of column_names: -for gene in list_of_genes: - for feature in list_of_features: - header = '_'.join([gene, feature]) - column_names.append(header) - - -################################################################# -# Function to call all functions for a sample - -def make_sample_calls(sample): - sample_id = sample - - # MMSET - mmset_call = call_translocations(gene="NSD2", gene_chr="chr4", order="standard", sample=sample_id) - - # CCND3 - ccnd3_call = call_translocations(gene="CCND3", gene_chr="chr6", order="standard", sample=sample_id) - - # MYC - myc_call = call_translocations(gene="MYC", gene_chr="chr8", order="standard", window_start=124987758, - window_end=129487754, sample=sample_id) - - # MAFA - mafa_call = call_translocations(gene="MAFA", gene_chr="chr8", order="standard", window_start=142918584, - window_end=143925832, sample=sample_id) - - # CCND1 - ccnd1_call = call_translocations(gene="CCND1", gene_chr="chr11", order="standard", sample=sample_id) - - # CCND2 - ccnd2_call = call_translocations(gene="CCND2", gene_chr="chr12", order="standard", sample=sample_id) - - # MAF - maf_call = call_translocations(gene="MAF", gene_chr="chr16", order="reverse", sample=sample_id) - - # MAFB - mafb_call = call_translocations(gene="MAFB", gene_chr="chr20", order="reverse", sample=sample_id) - - results = ((sample,), mmset_call, ccnd3_call, myc_call, mafa_call, ccnd1_call, 
ccnd2_call, maf_call, mafb_call) - return (sum(results, ())) - - -################################################################# -# Create pool for multithreading - -pool = Pool(processes=2) - -jobs = [] -job = pool.apply_async(make_sample_calls, [sample]) -jobs.append(job) - -results = [] -for job in jobs: - print(job) - result = job.get() - print(result) - results.append(result) - - -translocationsTable = pd.DataFrame(results, columns=column_names) - -# Save results -translocationsTable.to_csv(args.output_file, sep="\t", index=False, na_rep="NaN") - -################################################################# - -# Message all done -print('ALL PROCESSES COMPLETE') diff --git a/required_scripts/sigprofiler_d78cc9e.py b/required_scripts/sigprofiler_d78cc9e.py new file mode 100644 index 00000000..9a2f9ca2 --- /dev/null +++ b/required_scripts/sigprofiler_d78cc9e.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 + +import os +import sys +import argparse +from concurrent.futures import ThreadPoolExecutor +from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen +from SigProfilerExtractor import sigpro as sig + +currentDirectory = os.getcwd() + +def parse_arguments(): + """Parse arguments, validate and return the args""" + + parser = argparse.ArgumentParser( + description='Compute mutational signatures.', + formatter_class=argparse.RawTextHelpFormatter) + + parser.add_argument('-o', '--output', default=currentDirectory, + help='Path to and name of output directory - Can be relative or full path') + + parser.add_argument('-i', '--vcfpath', + help='Path to directory containing vcf file(s) - Can be relative or full path') + + parser.add_argument('-g', '--genome', default='GRCh38', + help='Optional definition of genome, defaults to GRCh38') + + parser.add_argument('-p', '--project', metavar="", + help='Name of the project') + + parser.add_argument('-e', '--exome', default=False, action="store_true", + help='Set if input is from exome') + + parser.add_argument('-t', '--threads', default=-1, type=int, + help='Set number of threads, default will use all threads') + + parser.add_argument('--matrix_only', default=False, action="store_true", + help='Stop after mutational matrix generation') + + parser.add_argument('--extract_only', default=False, action="store_true", + help='Stop after SigProfilerExtractor') + + # prints help message when 0 arguments are entered + if len(sys.argv) == 1: + parser.print_help(sys.stderr) + sys.exit(1) + + parser_args = parser.parse_args() + + return parser_args + +def extractSignatures(output, + vcfpath, + genome, + project, + sigClass='SBS', + sigContext='96', + exome=False, + threads=-1, + extract_only=False): + """Extracts signatures from a matrix generated by SigProfilerMatrixGenerator""" + + sigType = sigClass + sigContext + + if exome: + input_data = vcfpath+"/output/"+sigClass+"/"+project+"."+sigType+".exome" + else: + input_data = vcfpath+"/output/"+sigClass+"/"+project+"."+sigType+".all" + + sig.sigProfilerExtractor("matrix", output, input_data, reference_genome=genome, opportunity_genome = genome, context_type=sigType ,minimum_signatures=1, maximum_signatures=10, cpu=threads) + + if not extract_only: + from SigProfilerExtractor import decomposition as decomp + signatures = output+"/"+sigType+"/Suggested_Solution/"+sigType+"_De-Novo_Solution/Signatures/"+sigType+"_De-Novo_Signatures.txt" + activities = output+"/"+sigType+"/Suggested_Solution/"+sigType+"_De-Novo_Solution/Activities/"+sigType+"_De-Novo_Activities_refit.txt" + samples 
= output+"/"+sigType+"/Samples.txt" + + #to get all cosmic signatures without filtering + decomp.decompose(signatures, activities, samples, output, genome_build=genome, verbose=False, nnls_add_penalty=0.0, nnls_remove_penalty=0.0, initial_remove_penalty=0.0, de_novo_fit_penalty=0.02) + +def main(): + # Parse and validate arguments + args = parse_arguments() + + matrices = matGen.SigProfilerMatrixGeneratorFunc(args.project, args.genome, args.vcfpath, plot=True, exome=args.exome, bed_file=None, chrom_based=False, tsb_stat=False, seqInfo=False, cushion=100) + + if args.matrix_only: + print("User requested matrix generation only, exiting before submitting to SigProfilerExtractor...\n") + sys.exit(0) + + num_tasks = 0 + sig_list = [] + + try: + if matrices['96'][args.project].sum() > 0: + num_tasks = num_tasks + 1 + sig_list.append(('SBS', '96')) + else: + if os.path.exists(args.vcfpath+"/output/SBS"): + for f in os.listdir(args.vcfpath+"/output/SBS"): + os.remove(os.path.join(args.vcfpath+"/output/SBS", f)) + os.rmdir(args.vcfpath+"/output/SBS") + except: + if os.path.exists(args.vcfpath+"/output/SBS"): + for f in os.listdir(args.vcfpath+"/output/SBS"): + os.remove(os.path.join(args.vcfpath+"/output/SBS", f)) + os.rmdir(args.vcfpath+"/output/SBS") + + try: + if matrices['DINUC'][args.project].sum() > 0: + num_tasks = num_tasks + 1 + sig_list.append(('DBS', '78')) + else: + if os.path.exists(args.vcfpath+"/output/DBS"): + for f in os.listdir(args.vcfpath+"/output/DBS"): + os.remove(os.path.join(args.vcfpath+"/output/DBS", f)) + os.rmdir(args.vcfpath+"/output/DBS") + except: + if os.path.exists(args.vcfpath+"/output/DBS"): + for f in os.listdir(args.vcfpath+"/output/DBS"): + os.remove(os.path.join(args.vcfpath+"/output/DBS", f)) + os.rmdir(args.vcfpath+"/output/DBS") + + try: + if matrices['ID'][args.project].sum() > 0: + num_tasks = num_tasks + 1 + sig_list.append(('ID', '83')) + else: + if os.path.exists(args.vcfpath+"/output/ID"): + for f in os.listdir(args.vcfpath+"/output/ID"): + os.remove(os.path.join(args.vcfpath+"/output/ID", f)) + os.rmdir(args.vcfpath+"/output/ID") + except: + if os.path.exists(args.vcfpath+"/output/ID"): + for f in os.listdir(args.vcfpath+"/output/ID"): + os.remove(os.path.join(args.vcfpath+"/output/ID", f)) + os.rmdir(args.vcfpath+"/output/ID") + + if num_tasks > 0: + cpus_per_task = max(int(args.threads/num_tasks),1) + with ThreadPoolExecutor(max_workers=3) as e: + for sigClass, sigContext in sig_list: + e.submit(extractSignatures, args.output, args.vcfpath, args.genome, args.project, sigClass, sigContext, args.exome, cpus_per_task, args.extract_only) + +if __name__ == '__main__': + main() diff --git a/required_scripts/summarize_Ig_4b93aee.R b/required_scripts/summarize_Ig_875a823.R similarity index 82% rename from required_scripts/summarize_Ig_4b93aee.R rename to required_scripts/summarize_Ig_875a823.R index e1b23334..9ed707a8 100644 --- a/required_scripts/summarize_Ig_4b93aee.R +++ b/required_scripts/summarize_Ig_875a823.R @@ -3,7 +3,10 @@ library(tidyverse) suppressPackageStartupMessages(library("optparse")) suppressPackageStartupMessages(library("stats")) -#Read in the individual flatFiles +# Initialize list of matrix object +genes <- c("NSD2", "CCND3", "MYC", "MAFA", "CCND1", "CCND2", "MAF", "MAFB") +callers <- c() +header <- c("Specimen") option_list <- list( make_option(c("-v", "--verbose"), action="store_true", default=TRUE, @@ -21,25 +24,60 @@ option_list <- list( help="Flat file containing Ig Calls using Gammit"), make_option(c("-c", "--count"), 
action="store",type="integer", default=2, help="Minimum caller count [default %default]", - metavar="number") + metavar="number"), + make_option(c("--genes"), action="store", + help="Flat file containing genes of interest (not required)") ) opt <- parse_args(OptionParser(option_list=option_list)) write("Check input files...\n", stderr()) -if(!is.null(opt$pairoscope_file) && !file.exists(as.character(opt$pairoscope_file))) +if(!is.null(opt$pairoscope_file)) { - write("Pairoscope Ig Tx file not found...\n", stderr()) + callers <- append(callers, "Pairoscope") + if(!file.exists(as.character(opt$pairoscope_file))) + { + write("Pairoscope Ig Tx file not found...\n", stderr()) + } } -if (!is.null(opt$manta_file) && !file.exists(as.character(opt$manta_file))) + +if (!is.null(opt$manta_file)) { - write("Manta Ig Tx file not found...\n", stderr()) + callers <- append(callers, "Manta") + if (!file.exists(as.character(opt$manta_file))) + { + write("Manta Ig Tx file not found...\n", stderr()) + } } -if (!is.null(opt$gammit_file) && !file.exists(as.character(opt$gammit_file))) + +if (!is.null(opt$gammit_file)) +{ + callers <- append(callers, "Gammit") + if (!file.exists(as.character(opt$gammit_file))) + { + write("Gammit Ig Tx file not found...\n", stderr()) + } +} + +if (!is.null(opt$genes)) { - write("Gammit Ig Tx file not found...\n", stderr()) + genes = read.table(file=opt$genes) } +for (gene in genes) { + header <- append(header, paste(gene, "Summary_CALL", sep = "_")) + header <- append(header, paste(gene, "IgSource", sep = "_")) + header <- append(header, paste(gene, "CALLER_COUNT", sep = "_")) + for (caller in callers) { + header <- append(header, paste(gene, "CALL", caller, sep = "_")) + } +} + +init_matrix <- matrix(0, ncol = length(header), nrow = 1) +colnames(init_matrix) <- header +init_table <- as_tibble(init_matrix) +init_table <- init_table %>% mutate_at(1, as.character) + write("Processing Data...\n", stderr()) specimen = tibble(Specimen=opt$specimen) combined_calls<-NULL @@ -153,9 +191,15 @@ combined_calls$MAF_IgSource = ifelse(length(unique(MAF_IgSource))==1, unique(MAF combined_calls$MAFA_IgSource = ifelse(length(unique(MAFA_IgSource))==1, unique(MAFA_IgSource),0) combined_calls$MAFB_IgSource = ifelse(length(unique(MAFB_IgSource))==1, unique(MAFB_IgSource),0) -combined_calls=combined_calls[,order(colnames(combined_calls), decreasing = TRUE)] +#combined_calls=combined_calls[,order(colnames(combined_calls), decreasing = TRUE)] + +summary_table <- init_table %>% + right_join(combined_calls) %>% + replace(is.na(.), 0) + +summary_table=summary_table[,order(colnames(summary_table), decreasing = TRUE)] # combined_calls=combined_calls %>% relocate(Specimen) write("Save results...\n", stderr()) -write_tsv(combined_calls, opt$outfile, append = FALSE, na="NA") +write_tsv(summary_table, opt$outfile, append = FALSE, na="NA") write("Done.\n", stderr()) diff --git a/utilities/copy_fastq.jst b/utilities/copy_fastq.jst index 9adcfedb..879860c7 100644 --- a/utilities/copy_fastq.jst +++ b/utilities/copy_fastq.jst @@ -12,6 +12,10 @@ mkdir -p temp/fastqs/ + {% if fastq.fileType == "fasterq" %} + rsync "{{ fastq.fastqPath|replace(".fastq.gz",".fasterq") }}" "temp/fastqs/" + {% else %} rsync "{{ fastq.fastqPath }}" "temp/fastqs/" + {% endif %} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/utilities/finalize.jst b/utilities/finalize.jst index 9dcb445e..e4bbcdc1 100644 --- a/utilities/finalize.jst +++ b/utilities/finalize.jst @@ -41,6 +41,13 @@ ln --symbolic --force ${line} done 
{% endif %} + + {% if tasks[taskPrefix+"_constitutional_cna_caller_gatk"]|default(true) %} + for line in `find ../{{ general_library_type }}/constitutional_copy_number/gatk -name "*.re_centered.cr.igv.seg"` + do + ln --symbolic --force ${line} + done + {% endif %} {% elif general_library_type in 'genome' %} {% set taskPrefix = 'Genome' %} @@ -73,6 +80,13 @@ ln --symbolic --force ${line} done {% endif %} + + {% if tasks[taskPrefix+"_constitutional_cna_caller_gatk"]|default(true) %} + for line in `find ../{{ general_library_type }}/constitutional_copy_number/gatk -name "*.re_centered.cr.igv.seg"` + do + ln --symbolic --force ${line} + done + {% endif %} {% elif general_library_type in 'rna' %} {% set taskPrefix = 'RNA' %} @@ -121,4 +135,4 @@ {% endfor %} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/utilities/md5sum_bam_cram.jst b/utilities/md5sum_bam_cram.jst index ca98c140..bcc7ece6 100644 --- a/utilities/md5sum_bam_cram.jst +++ b/utilities/md5sum_bam_cram.jst @@ -1,24 +1,25 @@ {% macro md5sum_bam_cram(sample, aligner='bwa') %} {% if cram|default(true) %} -{% set bam_path %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %} -{% set cram_path %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.cram{% endset %} +{% set file_path %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}{% endset %} - name: md5sum_bam_cram_{{ sample.name }}_{{ aligner }} tags: [{{ sample.gltype }}, {{ sample.name }}, md5sum ] input: {% if cram|default(true) %} - - {{ cram_path }} + - {{ file_path }}/{{ sample.name }}.{{ aligner }}.cram + - {{ file_path }}/{{ sample.name }}.{{ aligner }}.cram.crai {% else %} - - {{ bam_path }} + - {{ file_path }}/{{ sample.name }}.{{ aligner }}.bam + - {{ file_path }}/{{ sample.name }}.{{ aligner }}.bam.bai {% endif %} output: {% if cram|default(true) %} - - {{ cram_path }}.md5 - - {{ cram_path }}.crai.md5 + - {{ sample.name }}.{{ aligner }}.cram.md5 + - {{ sample.name }}.{{ aligner }}.cram.crai.md5 {% else %} - - {{ bam_path }}.md5 - - {{ bam_path }}.bai.md5 + - {{ sample.name }}.{{ aligner }}.bam.md5 + - {{ sample.name }}.{{ aligner }}.bam.bai.md5 {% endif %} walltime: "1:00:00" cpus: 1 @@ -27,14 +28,17 @@ set -eu set -o pipefail + {# cd to the directory of the files for proper md5sum generation #} + cd {{ file_path }} + {% if cram|default(true) %} - md5sum {{ cram_path }} > {{ cram_path }}.md5 - md5sum {{ cram_path }}.crai > {{ cram_path }}.crai.md5 + md5sum {{ sample.name }}.{{ aligner }}.cram > {{ sample.name }}.{{ aligner }}.cram.md5 + md5sum {{ sample.name }}.{{ aligner }}.cram.crai > {{ sample.name }}.{{ aligner }}.cram.crai.md5 {% else %} - md5sum {{ bam_path }} > {{ bam_path }}.md5 - md5sum {{ bam_path }}.bai > {{ bam_path }}.bai.md5 + md5sum {{ sample.name }}.{{ aligner }}.bam > {{ sample.name }}.{{ aligner }}.bam.md5 + md5sum {{ sample.name }}.{{ aligner }}.bam.bai > {{ sample.name }}.{{ aligner }}.bam.bai.md5 {% endif %} {% endif %} -{% endmacro %} \ No newline at end of file +{% endmacro %} diff --git a/utilities/remove_files.jst b/utilities/remove_files.jst index b74600c5..dda6969a 100644 --- a/utilities/remove_files.jst +++ b/utilities/remove_files.jst @@ -1,9 +1,13 @@ -{% macro remove_files(directory, before, after) %} +{% macro remove_files(directory, before, after, name='null') %} -{% if after is string %} - {% set task_name %}{{ after }}{% endset %} +{% if name != 'null' %} + {% set task_name %}{{ name }}{% endset %} {% else 
%} + {% if after is string %} + {% set task_name %}{{ after }}{% endset %} + {% else %} {% set task_name %}{{ after[0] }}{% endset %} + {% endif %} {% endif %} - name: removing_files_{{ task_name }} diff --git a/utilities/variant_filtering.jst b/utilities/variant_filtering.jst index 784ac299..fa0edc8f 100644 --- a/utilities/variant_filtering.jst +++ b/utilities/variant_filtering.jst @@ -69,4 +69,56 @@ bcftools index --threads 4 --tbi --force {{ output_vcf }} bcftools index --threads 4 --force {{ output_vcf }} -{% endmacro %} \ No newline at end of file +{% endmacro %} + +{% macro filter_variants(pair, input_vcf, output_dir, output_vcf, task) %} + +{# Checking if CC_filter is already defined, in preparation for allowing user defined filters #} +{% if CC_filter is not defined %} + {% if pair.callers is defined and pair.callers|length > 1 %} + {% if pair.callers | length > 3 %} + {% set CC_filter %}INFO/CC>={{ pair.callers | length - 2 }}{% endset %} + {% else %} + {% set CC_filter %}INFO/CC>={{ pair.callers | length - 1 }}{% endset %} + {% endif %} + {% endif %} +{% endif %} + +{# Checking for output_vcf type, e.g. vcf.gz, vcf, bcf #} +{% if output_vcf.endswith('vcf') %} + {% set output_type = 'v' %} +{% elif output_vcf.endswith('bcf') %} + {% set output_type = 'b' %} +{% else %} + {% set output_type = 'z' %} +{% endif %} + +- name: variant_filtering_{{ pair.name }}_{{ task }} + tags: [variant_filter] + input: {{ input_vcf }} + output: {{ output_vcf }} + cpus: 1 + mem: 4G + walltime: "4:00:00" + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.bcftools.module }} + + mkdir -p {{ output_dir }} + + bcftools view \ + {% if CC_filter is defined %} + --include "{{ CC_filter }}" \ + {% endif %} + --output-file {{ output_vcf }} \ + --output-type {{ output_type }} \ + {{ input_vcf }} + + {# No need to index vcf if it isn't compressed #} + {%- if output_type != 'v' %} + bcftools index --force {{ output_vcf }} + {% endif %} + +{% endmacro %} diff --git a/utilities/vcf_stats.jst b/utilities/vcf_stats.jst new file mode 100644 index 00000000..0ec7d2c1 --- /dev/null +++ b/utilities/vcf_stats.jst @@ -0,0 +1,21 @@ +{% macro vcf_stats(input_vcf, results_dir) %} + +- name: bcftools_stats_{{ input_vcf | basename | replace(".", "_") }} + tags: [bcftools, stats, vcf] + input: {{ input_vcf }} + output: {{ results_dir }}/stats/{{ input_vcf | basename }}.stats.txt + cpus: 1 + walltime: "4:00:00" + cmd: | + set -eu + set -o pipefail + + module load {{ constants.tools.bcftools.module }} + + mkdir -p {{ results_dir }}/stats + + bcftools stats \ + {{ input_vcf }} \ + > {{ results_dir }}/stats/{{ input_vcf | basename }}.stats.txt + +{% endmacro %}
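# --- Illustrative aside (not part of the patch): the filter_variants macro added above derives
# a consensus threshold from the number of callers configured for a pair (callers minus 2 when
# more than three are run, otherwise callers minus 1) and passes it to `bcftools view --include`
# as an INFO/CC expression. The helper below only mirrors that threshold arithmetic so the rule
# can be checked outside the workflow; the function name and the example caller lists are
# assumptions for illustration only.
def cc_filter_expression(callers):
    """Return the INFO/CC consensus filter applied when two or more callers are configured."""
    n = len(callers)
    if n < 2:
        return None  # single-caller runs are not consensus-filtered in the macro
    threshold = n - 2 if n > 3 else n - 1
    return f"INFO/CC>={threshold}"

if __name__ == "__main__":
    # e.g. a three-caller pair -> CC>=2, a five-caller pair -> CC>=3
    print(cc_filter_expression(["mutect2", "strelka2", "lancet"]))
    print(cc_filter_expression(["mutect2", "strelka2", "lancet", "vardict", "octopus"]))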