diff --git a/README.md b/README.md
index f4441373..8c5e97a4 100644
--- a/README.md
+++ b/README.md
@@ -283,58 +283,58 @@ _Click to show details_
Public Tools Used by the Workflow
All tools build with public easybuild configuration files, available [here](https://github.com/easybuilders/easybuild).
-*Last Updated Jan 29th, 2021*
+*Last Updated July 29th, 2021*
| Tool | Version Implemented | Current Version | Dependency and Notes | EasyBuild |
| :---: | :---: | :---: | :--- | :---: |
-| [bcftools](https://github.com/samtools/bcftools/releases) | 1.10.2 | **1.11** | | Yes |
+| [bcftools](https://github.com/samtools/bcftools/releases) | 1.10.2 | **1.13** | | Yes |
| [bedtools](https://github.com/arq5x/bedtools2/releases) | 2.29.0 | **2.30.0** | delly-filter, addmatchRNA, vardict, vcfmerger2 | Yes |
-| [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) | 2.3.5.1 | **2.4.2** | star-fusion | Yes |
+| [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) | 2.3.5.1 | **2.4.4** | star-fusion | Yes |
| [bwa](https://github.com/lh3/bwa/releases) | 0.7.17 | 0.7.17 | | Yes |
-| [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count) | 3.1.0 | **5.0** | | Yes |
-| [cuda-toolkit](https://developer.nvidia.com/cuda-downloads) | 10.1.243 | **11.2** | | No |
-| [deepvariant](https://github.com/google/deepvariant/releases) | 0.10.0 | **1.1.0** | singularity container | Yes |
+| [cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/count) | 6.0 | 6.0 | | Yes |
+| [cuda-toolkit](https://developer.nvidia.com/cuda-downloads) | 10.1.243 | **11.4** | | No |
+| [deepvariant](https://github.com/google/deepvariant/releases) | 0.10.0 | **1.2.0** | singularity container | Yes |
| [delly](https://github.com/dellytools/delly/releases) | 0.7.6 | **0.8.7** | staying with 0.7.6 for compatibility reasons | Yes |
| [dle](https://github.com/tgen/Discordant_Loci_Extractor/releases) | 0.1.5 | 0.1.5 | private repo | No |
-| [freebayes](https://github.com/ekg/freebayes/releases) | 1.3.1 | **1.3.4** | 1.3.2 ensures python3 compatibility | Yes |
-| [gatk](https://github.com/broadinstitute/gatk//releases) | 4.1.8.0 | **4.1.9.0** | | Yes |
-| [gmap-gsnp](http://research-pub.gene.com/gmap/) | 2019-09-12 | **2020-12-17** | star-fusion | Yes |
-| [gridss](https://github.com/PapenfussLab/gridss/releases) | 2.6.3 | **2.10.2** | | |
+| [freebayes](https://github.com/ekg/freebayes/releases) | 1.3.1 | **1.3.4** | | Yes |
+| [gatk](https://github.com/broadinstitute/gatk//releases) | 4.1.8.0 | **4.2.0.0** | | Yes |
+| [gmap-gsnp](http://research-pub.gene.com/gmap/) | 2019-09-12 | **2021-07-23** | star-fusion | Yes |
+| [gridss](https://github.com/PapenfussLab/gridss/releases) | 2.6.3 | **2.12.0** | | |
| [hmmcopyutils](https://github.com/shahcompbio/hmmcopy_utils) | 1.0 | 1.0 | no official release | Yes |
| [htseq](https://github.com/htseq/htseq/releases) | 0.12.3 | **0.13.5** | | |
-| [htslib](https://github.com/samtools/htslib/releases) | 1.10.2 | **1.11** | star-fusion(bgzip) | Yes |
+| [htslib](https://github.com/samtools/htslib/releases) | 1.10.2 | **1.13** | star-fusion(bgzip) | Yes |
| [ichor](https://github.com/broadinstitute/ichorCNA/releases) | 0.2.0 | 0.2.0 | package in R/3.6.1-phoenix module | Yes? |
| [jellyfish](https://github.com/gmarcais/Jellyfish/releases) | 2.3.0 | 2.3.0 | star-fusion | Yes |
| [lancet](https://github.com/nygenome/lancet/releases) | 1.1.0 | 1.1.0 | | Yes |
| [lumosVar2](https://github.com/tgen/lumosVar2/releases) | 1.1 | 1.1 | | Yes |
| [manta](https://github.com/Illumina/manta/releases) | 1.6.0 | 1.6.0 | | Yes |
| [manta_tgen](https://github.com/tgen/manta/releases) | 1.6.0 | 1.6.0 | modified for internal use | No |
-| [multiQC](https://github.com/ewels/MultiQC/releases) | 1.9 | 1.9 | python3 pip | Yes |
-| [octopus](https://github.com/luntergroup/octopus/releases) | 0.6.3-beta | **0.7.0** | | Yes |
-| [openmpi](https://github.com/open-mpi/ompi/releases) | 3.1.3 | **4.1.0** | delly | No |
+| [multiQC](https://github.com/ewels/MultiQC/releases) | 1.9 | **1.11** | python3 pip | Yes |
+| [octopus](https://github.com/luntergroup/octopus/releases) | 0.6.3-beta | **0.7.4** | | Yes |
+| [openmpi](https://github.com/open-mpi/ompi/releases) | 3.1.3 | **4.1.1** | delly | No |
| [pairoscope](https://github.com/genome/pairoscope/releases) | 0.4.2 | 0.4.2 | | Yes |
-| [perl](https://github.com/Perl/perl5/releases) | 5.28.1 | **5.33.6** | star-fusion | Yes |
+| [perl](https://github.com/Perl/perl5/releases) | 5.28.1 | **5.35.2** | star-fusion | Yes |
| [phaser](https://github.com/secastel/phaser/tree/master/phaser) | 1.1.1 | 1.1.1 | vcfmerger2 | Yes |
| [python2](https://www.python.org/downloads/) | 2.7.15 | **2.7.18** | | Yes |
-| [python3](https://www.python.org/downloads/) | 3.7.2 | **3.9.1** | star-fusion, vcfmerger2 | Yes |
-| [R](https://www.r-project.org/) | 3.6.1 | **4.0.3** | gatk cnv, varDict, vcfmerger2 | Yes |
-| [salmon](https://github.com/COMBINE-lab/salmon/releases) | 1.2.1 | **1.4.0** | self, star-fusion | Yes |
+| [python3](https://www.python.org/downloads/) | 3.7.2 | **3.9.6** | star-fusion, vcfmerger2 | Yes |
+| [R](https://www.r-project.org/) | 3.6.1 | **4.1.0** | gatk cnv, varDict, vcfmerger2 | Yes |
+| [salmon](https://github.com/COMBINE-lab/salmon/releases) | 1.2.1 | **1.5.2** | self, star-fusion | Yes |
| [sambamba](https://github.com/biod/sambamba/releases) | 0.7.0 | **0.8.0** | | |
| [samblaster](https://github.com/GregoryFaust/samblaster/releases) | 0.1.24 | **0.1.26** | | |
-| [samtools](https://github.com/samtools/samtools/releases) | 1.10 | **1.11** | | Yes |
-| [singularity](https://github.com/sylabs/singularity/releases) | 3.5.2 | **3.7.1** | deepvariant | Yes, release-candidate |
+| [samtools](https://github.com/samtools/samtools/releases) | 1.10 | **1.13** | | Yes |
+| [singularity](https://github.com/sylabs/singularity/releases) | 3.5.2 | **3.8.1** | deepvariant | Yes |
| [snpEff](https://sourceforge.net/projects/snpeff/files/) | 4.3t | **4.5covid19** | covid related release? | Yes |
| [snpSniffer](https://github.com/tgen/snpSniffer/releases) | 7.0.0 | 7.0.0 | | No |
-| [star](https://github.com/alexdobin/STAR/releases) | 2.7.5a | **2.7.7a** | self, star-fusion | Yes |
-| [star-fusion](https://github.com/STAR-Fusion/STAR-Fusion/releases) | 1.8.1 | **1.9.1** | | Yes |
+| [star](https://github.com/alexdobin/STAR/releases) | 2.7.5a | **2.7.9a** | self, star-fusion | Yes |
+| [star-fusion](https://github.com/STAR-Fusion/STAR-Fusion/releases) | 1.8.1 | **1.10.1** | | Yes |
| [strelka](https://github.com/Illumina/strelka/releases) | 2.9.10 | 2.9.10 | | Yes |
-| [subread](https://sourceforge.net/projects/subread/) | 2.0.0 | **2.0.1** | part of subread package | Yes |
-| [tgen_mutation_burden](https://github.com/tgen/tgen_mutation_burden/releases) | 1.2.1 | 1.2.1 | | No |
+| [subread](https://sourceforge.net/projects/subread/) | 2.0.0 | **2.0.3** | part of subread package | Yes |
+| [tgen_mutation_burden](https://github.com/tgen/tgen_mutation_burden/releases) | 1.2.3 | 1.2.3 | | No |
| [transParser](https://github.com/tgen/transParser/releases) | 1.0.1 | 1.0.1 | | No |
-| [trinityrnaseq](https://github.com/trinityrnaseq/trinityrnaseq/releases) | 2.8.6 | **2.11.0** | star-fusion | Yes |
+| [trinityrnaseq](https://github.com/trinityrnaseq/trinityrnaseq/releases) | 2.8.6 | **2.12.0** | star-fusion | Yes |
| [vardictJava](https://github.com/AstraZeneca-NGS/VarDictJava/releases) | 1.7.0 | **1.8.2** | | Yes |
| [vcfmerger2](https://github.com/tgen/vcfMerger2/releases) | 0.8.7 | 0.8.7 | | Yes |
-| [vep](https://github.com/Ensembl/ensembl-vep/releases) | 98.3 | **102.0** | | Yes |
+| [vep](https://github.com/Ensembl/ensembl-vep/releases) | 98.3 | **104.3** | | Yes |
| [verifybamid2](https://github.com/Griffan/VerifyBamID/releases) | 1.0.6 | **2.0.1** | | Yes |
| [vt](https://github.com/atks/vt/releases) | 0_57721 | 0_57721 | | Yes |
diff --git a/modules/constitutional/deepvariant.jst b/modules/constitutional/deepvariant.jst
index 4cc214c0..92462203 100755
--- a/modules/constitutional/deepvariant.jst
+++ b/modules/constitutional/deepvariant.jst
@@ -1,4 +1,5 @@
{% from 'modules/annotation/main.jst' import annotate_vcfs with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{%- macro deepvariant(sample, aligner='bwa', taskPrefix='Genome') %}
@@ -152,4 +153,7 @@
bcftools index --tbi --force "{{ pass_vcf }}"
{{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'deepvariant', 'constitutional', 'snp_indel_caller') }}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+
{% endmacro %}
diff --git a/modules/constitutional/expansion_hunter.jst b/modules/constitutional/expansion_hunter.jst
new file mode 100644
index 00000000..e3d29dc7
--- /dev/null
+++ b/modules/constitutional/expansion_hunter.jst
@@ -0,0 +1,74 @@
+
+{% macro expansion_hunter(sample, aligner='bwa') %}
+{% set bam %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %}
+{% set temp_dir %}temp/{{ sample.gltype }}/constitutional_variant_calls/expansion_hunter/{{ sample.name }}{% endset %}
+{% set results_dir %}{{ sample.gltype }}/constitutional_variant_calls/expansion_hunter/{{ sample.name }}{% endset %}
+
+- name: expansion_hunter_{{ sample.name }}_{{ aligner }}
+  tags: [{{ sample.gltype }}, repeats, expansion, STR, {{ sample.name }}]
+ input: {{ bam }}
+ output:
+ - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz
+ - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz.tbi
+ walltime: "24:00:00"
+ cpus: 1
+ mem: 2G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.expansion_hunter.module }}
+ module load {{ constants.tools.bcftools.module }}
+
+ rm -r {{ temp_dir }} || true
+ mkdir -p {{ temp_dir }}
+ mkdir -p {{ results_dir }}
+
+    {# The variant catalog is in the install path; we grab that path here #}
+    {# ExpansionHunter is in a /bin directory, so we go one level higher #}
+ eh_path=$(dirname $(dirname `which ExpansionHunter`))
+
+ ExpansionHunter \
+ --reads {{ bam }} \
+ --reference {{ constants.coyote.reference_fasta }} \
+ --variant-catalog ${eh_path}/variant_catalog/grch38/variant_catalog.json \
+ --output-prefix {{ temp_dir }}/{{ sample.name }}.expansion_hunter
+
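+    {# ExpansionHunter writes an uncompressed VCF; compress it to vcf.gz and index it for downstream use #}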
+ bcftools view \
+ --output-type z \
+ --output-file {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz \
+ {{ temp_dir }}/{{ sample.name }}.expansion_hunter.vcf
+
+ bcftools index --tbi --force {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz
+
+
+- name: expansion_hunter_filter_variants_{{ sample.name }}_{{ aligner }}
+  tags: [{{ sample.gltype }}, repeats, expansion, STR, {{ sample.name }}]
+ input:
+ - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz
+ - {{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz.tbi
+ output:
+ - {{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz
+ - {{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz.tbi
+ cpus: 1
+ mem: 2G
+ walltime: "12:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.bcftools.module }}
+
+ {# Then filter out the PASS variants to a separate file #}
+ bcftools filter \
+ --output-type z \
+ --include 'FILTER == "PASS"' \
+ {% if sample.gltype == 'exome' %}
+ --targets-file "{{ sample.capture_kit.extended_bed }}" \
+ {% endif %}
+ "{{ results_dir }}/{{ sample.name }}.expansion_hunter.all.vcf.gz" \
+ > "{{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz"
+
+ bcftools index --tbi --force "{{ results_dir }}/{{ sample.name }}.expansion_hunter.pass.vcf.gz"
+
+{% endmacro %}
diff --git a/modules/constitutional/freebayes.jst b/modules/constitutional/freebayes.jst
index 96000649..8f94044b 100755
--- a/modules/constitutional/freebayes.jst
+++ b/modules/constitutional/freebayes.jst
@@ -1,5 +1,6 @@
{% from 'modules/annotation/main.jst' import annotate_vcfs with context %}
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{%- macro freebayes(sample, aligner='bwa', taskPrefix='Genome') %}
@@ -109,4 +110,7 @@
{{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'freebayes', 'constitutional', 'snp_indel_caller') }}
-{% endmacro %}
\ No newline at end of file
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+
+{% endmacro %}
diff --git a/modules/constitutional/gatk_cnv.jst b/modules/constitutional/gatk_cnv.jst
new file mode 100644
index 00000000..de1b9fc8
--- /dev/null
+++ b/modules/constitutional/gatk_cnv.jst
@@ -0,0 +1,341 @@
+{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'modules/annotation/annot_seg.jst' import annot_seg with context %}
+# The macros found in this template are used to generate the gatk4 constitutional cnv workflow.
+
+{% macro gatk_cnv_constitutional(sample, normal, aligner='bwa') %}
+{% set normal_temp %}temp/tumor_only/control_data_files/{{ normal.assayCode }}{% endset %}
+{% set cnvIntervals %}{{ normal_temp }}/{{ normal.gatkCnvPonIntervals | basename }}{% endset %}
+{% set cnvPon %}{{ normal_temp }}/{{ normal.gatkCnvPon | basename }}{% endset %}
+{% set sample_bam %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %}
+{% set cnvSnps %}{{ normal_temp }}/{{ normal.gatkCnvSnps | basename }}{% endset %}
+{% set temp_dir %}temp/{{ sample.gltype }}/constitutional_copy_number/gatk/{{ sample.name }}_{{ aligner }}{% endset %}
+{% set results_dir %}{{ sample.gltype }}/constitutional_copy_number/gatk/{{ sample.name }}{% endset %}
+
+{% if wesAssayToPair is defined and normal.gatkExomeCnvSnps is defined %}
+ {% set exomeCnvSnps %}{{ normal_temp }}/{{ normal.gatkExomeCnvSnps | basename }}{% endset %}
+
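+  {# Look up the matching exome data file for this sample by assayCode and rgsm; fall back to an empty marker if none is found #}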
+ {% set exome = {} %}
+ {% do exome.update( (dataFiles|selectattr('assayCode', 'eq', wesAssayToPair)|selectattr('rgsm', 'eq', sample.rgsm)|first|default({'no': 'exome'}))) %}
+
+ {% if exome.sampleMergeKey is defined %}
+ {% set sample_exome_bam %}exome/alignment/{{ aligner }}/{{ exome.sampleMergeKey }}/{{ exome.sampleMergeKey }}.{{ aligner }}.bam{% endset %}
+ {% elif exome.sampleName is defined %}
+ {% set sample_exome_bam %}exome/alignment/{{ aligner }}/{{ exome.sampleName }}/{{ exome.sampleName }}.{{ aligner }}.bam{% endset %}
+ {% endif %}
+{% endif %}
+
+{# Step 1 #}
+- name: constitutional_gatk_call_cnv_step1_{{ sample.name }}_{{ aligner }}
+ tags: [{{ sample.gltype }}, constitutional, cna_caller, gatk, {{ sample.name }}]
+ reset: prepare_tumor_only_{{ normal.name }}
+ input:
+ - {{ sample_bam }}
+ - {{ cnvPon }}
+ - {{ cnvIntervals }}
+ {% if sample_exome_bam is defined %}
+ - {{ sample_exome_bam }}
+ - {{ exomeCnvSnps }}
+ {% else %}
+ - {{ cnvSnps }}
+ {% endif %}
+ output:
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.standardizedCR.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.normal.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelBegin.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelBegin.cr.param
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelBegin.af.param
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.cr.param
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.af.param
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.cr.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.cr.igv.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.af.igv.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.called.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.called.igv.seg
+ {% if sample_exome_bam is defined %}
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts_filt.tsv
+ {% endif %}
+ cpus: 1
+ mem: 16G
+ walltime: "24:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ {# Load modules #}
+ module load {{ constants.tools.gatk.module }}
+
+ {# Make working and temp directories #}
+ mkdir -p "{{ temp_dir }}/temp"
+ mkdir -p "{{ temp_dir }}/results/plots"
+ mkdir -p "{{ results_dir }}"
+
+    {# CollectReadCounts for the sample using the intervals provided in the config that match the PON #}
+ gatk --java-options "-Xmx12g" CollectReadCounts \
+ --tmp-dir {{ temp_dir }}/temp/ \
+ --input {{ sample_bam }} \
+ --intervals {{ cnvIntervals }} \
+ --interval-merging-rule OVERLAPPING_ONLY \
+ --read-filter FirstOfPairReadFilter \
+ --output {{ temp_dir }}/{{ sample.name }}.{{ aligner }}.counts.hdf5
+
+    {# CollectAllelicCounts for the sample using the snps provided in the config #}
+ gatk --java-options "-Xmx12g" CollectAllelicCounts \
+ --tmp-dir {{ temp_dir }}/temp/ \
+ {% if sample_exome_bam is defined %}
+ --input {{ sample_exome_bam }} \
+ --intervals {{ exomeCnvSnps }} \
+ {% else %}
+ --input {{ sample_bam }} \
+ --intervals {{ cnvSnps }} \
+ {% endif %}
+ --reference {{ constants.coyote.reference_fasta }} \
+ --output {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv
+
+ {% if sample_exome_bam is defined %}
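+    {# Keep high-depth sites (ref+alt depth > 99) with a near-balanced alt allele fraction (~0.46-0.50), i.e. likely heterozygous positions; header lines pass through #}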
+ awk -F'\t' '$1 ~ /^@/ { print $0 ; next } ;
+ $1 == "CONTIG" { print $0 ; next } ;
+ $3 != 0 && $4 != 0 {
+ DP = $3+$4 ;
+ RATIO = $4/DP ;
+ if ( DP > 99 && RATIO >= 0.4643519 && RATIO <= 0.5043519) { print $0 }
+ }' {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv > {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts_filt.tsv
+ {% endif %}
+
+    {# Denoise the sample read counts using the panel of normals provided in the configuration #}
+ gatk --java-options "-Xmx12g" DenoiseReadCounts \
+ --tmp-dir {{ temp_dir }}/temp/ \
+ --input {{ temp_dir }}/{{ sample.name }}.{{ aligner }}.counts.hdf5 \
+ --count-panel-of-normals {{ cnvPon }} \
+ --standardized-copy-ratios {{ results_dir }}/{{ sample.name }}.{{ aligner }}.standardizedCR.tsv \
+ --denoised-copy-ratios {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv
+
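+    {# ModelSegments accepts --window-size multiple times; each value adds a window scale for the kernel segmentation #}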
+ gatk --java-options "-Xmx12g" ModelSegments \
+ --tmp-dir {{ temp_dir }}/temp/ \
+ --number-of-smoothing-iterations-per-fit 1 \
+ --number-of-changepoints-penalty-factor 0.05 \
+ --kernel-variance-allele-fraction 0.0 \
+ --kernel-scaling-allele-fraction 0.0 \
+ --smoothing-credible-interval-threshold-copy-ratio 3.5 \
+ --smoothing-credible-interval-threshold-allele-fraction 3.5 \
+ --window-size 4 \
+ --window-size 8 \
+ --window-size 16 \
+ --window-size 32 \
+ --window-size 64 \
+ --window-size 128 \
+ --window-size 256 \
+ --denoised-copy-ratios {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv \
+ {% if sample_exome_bam is defined %}
+ --allelic-counts {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts_filt.tsv \
+ {% else %}
+ --allelic-counts {{ results_dir }}/{{ sample.name }}.{{ aligner }}.allelicCounts.tsv \
+ {% endif %}
+ --output {{ results_dir }} \
+ --output-prefix {{ sample.name }}.{{ aligner }}
+
+ gatk --java-options "-Xmx12g" CallCopyRatioSegments \
+ --tmp-dir {{ temp_dir }}/temp/ \
+ --input {{ results_dir }}/{{ sample.name }}.{{ aligner }}.cr.seg \
+ --output {{ results_dir }}/{{ sample.name }}.{{ aligner }}.called.seg
+
+{# Step 2 #}
+- name: constitutional_gatk_call_cnv_step2_{{ sample.name }}_{{ aligner }}
+ tags: [{{ sample.gltype }}, constitutional, cna_caller, gatk, {{ sample.name }}]
+ reset: constitutional_gatk_call_cnv_step1_{{ sample.name }}_{{ aligner }}
+ input:
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.tsv
+ output:
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.genderCorrected.tsv
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.re_centered.cr.igv.seg
+ - {{ results_dir }}/{{ sample.name }}.{{ aligner }}.dLRs.tsv
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}.hets.density.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_cna_withhets.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_cna.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_baf.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr1.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr2.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr3.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr4.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr5.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr6.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr7.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr8.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr9.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr10.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr11.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr12.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr13.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr14.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr15.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr16.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr17.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr18.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr19.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr20.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr21.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chr22.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chrX.png
+ - {{ results_dir }}/plots/{{ sample.name }}.{{ aligner }}_chrY.png
+ cpus: 1
+ mem: 16G
+ walltime: "24:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ {# Load modules #}
+ module load {{ constants.tools.R.module }}
+ module load {{ constants.tools.python_3_7_2.module }}
+
+ {# Make working and temp directories #}
+ mkdir -p "{{ temp_dir }}"
+
+    {# Get the sex result of the normal. If undetermined, use the sex from the config if known; otherwise default to female. #}
+ SEX="Female"
+
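+    {# chrY is not in the contig list passed to the plotting script, so touch an empty placeholder to satisfy the declared chrY plot output #}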
+ touch {{ temp_dir }}/results/plots/{{ sample.name }}.{{ aligner }}_chrY.png
+ cp {{ results_dir }}/{{ sample.name }}.{{ aligner }}.denoisedCR.tsv {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.denoisedCR.genderCorrected.tsv
+ cp {{ results_dir }}/{{ sample.name }}.{{ aligner }}.modelFinal.seg {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg
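+    {# Contig names and lengths (GRCh38 chr1-chrX) joined with the literal CONTIG_DELIMITER token for the plotting script below #}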
+ contig_names_string="chr1CONTIG_DELIMITERchr2CONTIG_DELIMITERchr3CONTIG_DELIMITERchr4CONTIG_DELIMITERchr5CONTIG_DELIMITERchr6CONTIG_DELIMITERchr7CONTIG_DELIMITERchr8CONTIG_DELIMITERchr9CONTIG_DELIMITERchr10CONTIG_DELIMITERchr11CONTIG_DELIMITERchr12CONTIG_DELIMITERchr13CONTIG_DELIMITERchr14CONTIG_DELIMITERchr15CONTIG_DELIMITERchr16CONTIG_DELIMITERchr17CONTIG_DELIMITERchr18CONTIG_DELIMITERchr19CONTIG_DELIMITERchr20CONTIG_DELIMITERchr21CONTIG_DELIMITERchr22CONTIG_DELIMITERchrX"
+ contig_lengths_string="248956422CONTIG_DELIMITER242193529CONTIG_DELIMITER198295559CONTIG_DELIMITER190214555CONTIG_DELIMITER181538259CONTIG_DELIMITER170805979CONTIG_DELIMITER159345973CONTIG_DELIMITER145138636CONTIG_DELIMITER138394717CONTIG_DELIMITER133797422CONTIG_DELIMITER135086622CONTIG_DELIMITER133275309CONTIG_DELIMITER114364328CONTIG_DELIMITER107043718CONTIG_DELIMITER101991189CONTIG_DELIMITER90338345CONTIG_DELIMITER83257441CONTIG_DELIMITER80373285CONTIG_DELIMITER58617616CONTIG_DELIMITER64444167CONTIG_DELIMITER46709983CONTIG_DELIMITER50818468CONTIG_DELIMITER156040895"
+
+
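+    {# Record the first segment start and last segment stop for each chromosome in the gender-corrected seg file #}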
+ for CHR in {1..24}
+ do
+ if [[ ${CHR} -eq "23" ]]
+ then
+ CHR="chrX"
+ elif [[ ${CHR} -eq "24" ]]
+ then
+ CHR="chrY"
+ else
+ CHR="chr${CHR}"
+ fi
+
+ declare START_${CHR}=$(gawk -F'\t' -v CHROM=${CHR} '$1 == CHROM' {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg | sort -k2,2n | awk -F'\t' 'NR == 1 { print $2 }')
+ export START_${CHR}
+ declare STOP_${CHR}=$(gawk -F'\t' -v CHROM=${CHR} '$1 == CHROM' {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg | sort -k3,3nr | awk -F'\t' 'NR == 1 { print $3 }')
+ export STOP_${CHR}
+ done
+
+ for CHR in {1..24}
+ do
+ # Get centromere start and stop
+ if [[ $CHR -eq "23" ]]
+ then
+ CHR="chrX"
+ elif [[ $CHR -eq "24" ]]
+ then
+ CHR="chrY"
+ else
+ CHR="chr${CHR}"
+ fi
+
+ eval "START=\${START_${CHR}}"
+ eval "STOP=\${STOP_${CHR}}"
+
+ START_C=$(gawk -v CHR=$CHR '$1==CHR { print $2 }' {{ constants.coyote.centromere }})
+ STOP_C=$(gawk -v CHR=$CHR '$1==CHR { print $3 }' {{ constants.coyote.centromere }})
+
+ if [[ ${START} -ge ${START_C} ]]
+ then
+ START=${STOP_C}
+ fi
+
+ export CHR
+ export START
+ export START_C
+ export STOP
+ export STOP_C
+
+ echo -e "${CHR}\t${START}\t${STOP}"
+ echo -e "${CHR}\t${START_C}\t${STOP_C}"
+
+ LINES=$(wc -l < "{{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg")
+ HEADER=$(grep -c "@" "{{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg" || :)
+
+ gawk -F'\t' -v HEADER="${HEADER}" -v CHROM="${CHR}" -v START="${START}" -v STOP="${STOP}" -v LINES="${LINES}" -v STARTC="${START_C}" -v STOPC="${STOP_C}" 'BEGIN { INC=1 } ;
+ # Skip GATK header
+ $0 ~ /^@/ { next } ;
+ # Print the header
+ NR == HEADER + 1 { print $0 ; next } ;
+ # Remove segments that fall within the centromere
+ $2 == CHROM && $3 >= STARTC && $4 <= STOPC { next } ;
+ # Store the second line in the seg file and set the INC to 2 if we are working on the chromosome
+ NR == HEADER + 2 && $1 == CHROM { C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=2 ; next } ;
+ NR == HEADER + 2 && $1 != CHROM { C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ;
+ ## Last line of seg file
+ NR == LINES && $1 == CHROM && INC == 1 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,START,STOP,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ;
+ NR == LINES && $1 == CHROM && INC == 2 { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,STOP,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ;
+ NR == LINES && $1 == CHROM && INC == 3 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,STOP,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ;
+ NR == LINES && $1 != CHROM && INC == 2 { OFS = "\t" ; print C1,START,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ;
+ NR == LINES && $1 != CHROM && INC == 3 { OFS = "\t" ; print C1,C2,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ;
+ NR == LINES && $1 != CHROM { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11 ; next } ;
+ ### Segment is the current chromosome we are working on
+ ## First segment in the CHROM, print previous segment variables
+ NR != LINES && $1 == CHROM && INC == 1 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=2 ; next } ;
+ ## Second segment for CHROM, Tests and print is for the First segment
+ # If the previous segments start is >= the centromere stop
+ NR != LINES && $1 == CHROM && INC == 2 && C2 >= STOPC { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ;
+ NR != LINES && $1 == CHROM && INC == 2 && C2 < STARTC && C3 < STOPC && $2 > STARTC && $3 > STOPC { OFS = "\t" ; print C1,START,STARTC,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=STOPC ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ;
+ NR != LINES && $1 == CHROM && INC == 2 && C2 < STARTC && C3 < STOPC && $2 < STARTC { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ;
+ NR != LINES && $1 == CHROM && INC == 2 && C3 > STOPC { OFS = "\t" ; print C1,START,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=3 ; next } ;
+ # Third segment for CHROM
+ NR != LINES && $1 == CHROM && INC == 3 && C2 < STARTC && C3 < STOPC && $2 > STARTC && $3 > STOPC { OFS = "\t" ; print C1,C2,STARTC,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=STOPC ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ;
+ NR != LINES && $1 == CHROM && INC == 3 && C2 < STARTC && C3 < STOPC && $2 < STARTC { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ;
+ NR != LINES && $1 == CHROM && INC == 3 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next } ;
+ ## Segment is on a Chromosome that is NOT our current CHROM
+ NR != LINES && $1 != CHROM && INC == 2 { OFS = "\t" ; print C1,START,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=1 ; next } ;
+ NR != LINES && $1 != CHROM && INC == 3 { OFS = "\t" ; print C1,C2,STOP,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; INC=1 ; next } ;
+ NR != LINES && $1 != CHROM && INC == 1 { OFS = "\t" ; print C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11 ; C1=$1 ; C2=$2 ; C3=$3 ; C4=$4 ; C5=$5 ; C6=$6 ; C7=$7 ; C8=$8 ; C9=$9 ; C10=$10 ; C11=$11 ; next }' {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg > {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg.temp
+
+ mv {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg.temp {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg
+ done
+
+ python ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.cna_seg_extend }} {{ constants.coyote.centromere }} {{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg
+
+ Rscript ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.plotCNVplus_Rscript }} \
+ --point_size 2 \
+ --hetDPfilter 1 \
+ --hetAFlow 0.45 \
+ --hetAFhigh 0.55 \
+ --hetMAFposteriorOffset 0.01 \
+ --lowerCNvalidatePeakOffset 0.125 \
+ --UpperCNvalidatePeakOffset 0.125 \
+ --lowerCNcenteringPeakOffset 0.125 \
+ --UpperCNcenteringPeakOffset 0.125 \
+ --sample_name={{ sample.name }} \
+ --output_prefix={{ sample.name }}.{{ aligner }} \
+ --plots_directory={{ temp_dir }}/results/plots \
+ --re_center_CNA=TRUE \
+ --re_centered_seg_directory={{ temp_dir }}/results \
+ --contig_names_string=${contig_names_string} \
+ --contig_lengths_string=${contig_lengths_string} \
+ --denoised_copy_ratios_file={{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.denoisedCR.genderCorrected.tsv \
+ --allelic_counts_file={{ results_dir }}/{{ sample.name }}.{{ aligner }}.hets.tsv \
+ --modeled_segments_file={{ temp_dir }}/results/{{ sample.name }}.{{ aligner }}.modelFinal.genderCorrected.seg
+
+ {# If the task has already been run then we need to remove the plots directory. #}
+ if [ -d {{ results_dir }}/plots ]
+ then
+ rm -rf {{ results_dir }}/plots
+ fi
+
+ mv {{ temp_dir }}/results/* {{ results_dir }}/
+
+ {% set final_seg %}{{ results_dir }}/{{ sample.name }}.{{ aligner }}.re_centered.cr.igv.seg{% endset %}
+ {{- annot_seg(final_seg, sample.name) }}
+
+ {% set task %}constitutional_gatk_call_cnv_step2_{{ sample.name }}_{{ aligner }}{% endset %}
+ {% set directory %}{{ temp_dir }}{% endset %}
+ {{- remove_files(directory,none,task) }}
+
+{% endmacro %}
diff --git a/modules/constitutional/gatk_genotypegvcf.jst b/modules/constitutional/gatk_genotypegvcf.jst
index ed37f949..dcd6aa41 100755
--- a/modules/constitutional/gatk_genotypegvcf.jst
+++ b/modules/constitutional/gatk_genotypegvcf.jst
@@ -1,5 +1,6 @@
{% from 'modules/annotation/main.jst' import annotate_vcfs with context %}
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{%- macro genotypegvcf(sample, aligner='bwa', taskPrefix='Genome') %}
@@ -136,12 +137,12 @@
--resource "{{ resource }}" \
{% endfor %}
--variant "{{ all_vcf }}" \
- --output "{{ temp_dir }}/{{ sample.name }}.hc.fvt.vcf.gz" \
-
+ --output "{{ temp_dir }}/{{ sample.name }}.hc.fvt.vcf.gz"
+
{# Then filter out the passing variants to a separate file #}
bcftools filter \
--output-type z \
- --include 'FILTER == "."' \
+ --include 'FILTER == "PASS"' \
{% if sample.gltype == 'exome' %}
--targets-file "{{ sample.capture_kit.extended_bed }}" \
{% endif %}
@@ -152,5 +153,7 @@
{{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'hc', 'constitutional', 'genotype_hc_gvcf') }}
-{% endmacro %}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+{% endmacro %}
diff --git a/modules/constitutional/main.jst b/modules/constitutional/main.jst
index e7917979..d17f5c05 100755
--- a/modules/constitutional/main.jst
+++ b/modules/constitutional/main.jst
@@ -7,12 +7,29 @@
{% from 'modules/constitutional/deepvariant.jst' import deepvariant with context %}
{% from 'modules/constitutional/octopus.jst' import octopus_constitutional with context %}
{% from 'modules/constitutional/ichorcna.jst' import ichorcna with context %}
+{% from 'modules/constitutional/gatk_cnv.jst' import gatk_cnv_constitutional with context %}
+{% from 'modules/constitutional/expansion_hunter.jst' import expansion_hunter with context %}
# Constitutional tools generally operate on a single DNA samples and should not run
-# on "tumor" samples with the exception of ichor that cna be run on tumor or constitutional.
+# on "tumor" samples with the exception of ichor that can be run on tumor or constitutional.
{%- macro constitutional_variant_calling(samples) %}
+{% set normSamples = {} %}
+
+{% if controlDataFiles is defined %}
+{% for file in controlDataFiles %}
+
+ {% set name %}{{ study }}_{{ file.assayCode }}{% endset %}
+ {% do file.update({'name': name}) %}
+
+ {% if name not in normSamples %}
+ {% do normSamples.update({name: {}}) %}
+ {% do normSamples[name].update(file) %}
+ {% endif %}
+{% endfor %}
+{% endif %}
+
{%- for sample in samples.values() if sample.gltype in ('exome', 'genome') %}
{% if sample.gltype in 'exome' %}
@@ -22,6 +39,11 @@
{% if tasks.Genome_constitutional_cna_caller_ichor|default(true) %}
{{- ichorcna(sample, aligner='bwa', taskPrefix=taskPrefix) }}
{% endif %}
+ {# MODULE_DISABLED: Human specific - variant catalog does not exist for canine
+ {% if tasks[taskPrefix+"_constitutional_structural_caller_expansion_hunter"]|default(true) %}
+ {{- expansion_hunter(sample, aligner='bwa') }}
+ {% endif %}
+ #}
{% endif %}
{%- if sample.subGroup|lower != 'tumor' %}
@@ -49,6 +71,11 @@
{%- if tasks[taskPrefix+"_constitutional_snp_indel_caller_octopus"]|default(false) %}
{{- octopus_constitutional(sample, aligner='bwa', taskPrefix=taskPrefix) }}
{% endif %}
+ {%- if tasks[taskPrefix+"_constitutional_cna_caller_gatk"]|default(true) %}
+ {% for normal in normSamples.values() if normal.assayCode == sample.assayCode and normal.gatkCnvPon is defined %}
+ {{- gatk_cnv_constitutional(sample, normal, aligner='bwa') }}
+ {% endfor %}
+ {% endif %}
{% endif %}
{% endfor %}
diff --git a/modules/constitutional/octopus.jst b/modules/constitutional/octopus.jst
index 2ab06148..c3c0b12f 100644
--- a/modules/constitutional/octopus.jst
+++ b/modules/constitutional/octopus.jst
@@ -1,5 +1,6 @@
{% from 'modules/annotation/main.jst' import annotate_vcfs with context %}
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{%- macro octopus_constitutional(sample, aligner='bwa', taskPrefix='Genome') %}
{%- set bam %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %}
@@ -127,4 +128,7 @@
{{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'octopus', 'constitutional', 'snp_indel_caller') }}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+
{% endmacro -%}
diff --git a/modules/constitutional/strelka2.jst b/modules/constitutional/strelka2.jst
index d8dc33a8..b7317ada 100755
--- a/modules/constitutional/strelka2.jst
+++ b/modules/constitutional/strelka2.jst
@@ -1,5 +1,6 @@
{% from 'modules/annotation/main.jst' import annotate_vcfs with context %}
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{%- macro strelka2_constitutional(sample, aligner='bwa', taskPrefix='Genome') %}
@@ -138,4 +139,7 @@
{{- annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'strelka2', 'constitutional', 'snp_indel_caller' ) }}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+
{% endmacro %}
diff --git a/modules/dna_alignment/bowtie2_samtools.jst b/modules/dna_alignment/bowtie2_samtools.jst
index 035c7ada..27c38f85 100644
--- a/modules/dna_alignment/bowtie2_samtools.jst
+++ b/modules/dna_alignment/bowtie2_samtools.jst
@@ -35,6 +35,14 @@
set -eu
set -o pipefail
+ {% if fastq.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+    This comment is here to protect render spacing, do not remove.
+ #}
rm -r "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" || true
mkdir -p "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/"
@@ -249,8 +257,8 @@
{# Extracting ecoli alignments #}
samtools view \
- -b
- -o "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli.bam"
+ -b \
+ -o "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli.bam" \
"{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2.bam" \
"U00096.3"
@@ -265,7 +273,11 @@
{# Calculating spike and scale for genomecov #}
SPIKE=$(wc -l < "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli.bed")
- SCALE=$(echo "10000 / $SPIKE" | bc -l)
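+    {# Guard against division by zero when no E. coli spike-in reads are present #}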
+ if [ $SPIKE -gt 0 ]; then
+ SCALE=$(echo "10000 / $SPIKE" | bc -l)
+ else
+ SCALE=$(echo "10000")
+ fi
TAB=$'\t'
cat < "temp/{{ sample.gltype }}/alignment/bowtie2/callRegions.bed"
@@ -277,11 +289,10 @@
bedtools genomecov \
-bg \
-scale $SCALE \
- -i "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2.bed" \
- -g "temp/{{ sample.gltype }}/alignment/bowtie2/callRegions.bed" \
+ -i "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.no_decoy.bowtie2.bed" \
+ -g "temp/{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/callRegions.bed" \
> "{{ sample.gltype }}/alignment/bowtie2/{{ sample.name }}/{{ sample.name }}.bowtie2_ecoli_norm.bedgraph"
{% endif %}
{% endmacro %}
-
diff --git a/modules/dna_alignment/bwa_mem2_samtools.jst b/modules/dna_alignment/bwa_mem2_samtools.jst
new file mode 100644
index 00000000..7bedaa8f
--- /dev/null
+++ b/modules/dna_alignment/bwa_mem2_samtools.jst
@@ -0,0 +1,312 @@
+# Aligns fastqs for a sample using BWA MEM2. Samples may have multiple read
+# groups.
+#
+#   fastqs ---> temp/.bwa.bam
+#
+
+# This alignment command prefix is shared by all modules using bwa
+{% from 'utilities/read_group_line.jst' import read_group_line with context %}
+{% from 'utilities/remove_files.jst' import remove_files with context %}
+
+# This macro splits large fastqs into chunks prior to aligning.
+# If a fastq has fewer than reads_per_chunk (48000000) reads, a single chunk is made.
+{% macro bwa_mem2_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) %}
+
+{% set temp_dir %}temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}{% endset %}
+{% set results_dir %}{{ sample.gltype }}/alignment/bwa/{{ sample.name }}{% endset %}
+
+{% for rgid, rg in sample.read_groups.items() %}
+{% set r1fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R1')|first %}
+{% set r2fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R2')|first %}
+
+{# Chunk math: each read is 4 FASTQ lines; numberOfReads is assumed to count both mates (legacy Illumina interpretation, a TGen modification), so divide by 2 for reads per FASTQ file #}
+{% set n_lines = (reads_per_chunk * 4)|int %}
+{% set n_chunks = (r1fastq.numberOfReads / 2 / reads_per_chunk)|round(0, method='ceil')|int %}
+{% if n_chunks > 99 %}{{ raise('ValueError', 'Too many chunks!') }}{% endif %}
+
+{% for fastq in [r1fastq, r2fastq] %}
+- name: split_fastq_{{ fastq.basename | replace(".", "_") }}
+ tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, split, {{ sample.name }}]
+ reset: predecessors
+ input:
+ - temp/fastqs/{{ fastq.basename }}
+ cpus: 1
+ walltime: "4:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ {% if fastq.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+    This comment is here to protect render spacing, do not remove.
+ #}
+ rm -r "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/" || true
+ mkdir -p "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/"
+
+ zcat "temp/fastqs/{{ fastq.basename }}" |\
+ split \
+ --numeric-suffixes \
+ --suffix-length 2 \
+ --lines {{ n_lines }} \
+ - \
+ "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/"
+
+ N_CREATED=$(ls "{{ temp_dir }}/{{ rgid }}/{{ fastq.basename }}/" | wc -l)
+
+ if [[ ${N_CREATED} -ne {{ n_chunks }} ]]
+ then
+ echo "Chunks created does not match expected value"
+ exit 1
+ fi
+
+{% endfor %}
+
+{% for i in range(n_chunks) %}
+{% set chunk_suffix = '%02d' % i %}
+- name: chunked_bwa_mem2_samtools_fixmate_{{ sample.name }}_{{ rgid }}_{{ chunk_suffix }}
+ tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}]
+ reset: predecessors
+ after:
+ - split_fastq_{{ r1fastq.basename | replace(".", "_") }}
+ - split_fastq_{{ r2fastq.basename | replace(".", "_") }}
+ output: {{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam
+ walltime: "48:00:00"
+ cpus: 10
+ mem: 32G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.bwa_mem2.module }}
+ module load {{ constants.tools.samtools.module }}
+
+ {#
+ # If this task was interrupted previously, temp files may exist
+ # that will cause errors with samtools sort. Here, we purge any
+ # existing temp files before making the directory again.
+ #}
+ rm -r "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}_st_sort_temp/" || true
+ mkdir -p "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}_st_sort_temp/"
+
+ {# No long options available for the following:
+ bwa mem
+ -v INT Verbosity: 3 = message (default = 3)
+ -Y Use soft clipping for supplementary alignments
+ -K INT Process INT input bases in each batch regardless of nThreads (for reproducibility)
+ -t INT Number of threads to use
+ -R STR Read group header line such as '@RG\tID:foo\tSM:bar' [null]
+
+ samtools fixmate
+ -m Add mate score tag, REQUIRED for samtools markdup
+ - Input from stdin
+ - Output to stdout
+
+ samtools sort
+ -l INT Set compression level, from 0 (uncompressed) to 9 (best)
+ -m INT Set maximum memory per thread; suffix K/M/G recognized [768M]
+ -T PREFIX Write temporary files to PREFIX.nnnn.bam
+ - Input from stdin
+ -o FILE Write final output to FILE rather than standard output
+ #}
+ bwa-mem2 mem \
+ -v 3 \
+ -Y \
+ -K 100000000 \
+ -t 9 \
+ -R "{{ read_group_line(rg, format='bwa') }}" \
+ "{{ constants.coyote.bwa_mem2_index }}" \
+ "{{ temp_dir }}/{{ rgid }}/{{ r1fastq.basename }}/{{ chunk_suffix }}" \
+ "{{ temp_dir }}/{{ rgid }}/{{ r2fastq.basename }}/{{ chunk_suffix }}" |\
+ samtools fixmate \
+ --threads 1 \
+ -m \
+ - \
+ - |\
+ samtools sort \
+ -T "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}_st_sort_temp/{{ chunk_suffix }}" \
+ -l 2 \
+ -m 768M \
+ --threads 9 \
+ --output-fmt BAM \
+ -o "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam" \
+ -
+
+{% endfor %}
+
+- name: chunked_samtools_merge_rgid_bams_{{ sample.name }}_{{ rgid }}
+ tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}]
+ reset: predecessors
+ input:
+ {% for i in range(n_chunks) %}
+ {% set chunk_suffix = '%02d' % i %}
+ - {{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam
+ {% endfor %}
+ output: {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam
+ walltime: "24:00:00"
+ cpus: 8
+ mem: 8G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.samtools.module }}
+
+ {# No long options available for the following:
+ -c Combine @RG headers with colliding IDs [alter IDs to be distinct]
+ -f Overwrite the output BAM if exist
+ -l INT Compression level, from 0 to 9 [-1]
+ #}
+ samtools merge \
+ --threads 8 \
+ -c \
+ -f \
+ -l 6 \
+ "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam" \
+ {% for i in range(n_chunks) %}
+ {% set chunk_suffix = '%02d' % i %}
+ {% if not loop.last %}
+ "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam" \
+ {% else %}
+ "{{ temp_dir }}/{{ rgid }}/{{ chunk_suffix }}.bwa.bam"
+ {% endif %}
+ {% endfor %}
+
+ {# Cleanup the tempfiles
+ # removes fastq chunks and intermediate chunk bam files
+ #}
+ {% if not debug %}
+ rm -r "{{ temp_dir }}/{{ rgid }}/"
+ {% endif %}
+
+
+- name: samtools_markdup_{{ sample.name }}_{{ rgid }}_bwa
+ tags: [{{ sample.gltype }}, alignment, mark_duplicates, samtools, {{ sample.name }}]
+ reset: predecessors
+ methods: Duplicate reads for {{ sample.name }} were marked with
+ {{ constants.tools.samtools.verbose }} markdup.
+ input: {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam
+ output:
+ - {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam
+ - {{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt
+ walltime: "48:00:00"
+ cpus: 4
+ mem: 16G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.samtools.module }}
+
+ {#
+ # If this task was interrupted previously, temp files may exist
+ # that will cause errors with samtools markdup. Here, we purge any
+ # existing temp files before making the directory again.
+ #}
+ rm -r "{{ temp_dir }}/markdup_temp/{{ sample.name }}_{{ rgid }}" || true
+ rm "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt" || true
+ mkdir -p "{{ temp_dir }}/markdup_temp/{{ sample.name }}_{{ rgid }}"
+ mkdir -p "{{ temp_dir }}/stats/"
+
+ {# No long options available for the following:
+ -d Optical distance
+      -S Mark supplementary alignments of duplicates as duplicates (slower)
+ -f Write stats to named file. Implies -s (report stats)
+ -T PREFIX Write temporary files to PREFIX.samtools.nnnn.nnnn.tmp
+ 2> Stats are output to stderr which is redirected to ".bwa.bam.markdup.txt"
+ #}
+ samtools markdup \
+ -d {{ opt_dup_distance }} \
+ -S \
+ -f "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt" \
+ --threads 4 \
+ --write-index \
+ -T "{{ temp_dir }}/markdup_temp/{{ sample.name }}_{{ rgid }}" \
+ "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.bam" \
+ "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam"
+
+{% endfor %}
+
+- name: samtools_markdup_merge_rg_bams_{{ sample.name }}
+ tags: [{{ sample.gltype }}, alignment, mark_duplicates, samtools, {{ sample.name }}]
+ reset: predecessors
+ methods: Duplicate reads for {{ sample.name }} were marked with
+ {{ constants.tools.samtools.verbose }} markdup.
+ input:
+ {% for rgid in sample.read_groups %}
+ - {{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam
+    - {{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt
+ {% endfor %}
+ output:
+ - {{ temp_dir }}/{{ sample.name }}.bwa.md.bam
+ - {{ results_dir }}/stats/{{ sample.name }}.bwa.bam.samtools.markdup.txt
+ walltime: "48:00:00"
+ cpus: 8
+ mem: 16G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.samtools.module }}
+
+ mkdir -p "{{ results_dir }}/stats/"
+
+ {# No long options available for the following:
+ -c Combine @RG headers with colliding IDs [alter IDs to be distinct]
+ -f Overwrite the output BAM if exist
+ -l INT Compression level, from 0 to 9 [-1]
+ #}
+ samtools merge \
+ --threads 8 \
+ -c \
+ -f \
+ -l 6 \
+ "{{ temp_dir }}/{{ sample.name }}.bwa.md.bam" \
+ {% for rgid in sample.read_groups %}
+ {% if not loop.last %}
+ "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam" \
+ {% else %}
+ "{{ temp_dir }}/{{ sample.name }}_{{ rgid }}.bwa.md.bam"
+ {% endif %}
+ {% endfor %}
+
+ {# Merging markdup stat files #}
+ {% set first_sample_rgid = (sample.read_groups.values()|first).rgid|default('') %}
+
+ {#
+    Paste the per-read-group stats files side by side, strip the labels down to
+    the integer values, then let bc sum each line across read groups.
+ #}
+ paste \
+ {% for rgid in sample.read_groups %}
+ "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt"
+ {%- endfor %}
+ | head -n -1 | tail -n +2 | sed 's/\t[a-zA-Z :_]\+/+/g' | awk '{ print $NF }' | bc \
+ > {{ temp_dir }}/stats/{{ sample.name }}.bwa.bam.samtools_markdup_merge.txt
+
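+    {# The final stats line (estimated library size) is not additive across read groups, so keep the maximum rather than a sum #}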
+ paste \
+ {% for rgid in sample.read_groups %}
+ "{{ temp_dir }}/stats/{{ sample.name }}_{{ rgid }}.bwa.bam.samtools.markdup.txt"
+ {%- endfor %}
+ | tail -n1 | sed 's/\t[a-zA-Z :_]\+/\ /g' | cut -d' ' -f2- |\
+ awk '{m=$1;for(i=1;i<=NF;i++)if($i>m)m=$i;print m}' \
+ >> {{ temp_dir }}/stats/{{ sample.name }}.bwa.bam.samtools_markdup_merge.txt
+
+
+ {# Combining the summed values with their naming #}
+ tail -n +2 {{ temp_dir }}/stats/{{ sample.name }}_{{ first_sample_rgid }}.bwa.bam.samtools.markdup.txt |\
+ cut -d':' -f1 |\
+ paste - {{ temp_dir }}/stats/{{ sample.name }}.bwa.bam.samtools_markdup_merge.txt |\
+ sed 's/\t/: /g' \
+ > {{ results_dir }}/stats/{{ sample.name }}.bwa.bam.samtools.markdup.txt
+
+ {# insert COMMAND: into the top of the merged file #}
+ header=$(head -n 1 {{ temp_dir }}/stats/{{ sample.name }}_{{ first_sample_rgid }}.bwa.bam.samtools.markdup.txt)
+ sed -i "1i ${header}" {{ results_dir }}/stats/{{ sample.name }}.bwa.bam.samtools.markdup.txt
+
+{% endmacro %}
diff --git a/modules/dna_alignment/bwa_mem_gatk_picard.jst b/modules/dna_alignment/bwa_mem_gatk_picard.jst
index 79ff436f..d6debd50 100755
--- a/modules/dna_alignment/bwa_mem_gatk_picard.jst
+++ b/modules/dna_alignment/bwa_mem_gatk_picard.jst
@@ -34,6 +34,14 @@
set -eu
set -o pipefail
+ {% if fastq.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+    This comment is here to protect render spacing, do not remove.
+ #}
rm -r "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" || true
mkdir -p "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/"
@@ -60,7 +68,7 @@
- name: chunked_bwa_mem_samtools_view_{{ sample.name }}_{{ rgid }}_{{ chunk_suffix }}
tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}]
reset: predecessors
- after:
+ after:
- split_fastq_{{ r1fastq.basename | replace(".", "_") }}
- split_fastq_{{ r2fastq.basename | replace(".", "_") }}
walltime: "48:00:00"
@@ -163,7 +171,7 @@
--MAX_RECORDS_IN_RAM 300000 \
--INPUT "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md-uns.bam" \
--OUTPUT "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md.bam"
-
+
{# Cleanup the tempfiles #}
{% if not debug %}
rm "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md-uns.bam"
diff --git a/modules/dna_alignment/bwa_mem_samtools.jst b/modules/dna_alignment/bwa_mem_samtools.jst
index 47356b1f..00f3cf44 100755
--- a/modules/dna_alignment/bwa_mem_samtools.jst
+++ b/modules/dna_alignment/bwa_mem_samtools.jst
@@ -34,6 +34,14 @@
set -eu
set -o pipefail
+ {% if fastq.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+    This comment is here to protect render spacing, do not remove.
+ #}
rm -r "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/" || true
mkdir -p "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ rgid }}/{{ fastq.basename }}/"
@@ -60,7 +68,7 @@
- name: chunked_bwa_mem_samtools_fixmate_{{ sample.name }}_{{ rgid }}_{{ chunk_suffix }}
tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}]
reset: predecessors
- after:
+ after:
- split_fastq_{{ r1fastq.basename | replace(".", "_") }}
- split_fastq_{{ r2fastq.basename | replace(".", "_") }}
walltime: "48:00:00"
@@ -219,4 +227,3 @@
"temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.md.bam"
{% endmacro %}
-
diff --git a/modules/dna_alignment/main.jst b/modules/dna_alignment/main.jst
index 8f07b290..faa8a722 100755
--- a/modules/dna_alignment/main.jst
+++ b/modules/dna_alignment/main.jst
@@ -1,6 +1,8 @@
# DNA Alignment with BWA MEM
# Take DNA fastq data files to aligned bams with qc data
{% from 'modules/dna_alignment/bwa_mem_samtools.jst' import bwa_mem_samtools_chunked with context %}
+{% from 'modules/dna_alignment/bwa_mem2_samtools.jst' import bwa_mem2_samtools_chunked with context %}
+{% from 'modules/dna_alignment/pb_fq2bam.jst' import fq2bam with context %}
{% from 'modules/dna_alignment/bwa_mem_gatk_picard.jst' import bwa_mem_gatk_picard_chunked with context %}
{% from 'modules/dna_alignment/bwa_mem_samblaster_sambamba.jst' import bwa_mem_blaster_bamba with context %}
{% from 'modules/dna_alignment/gatk_baserecalibration.jst' import baserecalibration, nobaserecalibration with context %}
@@ -23,14 +25,19 @@
{% set opt_dup_distance = 100 %}
{% endif %}
- {% if alignment_style == 'tgen' %}
- {{- bwa_mem_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) }}
- {% elif alignment_style == 'broad' %}
- {{- bwa_mem_gatk_picard_chunked(sample, reads_per_chunk, opt_dup_distance) }}
- {% elif alignment_style == 'ashion' %}
- {{- bwa_mem_blaster_bamba(sample) }}
+  {# PCR-free libraries have no PCR cycles, so pcrCycles == 0 routes them to bwa-mem2 #}
+ {% if sample.pcrCycles is defined and sample.pcrCycles == 0 %}
+ {{- bwa_mem2_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) }}
{% else %}
- {{ raise('Unknown dnaAlignmentStyle: {}'.format(dnaAlignmentStyle)) }}
+ {% if alignment_style == 'tgen' %}
+ {{- bwa_mem_samtools_chunked(sample, reads_per_chunk, opt_dup_distance) }}
+ {% elif alignment_style == 'broad' %}
+ {{- bwa_mem_gatk_picard_chunked(sample, reads_per_chunk, opt_dup_distance) }}
+ {% elif alignment_style == 'ashion' %}
+ {{- bwa_mem_blaster_bamba(sample) }}
+ {% else %}
+ {{ raise('Unknown dnaAlignmentStyle: {}'.format(dnaAlignmentStyle)) }}
+ {% endif %}
{% endif %}
{% if sample.gltype == 'genome' %}
diff --git a/modules/dna_alignment/pb_fq2bam.jst b/modules/dna_alignment/pb_fq2bam.jst
new file mode 100644
index 00000000..7997a6de
--- /dev/null
+++ b/modules/dna_alignment/pb_fq2bam.jst
@@ -0,0 +1,49 @@
+# Aligns fastqs from a sample using Parabricks fq2bam. This aligns all of the
+# sample's fastqs in one large run, so there is no need to split or merge. It
+# will also run duplicate marking and BQSR if needed.
+
+# This alignment command prefix is shared by all modules using bwa
+{% from 'utilities/read_group_line.jst' import read_group_line with context %}
+{% from 'utilities/remove_files.jst' import remove_files with context %}
+
+{% macro fq2bam(sample, opt_dup_distance) %}
+ {% set bam %}{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.bam{% endset %}
+
+- name: fq2bam_{{sample.name}}
+ tags: [{{ sample.gltype }}, alignment, dna_alignment, bwa, {{ sample.name }}]
+ reset: predecessors
+ input:
+ {% for rgid, rg in sample.read_groups.items() %}
+ {% set r1fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R1')|first %}
+ {% set r2fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R2')|first %}
+ - temp/fastqs/{{ r1fastq.basename }}
+ - temp/fastqs/{{ r2fastq.basename }}
+ {% endfor %}
+  output: {{ bam }}
+ walltime: "24:00:00"
+ sbatch_args: ['-p', 'gpu', '--exclusive']
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ rm -r "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/markdup_temp/" || true
+ mkdir -p "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/markdup_temp/"
+ mkdir -p "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}"
+ mkdir -p "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/stats/"
+
+ module load {{ constants.tools.parabricks.module }}
+
+ pbrun fq2bam \
+ --tmp-dir "temp/{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/markdup_temp/" \
+ --ref "{{ constants.coyote.bwa_index }}" \
+ --bwa-options "-Y -K 100000000" \
+ {% for rgid, rg in sample.read_groups.items() %}
+ {% set r1fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R1')|first %}
+ {% set r2fastq = rg.data_files|selectattr('fastqCode', 'eq', 'R2')|first %}
+ --in-fq "temp/fastqs/{{ r1fastq.basename }}" "temp/fastqs/{{ r2fastq.basename }}" "{{ read_group_line(rg, format='bwa') }}" \
+ {% endfor %}
+ --out-duplicate-metrics "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/stats/{{ sample.name }}.mdmetrics.txt" \
+ --optical-duplicate-pixel-distance {{ opt_dup_distance }} \
+ --out-bam "{{ sample.gltype }}/alignment/bwa/{{ sample.name }}/{{ sample.name }}.bwa.bam"
+
+{% endmacro %}
diff --git a/modules/metrics/main.jst b/modules/metrics/main.jst
index d86cc1ae..b0726d05 100644
--- a/modules/metrics/main.jst
+++ b/modules/metrics/main.jst
@@ -1,5 +1,7 @@
{%- from 'modules/metrics/mutation_burden.jst' import mutation_burden with context %}
{%- from 'modules/metrics/msisensor_pro.jst' import msisensor_pro with context %}
+{%- from 'modules/metrics/sigprofiler.jst' import sigprofiler with context %}
+{%- from 'modules/metrics/tucon.jst' import tucon with context %}
{%- macro collect_somatic_metrics(pair, normal_bam, tumor_bam, results_dir, taskPrefix, aligner, variant_caller) %}
@@ -40,29 +42,48 @@
{% set flags.vep = true %}
{% endif %}
-{% if tasks[taskPrefix+"_somatic_sample_metric_tgen_mutation_burden"]|default(true) %}
- {% if flags.bcftools %}
- {# If pair.normal.pathToBam is defined then we have a tumor_only pair #}
- {% if pair.normal.pathToBam is defined %}
- {% set final_vcf_prefix %}{{ vcf_prefix }}.db.flt{% endset %}
- {% else %}
- {% set final_vcf_prefix %}{{ vcf_prefix }}.db{% endset %}
- {% endif %}
+{% if flags.bcftools %}
+ {# If pair.normal.pathToBam is defined then we have a tumor_only pair #}
+ {% if pair.normal.pathToBam is defined %}
+ {% set final_vcf_prefix %}{{ vcf_prefix }}.db.flt{% endset %}
{% else %}
- {% set final_vcf_prefix %}{{ vcf_prefix }}{% endset %}
+ {% set final_vcf_prefix %}{{ vcf_prefix }}.db{% endset %}
{% endif %}
- {% if flags.vep %}
+{% else %}
+ {% set final_vcf_prefix %}{{ vcf_prefix }}{% endset %}
+{% endif %}
+{% if flags.vep %}
+ {% if tasks[taskPrefix+"_somatic_sample_metric_tgen_mutation_burden"]|default(true) %}
{% set final_vcf %}{{ final_vcf_prefix }}.vep.pick.vcf.gz{% endset %}
{{- mutation_burden(pair, normal_bam, tumor_bam, final_vcf, variant_caller, aligner, 'vep') }}
{% endif %}
- {% if flags.snpeff %}
+ {% if tasks[taskPrefix+"_somatic_sample_metric_tucon"]|default(true) %}
+ {% if tasks[taskPrefix+"_somatic_annotate_vcfs_bcftools_topmed"]|default(true) %}
+ {% set final_vcf %}{{ final_vcf_prefix }}.vep.full.vcf.gz{% endset %}
+ {{- tucon(pair, final_vcf, 'vep') }}
+ {% endif %}
+ {% endif %}
+{% endif %}
+{% if flags.snpeff %}
+ {% if tasks[taskPrefix+"_somatic_sample_metric_tgen_mutation_burden"]|default(true) %}
{% set final_vcf %}{{ final_vcf_prefix }}.snpeff.can.vcf.gz{% endset %}
{{- mutation_burden(pair, normal_bam, tumor_bam, final_vcf, variant_caller, aligner, 'snpeff') }}
{% endif %}
+ {% if tasks[taskPrefix+"_somatic_sample_metric_tucon"]|default(true) %}
+ {% if tasks[taskPrefix+"_somatic_annotate_vcfs_bcftools_topmed"]|default(true) %}
+ {% set final_vcf %}{{ final_vcf_prefix }}.snpeff.full.vcf.gz{% endset %}
+ {{- tucon(pair, final_vcf, 'snpeff') }}
+ {% endif %}
+ {% endif %}
{% endif %}
{#
{% if tasks[taskPrefix+"_somatic_sample_metric_msisensor_pro"]|default(true) %}
{{- msisensor_pro(pair, normal_bam, tumor_bam, aligner) }}
{% endif %}
#}
+
+{% if tasks[taskPrefix+"_somatic_sample_metric_sigprofiler"]|default(true) %}
+ {{- sigprofiler(pair, vcf_prefix, variant_caller, aligner) }}
+{% endif %}
+
{% endmacro %}
diff --git a/modules/metrics/mutation_burden.jst b/modules/metrics/mutation_burden.jst
index b2c48d6e..bbc9e47a 100644
--- a/modules/metrics/mutation_burden.jst
+++ b/modules/metrics/mutation_burden.jst
@@ -8,6 +8,14 @@
{% set mutation_burden_output %}{{ results_dir }}/{{ pair.name }}.{{ annotate_flag }}.mutation_burden.txt{% endset %}
{% set mutation_burden_json %}{{ results_dir }}/{{ pair.name }}.{{ annotate_flag }}.mutation_burden.json{% endset %}
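+{# Consensus-count (CC) threshold: with more than three callers, keep variants supported by all but two; otherwise require all but one #}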
+{% if pair.callers is defined and pair.callers|length > 1 %}
+ {% if pair.callers | length > 3 %}
+ {% set CC_filter %}INFO/CC>={{ pair.callers | length - 2 }}{% endset %}
+ {% else %}
+ {% set CC_filter %}INFO/CC>={{ pair.callers | length - 1 }}{% endset %}
+ {% endif %}
+{% endif %}
+
- name: tgen_mutation_burden_{{ variant_caller }}_{{ pair.name }}_{{ aligner }}_{{ annotate_flag }}
tags:
input:
@@ -53,7 +61,7 @@
{% endif %}
bcftools filter \
- --include 'INFO/CC>=3 && ({{ ns_list|join(' || ') }})' \
+ --include '{{ CC_filter }} && ({{ ns_list|join(' || ') }})' \
--output {{ mutation_burden_temp }}/{{ input_vcf | basename }} \
--output-type z \
{{ input_vcf }}
diff --git a/modules/metrics/sigprofiler.jst b/modules/metrics/sigprofiler.jst
new file mode 100644
index 00000000..0b182efa
--- /dev/null
+++ b/modules/metrics/sigprofiler.jst
@@ -0,0 +1,101 @@
+{% from 'utilities/variant_filtering.jst' import filter_variants with context %}
+
+{% macro sigprofiler(pair, vcf_prefix, variant_caller, aligner) %}
+
+{% set temp_dir %}temp/{{ pair.gltype }}/metrics/sigprofiler/{{ pair.name }}{% endset %}
+{% set temp_vcf_dir %}{{ temp_dir }}/vcfs{% endset %}
+{% set results_dir %}{{ pair.gltype }}/metrics/sigprofiler/{{ pair.name }}{% endset %}
+{% set all_vcf %}{{ vcf_prefix }}.vcf.gz{% endset %}
+{% set filt_vcf %}{{ temp_vcf_dir }}/{{ vcf_prefix|basename }}.pass.vcf{% endset %}
+
+{% set task %}sigprofiler_{{ variant_caller }}_{{ pair.name }}_{{ aligner }}{% endset %}
+{{- filter_variants(pair, all_vcf, temp_vcf_dir, filt_vcf, task) }}
+
+- name: sigprofiler_{{ variant_caller }}_{{ pair.name }}_{{ aligner }}
+ tags:
+ input:
+ - {{ filt_vcf }}
+ output:
+ - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activities_refit.txt
+ - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activity_Plots_refit.pdf
+ - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_TMB_plot_refit.pdf
+ - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt
+ - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS96_De-Novo_Signatures.txt
+ - {{ results_dir }}/SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS_96_plots_SBS96_De-Novo.pdf
+ - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activities_refit.txt
+ - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activity_Plots_refit.pdf
+ - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_TMB_plot_refit.pdf
+ - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt
+ - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID83_De-Novo_Signatures.txt
+ - {{ results_dir }}/ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID_83_plots_ID83_De-Novo.pdf
+ - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activities_refit.txt
+ - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activity_Plots_refit.pdf
+ - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_TMB_plot_refit.pdf
+ - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt
+ - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS78_De-Novo_Signatures.txt
+ - {{ results_dir }}/DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS_78_plots_DBS78_De-Novo.pdf
+ - {{ results_dir }}/extraneous_results.tar
+ walltime: "24:00:00"
+ cpus: 10
+ mem: 8G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.sigprofiler.module }}
+ module load {{ constants.tools.bcftools.module }}
+
+ {# Remove previously generated matrix #}
+ rm -r {{ temp_dir }}/vcfs/*/ || true
+
+ {# Should already exist #}
+ mkdir -p {{ temp_dir }}/vcfs
+ mkdir -p {{ results_dir }}
+
+ python3 ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.sigprofiler }} \
+ --threads 10 \
+ --vcfpath {{ temp_dir }}/vcfs \
+ --output {{ temp_dir }} \
+ --project {{ pair.name }} \
+ --genome dog \
+ --extract_only
+
+ {# It's possible for no signatures to be found, so we need an if statement to check for output #}
+ if [ "$(ls -A {{ temp_dir }}/vcfs/output)" ]; then
+ {# Remove true temp files #}
+ rm -r {{ temp_dir }}/JOB_METADATA.txt {{ temp_dir }}/Seeds.txt
+
+ {# Prepare the known useful output #}
+ {# rsync -R copies the relative path after the /./ section #}
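+    {# e.g. "rsync -R SRC/./SBS96/foo.txt DEST" creates DEST/SBS96/foo.txt #}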
+ if [ -d "{{ temp_dir }}/SBS96/Suggested_Solution" ]; then
+ rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activities_refit.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_Activity_Plots_refit.pdf {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/SBS96_De-Novo_TMB_plot_refit.pdf {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS96_De-Novo_Signatures.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./SBS96/Suggested_Solution/SBS96_De-Novo_Solution/Signatures/SBS_96_plots_SBS96_De-Novo.pdf {{ results_dir }}
+ fi
+ if [ -d "{{ temp_dir }}/ID83/Suggested_Solution" ]; then
+ rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activities_refit.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_Activity_Plots_refit.pdf {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/ID83_De-Novo_TMB_plot_refit.pdf {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID83_De-Novo_Signatures.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./ID83/Suggested_Solution/ID83_De-Novo_Solution/Signatures/ID_83_plots_ID83_De-Novo.pdf {{ results_dir }}
+ fi
+ if [ -d "{{ temp_dir }}/DBS78/Suggested_Solution" ]; then
+ rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activities_refit.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_Activity_Plots_refit.pdf {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/DBS78_De-Novo_TMB_plot_refit.pdf {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Activities/De_Novo_Mutation_Probabilities_refit.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS78_De-Novo_Signatures.txt {{ results_dir }}
+ rsync -R --remove-source-files {{ temp_dir }}/./DBS78/Suggested_Solution/DBS78_De-Novo_Solution/Signatures/DBS_78_plots_DBS78_De-Novo.pdf {{ results_dir }}
+ fi
+
+ {# tar the files that are not known to be immediately important #}
+ tar -cvf {{ results_dir }}/extraneous_results.tar {{ temp_dir }}/*
+ else
+ echo "No signatures found"
+ fi
+
+{% endmacro %}
diff --git a/modules/metrics/thred.jst b/modules/metrics/thred.jst
new file mode 100644
index 00000000..54f4a1f4
--- /dev/null
+++ b/modules/metrics/thred.jst
@@ -0,0 +1,53 @@
+{% macro thred(sample_or_pair, input_seg) %}
+
+{% set results_dir %}{{ sample_or_pair.gltype }}/metrics/thred/{{ sample_or_pair.name }}{% endset %}
+{% set temp_dir %}temp/{{ sample_or_pair.gltype }}/metrics/thred/{{ sample_or_pair.name }}{% endset %}
+
+- name: thred_{{ sample_or_pair.name }}
+ tags: [{{ sample_or_pair.gltype }}, HRD, {{ sample_or_pair.name }}]
+ input:
+ - {{ input_seg }}
+ output:
+ - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_scores.txt
+ - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_flt_segments.txt
+ - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_ori_segments.txt
+ - {{ results_dir }}/{{ sample_or_pair.name }}_excluded90_hrd_excluded_segments.txt
+ - {{ results_dir }}/{{ sample_or_pair.name }}_hrd_captured_genome_territory.txt
+ - {{ results_dir }}/{{ sample_or_pair.name }}_original_segments_karyoplot_1.png
+ - {{ results_dir }}/{{ sample_or_pair.name }}_original_segments_karyoplot_2.png
+ - {{ results_dir }}/{{ sample_or_pair.name }}_segments_filtered_karyoplot_1.png
+ - {{ results_dir }}/{{ sample_or_pair.name }}_segments_filtered_karyoplot_2.png
+ - {{ results_dir }}/{{ sample_or_pair.name }}_segments_excluded_karyoplot_1.png
+ - {{ results_dir }}/{{ sample_or_pair.name }}_segments_excluded_karyoplot_2.png
+ cpus: 1
+ mem: 2G
+ walltime: "4:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.python_3_7_2.module }}
+ module load {{ constants.tools.thred.module }}
+
+ mkdir -p {{ results_dir }}
+
+ {# Purge any existing run files prior to starting #}
+ rm -r "{{ temp_dir }}" || true
+ mkdir -p "{{ temp_dir }}"
+
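+    {# Append a "centromere" label column to the centromere BED; tHReD reads the labeled intervals via --genomic-regions (assumed purpose of the label) #}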
+ sed 's/$/\tcentromere/' {{ constants.coyote.centromere }} \
+ > {{ temp_dir }}/genomic_regions.bed
+
+ tHReD.py \
+ --genomic-regions {{ temp_dir }}/genomic_regions.bed \
+ --seg {{ input_seg }} \
+ --sample {{ sample_or_pair.name }} \
+ --outfile {{ sample_or_pair.name }} \
+ --dir-out {{ results_dir }} \
+ --th-log2r -0.1613 \
+ --minsize 1000000 \
+ --th-pct-overlapping 0.90 \
+ --plots \
+ --exclude-contigs "X"
+
+{% endmacro %}
diff --git a/modules/metrics/tucon.jst b/modules/metrics/tucon.jst
new file mode 100644
index 00000000..4e1c5e4e
--- /dev/null
+++ b/modules/metrics/tucon.jst
@@ -0,0 +1,37 @@
+{% from 'utilities/variant_filtering.jst' import filter_variants with context %}
+
+{% macro tucon(pair, input_vcf, annotate_flag) %}
+
+{% set temp_dir %}temp/{{ pair.gltype }}/metrics/tucon/{{ pair.name }}{% endset %}
+{% set results_dir %}{{ pair.gltype }}/metrics/tucon/{{ pair.name }}{% endset %}
+{% set tucon_output %}{{ results_dir }}/{{ pair.name }}_{{ annotate_flag }}_tucon.tsv{% endset %}
+{% set filtered_vcf %}{{ temp_dir }}/{{ pair.name }}_{{ annotate_flag }}.flt.vcf.gz{% endset %}
+
+{% set task %}tucon_{{ pair.name }}_{{ annotate_flag }}{% endset %}
+{{- filter_variants(pair, input_vcf, temp_dir, filtered_vcf, task) }}
+
+- name: tucon_{{ pair.name }}_{{ annotate_flag }}
+ tags: [tucon, {{ annotate_flag }}]
+ input:
+ - {{ filtered_vcf }}
+ output:
+ - {{ tucon_output }}
+ walltime: "12:00:00"
+ cpus: 1
+ mem: 2G
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.bcftools.module }}
+
+ mkdir -p {{ results_dir }}
+
+ echo -e "VCF\tMUTATION_COUNT\tTOPMED_COUNT" > {{ tucon_output }}
+
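+    {# MUT_COUNT is the total filtered variant count; TOPMED_COUNT tallies variants carrying the GCA_2285.2 (EVA) INFO flag added during annotation #}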
+ VCF_BASE=$(basename {{ filtered_vcf }})
+ MUT_COUNT=$(bcftools view -H {{ filtered_vcf }} | wc -l)
+ EVA_COUNT=$(bcftools view -H -i 'INFO/GCA_2285.2=1' {{ filtered_vcf }} | wc -l)
+ echo -e "${VCF_BASE}\t${MUT_COUNT}\t${EVA_COUNT}" >> {{ tucon_output }}
+
+{% endmacro %}
diff --git a/modules/qc/stats2lims.jst b/modules/qc/stats2lims.jst
index 8651309a..74836a3a 100644
--- a/modules/qc/stats2lims.jst
+++ b/modules/qc/stats2lims.jst
@@ -3,7 +3,7 @@
# to upload to LIMS via REST api integration.
{% macro stats2lims(tag1, tag2, task, input_file, file_type) %}
-{% if submissionSource == "TGenLIMS" %}
+{% if submissionSource == "TGenLIMS" and file_type != "samtools_idxstats" %}
- name: stats2lims_{{ task }}_{{ file_type }}
tags: [{{ tag1 }}, quality_control, stats, stats2lims, {{ tag2 }}]
@@ -28,4 +28,4 @@
{{ study }}
{% endif %}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/modules/rna/salmon.jst b/modules/rna/salmon.jst
index 29d88815..85c8003f 100755
--- a/modules/rna/salmon.jst
+++ b/modules/rna/salmon.jst
@@ -9,10 +9,10 @@
{% set r2fqlist = [] %}
{% for rgid, rg in sample.read_groups.items() %}
{% for fq in rg.data_files|selectattr('fastqCode', 'eq', 'R1') %}
- {% do r1fqlist.append('"temp/fastqs/' + (fq.basename) + '"') %}
+ {% do r1fqlist.append(fq) %}
{% endfor %}
{% for fq in rg.data_files|selectattr('fastqCode', 'eq', 'R2') %}
- {% do r2fqlist.append('"temp/fastqs/' + (fq.basename) + '"') %}
+ {% do r2fqlist.append(fq) %}
{% endfor %}
{% endfor %}
@@ -24,10 +24,10 @@
reset: predecessors
input:
{% for fq in r1fqlist %}
- - {{ fq }}
+ - temp/fastqs/{{ fq.basename }}
{% endfor %}
{% for fq in r2fqlist %}
- - {{ fq }}
+ - temp/fastqs/{{ fq.basename }}
{% endfor %}
output:
- {{ results_dir }}/{{ sample.name }}.transcripts.sf
@@ -41,6 +41,14 @@
set -o pipefail
module load {{ constants.tools.salmon.module }}
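+  {# "fasterq" inputs are PetaGene-compressed fastqs; loading PetaLink lets salmon read them transparently (assumed behavior of the petagene module) #}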
+ {% if r1fqlist[0].fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+  This comment is here to protect render spacing, do not remove.
+ #}
mkdir -p "{{ temp_dir }}"
mkdir -p "{{ results_dir }}"
@@ -52,9 +60,9 @@
--libType "{{ constants.coyote.strandedness_options[sample.strandedness].salmon }}" \
--index "{{ constants.coyote.salmon_index }}" \
--geneMap "{{ constants.coyote.gtf }}" \
- -1 {{ r1fqlist|join(' ') }} \
+ -1 {% for fq in r1fqlist %}"temp/fastqs/{{ fq.basename }}"{% if not loop.last%} {% endif %}{% endfor %} \
{% if r2fqlist %}
- -2 {{ r2fqlist|join(' ') }} \
+ -2 {% for fq in r2fqlist %}"temp/fastqs/{{ fq.basename }}"{% if not loop.last%} {% endif %}{% endfor %} \
{% endif %}
--output "{{ temp_dir }}"
@@ -87,4 +95,4 @@
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/modules/rna/star_fusion.jst b/modules/rna/star_fusion.jst
index 5fb63e56..a9cda339 100755
--- a/modules/rna/star_fusion.jst
+++ b/modules/rna/star_fusion.jst
@@ -40,6 +40,14 @@
set -eu
set -o pipefail
+ {% if r1fqlist[0].fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+  This comment is here to protect render spacing, do not remove.
+ #}
module load {{ constants.tools.star_fusion.module }}
rm -r {{ temp_dir }} || true
@@ -115,6 +123,14 @@
mkdir -p {{ temp_dir }}/_starF_checkpoints
mkdir -p {{ temp_dir }}/star-fusion.preliminary
+ {% if r1fqlist[0].fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+  This comment is here to protect render spacing, do not remove.
+ #}
module load {{ constants.tools.samtools.module }}
module load {{ constants.tools.star_fusion.module }}
sf_path=$(dirname `which STAR-Fusion`)
diff --git a/modules/rna/star_quant.jst b/modules/rna/star_quant.jst
index 039c68cb..3e952243 100755
--- a/modules/rna/star_quant.jst
+++ b/modules/rna/star_quant.jst
@@ -66,6 +66,14 @@
set -o pipefail
module load {{ constants.tools.star.module }}
+ {% if r1fqlist[0].fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+
+ {% endif %}
+ {#
+  This comment is here to protect render spacing, do not remove.
+ #}
mkdir -p "{{ temp_dir }}"
mkdir -p "{{ results_dir }}/stats"
mkdir -p "{{ star_finalcountsdir }}"
@@ -338,7 +346,8 @@ which appears to be working now after some code changes, keeping this here in ca
{% set task %}markduplicates_star_gatk_{{ sample.name }}{% endset %}
{% do task_list.append(task) %}
- {{- remove_files(temp_dir,none,task_list) }}
+ {% set task_name %}{{ sample.name }}_star_quant{% endset %}
+ {{- remove_files(temp_dir, none, task_list, task_name) }}
{% endmacro %}
{% macro htseq(sample) %}
@@ -464,8 +473,8 @@ which appears to be working now after some code changes, keeping this here in ca
--alignments \
--paired-end \
--no-bam-output \
- --strandedness "{{ constants.phoenix.strandedness_options[sample.strandedness].rsem }}" \
+ --strandedness "{{ constants.coyote.strandedness_options[sample.strandedness].rsem }}" \
{{ star_bam_transcript }} \
- {{ constants.phoenix.rsem_index }} \
+ {{ constants.coyote.rsem_index }} \
"{{ sample.gltype }}/quant/rsem/{{ sample.name }}/{{ sample.name }}"
{% endmacro %}
diff --git a/modules/single_cell/cellranger_count.jst b/modules/single_cell/cellranger_count.jst
index d2793548..f9eac291 100755
--- a/modules/single_cell/cellranger_count.jst
+++ b/modules/single_cell/cellranger_count.jst
@@ -21,10 +21,6 @@
{% set directory %}{{ cellranger_out }}/analysis{% endset %}
{% set samples_list = [] %}
- {% for rgpu, _ in files|groupby('rgpu') %}
- {% set sample_string %}{{ sample.name }}_{{ rgpu[:-2] }}{% endset %}
- {% do samples_list.append(sample_string) %}
- {% endfor %}
- name: cellranger_count_{{ sample.name }}
tags: [{{ sample.gltype }}, single_cell, cellranger_count, {{ sample.name }}]
@@ -54,14 +50,21 @@
module load {{ constants.tools.cellranger.module }}
module load {{ constants.tools.samtools.module }}
+ {% if sample.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+ {% endif %}
rm -r "{{ temp_dir }}" || true
mkdir -p "{{ temp_dir }}"
mkdir -p "{{ outdir }}"
mkdir -p "{{ alignment_outdir }}/stats"
-
+
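+    {# Rename the symlinked fastqs to include the _S1 field from bcl2fastq-style naming that Cell Ranger expects, and collect the matching sample names for --sample (naming convention assumed) #}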
{% for fq in files %}
- ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename }}"
+ {% set fq_name %}{{ fq.name }}_{{ fq.rgpu[:-2] }}_{{ fq.rgbc }}{% endset %}
+ {% set fq_new %}{{ fq_name }}_S1{% endset %}
+ {% do samples_list.append(fq_name) %}
+ ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename | replace(fq_name, fq_new) }}"
{% endfor %}
{#
@@ -72,6 +75,7 @@
--localcores 20 \
--localmem 160 \
--chemistry "{{ constants.coyote.scrna_chemistry_options[sample.assayCode].chemistry_name }}" \
+ --expect-cells {{ sample.expectedCells | default(3000) }} \
--id "{{ sample.name }}" \
--fastqs . \
--sample "{{ samples_list|join(',') }}" \
diff --git a/modules/single_cell/cellranger_vdj.jst b/modules/single_cell/cellranger_vdj.jst
index 5bfbfbec..4c205095 100644
--- a/modules/single_cell/cellranger_vdj.jst
+++ b/modules/single_cell/cellranger_vdj.jst
@@ -16,10 +16,6 @@
{% set json %}{{ outdir }}/stats/{{ sample.name }}.cellranger_vdj.bam.metrics_summary.json{% endset %}
{% set samples_list = [] %}
-{% for rgpu, _ in files|groupby('rgpu') %}
- {% set sample_string %}{{ sample.name }}_{{ rgpu[:-2] }}{% endset %}
- {% do samples_list.append(sample_string) %}
-{% endfor %}
- name: cellranger_vdj_{{ sample.name }}
tags: [{{ sample.gltype }}, single_cell, cellranger_vdj, {{ sample.name }}]
@@ -47,18 +43,21 @@
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta.fai
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.csv
- - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.json
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam.bai
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta.fai
- - {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fastq
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig_annotations.csv
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fasta
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fastq
- {{ metrics_output }}
- {{ outdir }}/{{ sample.name }}.cellranger_vdj.vloupe
- {{ outdir }}/stats/{{ sample.name }}.cellranger_vdj.bam.web_summary.html
+ - {{ outdir }}/{{ sample.name }}.cellranger_vdj.airr_rearrangement.tsv
+ - {{ outdir }}/{{ sample.name }}.cellranger_vdj.vdj_contig_info.pb
+ - {{ outdir }}/{{ sample.name }}.cellranger_vdj.donor_regions.fa
+ - {{ outdir }}/{{ sample.name }}.cellranger_vdj.regions.fa
+ - {{ outdir }}/{{ sample.name }}.cellranger_vdj.reference.json
cpus: 20
mem: 80G
walltime: "48:00:00"
@@ -67,13 +66,20 @@
set -o pipefail
module load {{ constants.tools.cellranger.module }}
+ {% if sample.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+ {% endif %}
rm -r "{{ temp_dir }}" || true
mkdir -p "{{ temp_dir }}"
mkdir -p "{{ outdir }}/stats"
{% for fq in files %}
- ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename }}"
+ {% set fq_name %}{{ fq.name }}_{{ fq.rgpu[:-2] }}_{{ fq.rgbc }}{% endset %}
+ {% set fq_new %}{{ fq_name }}_S1{% endset %}
+ {% do samples_list.append(fq_name) %}
+ ln -rs "temp/fastqs/{{ fq.basename }}" "{{ temp_dir }}/{{ fq.basename | replace(fq_name, fq_new) }}"
{% endfor %}
{#
@@ -103,18 +109,21 @@
mv {{ cellranger_out }}/concat_ref.fasta {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta
mv {{ cellranger_out }}/concat_ref.fasta.fai {{ outdir }}/{{ sample.name }}.cellranger_vdj.concat_ref.fasta.fai
mv {{ cellranger_out }}/consensus_annotations.csv {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.csv
- mv {{ cellranger_out }}/consensus_annotations.json {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus_annotations.json
mv {{ cellranger_out }}/consensus.bam {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam
mv {{ cellranger_out }}/consensus.bam.bai {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.bam.bai
mv {{ cellranger_out }}/consensus.fasta {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta
mv {{ cellranger_out }}/consensus.fasta.fai {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fasta.fai
- mv {{ cellranger_out }}/consensus.fastq {{ outdir }}/{{ sample.name }}.cellranger_vdj.consensus.fastq
mv {{ cellranger_out }}/filtered_contig_annotations.csv {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig_annotations.csv
mv {{ cellranger_out }}/filtered_contig.fasta {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fasta
mv {{ cellranger_out }}/filtered_contig.fastq {{ outdir }}/{{ sample.name }}.cellranger_vdj.filtered_contig.fastq
mv {{ cellranger_out }}/metrics_summary.csv {{ metrics_output }}
mv {{ cellranger_out }}/vloupe.vloupe {{ outdir }}/{{ sample.name }}.cellranger_vdj.vloupe
mv {{ cellranger_out }}/web_summary.html {{ outdir }}/stats/{{ sample.name }}.cellranger_vdj.bam.web_summary.html
+ mv {{ cellranger_out }}/airr_rearrangement.tsv {{ outdir }}/{{ sample.name }}.cellranger_vdj.airr_rearrangement.tsv
+ mv {{ cellranger_out }}/vdj_contig_info.pb {{ outdir }}/{{ sample.name }}.cellranger_vdj.vdj_contig_info.pb
+ mv {{ cellranger_out }}/vdj_reference/fasta/donor_regions.fa {{ outdir }}/{{ sample.name }}.cellranger_vdj.donor_regions.fa
+ mv {{ cellranger_out }}/vdj_reference/fasta/regions.fa {{ outdir }}/{{ sample.name }}.cellranger_vdj.regions.fa
+ mv {{ cellranger_out }}/vdj_reference/reference.json {{ outdir }}/{{ sample.name }}.cellranger_vdj.reference.json
{{- stats2json(sample.gltype, sample.name, task, metrics_output, json, "cellranger_vdj_metrics", sample_name=sample.name, library_name=sample.rglb) }}
diff --git a/modules/single_cell/main.jst b/modules/single_cell/main.jst
index b6697e95..acad01f8 100755
--- a/modules/single_cell/main.jst
+++ b/modules/single_cell/main.jst
@@ -15,6 +15,10 @@
{% set rnaStrandType=file.rnaStrandType|default('unstranded')|lower %}
{% set rnaStrandDirection=file.rnaStrandDirection|default('notapplicable')|lower %}
{% set strandedness %}{{ readOrientation }}-{{ rnaStrandType }}-{{ rnaStrandDirection }}{% endset %}
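+    {# When the submitted quantity is a cell count, carry it through as expectedCells for cellranger count --expect-cells #}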
+ {% if file.quantity is defined and file.quantitySource is defined and file.quantitySource|lower == 'cells' %}
+ {% set expectedCells=file.quantity %}
+ {% do file.update({'expectedCells': expectedCells}) %}
+ {% endif %}
{% do file.update({'strandedness': strandedness}) %}
{% endfor %}
diff --git a/modules/single_cell/starsolo.jst b/modules/single_cell/starsolo.jst
index a56e6d1a..cafcc7a8 100755
--- a/modules/single_cell/starsolo.jst
+++ b/modules/single_cell/starsolo.jst
@@ -53,6 +53,10 @@
module load {{ constants.tools.star.module }}
module load {{ constants.tools.samtools.module }}
+ {% if sample.fileType == "fasterq" %}
+ export PetaLinkMode="{{ constants.tools.petagene.PetaLinkMode }}"
+ module load {{ constants.tools.petagene.module }}
+ {% endif %}
rm -r "{{ temp_dir }}" || true
mkdir -p "{{ temp_dir }}"
diff --git a/modules/somatic/gatk_cnv.jst b/modules/somatic/gatk_cnv.jst
index 24e376eb..70d4bcf2 100644
--- a/modules/somatic/gatk_cnv.jst
+++ b/modules/somatic/gatk_cnv.jst
@@ -1,5 +1,6 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
{% from 'modules/annotation/annot_seg.jst' import annot_seg with context %}
+{% from 'modules/metrics/thred.jst' import thred with context %}
# The macros found in this template are used to generate the gatk4 somatic cnv workflow.
{% macro gatk_cnv(pair, aligner='bwa') %}
@@ -716,6 +717,7 @@
{% set final_seg %}{{ results_dir }}/{{ pair.name }}.{{ aligner }}.re_centered.cr.igv.seg{% endset %}
{{- annot_seg(final_seg, pair.name) }}
+ {{- thred(pair, final_seg)}}
{% set task %}gatk_call_cnv_step4_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
diff --git a/modules/somatic/lancet.jst b/modules/somatic/lancet.jst
index 0fe0681a..9cc9d8ae 100755
--- a/modules/somatic/lancet.jst
+++ b/modules/somatic/lancet.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro lancet(pair, aligner='bwa') %}
{% do pair.callers.append('lancet') %}
@@ -137,4 +138,7 @@
bcftools index --tbi --force "{{ pass_vcf }}"
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+
{% endmacro %}
diff --git a/modules/somatic/mutect2.jst b/modules/somatic/mutect2.jst
index 62d404e9..605aa49d 100755
--- a/modules/somatic/mutect2.jst
+++ b/modules/somatic/mutect2.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro mutect2(pair, aligner='bwa') %}
{% do pair.callers.append('mutect2') %}
@@ -339,8 +340,10 @@
bcftools index --tbi --force "{{ pass_vcf }}"
- {% set task %}mutect2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %}
+ {% set task %}mutect2_filter_calls_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
+ {{- vcf_stats(pass_vcf, results_dir) }}
+ {{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/somatic/octopus.jst b/modules/somatic/octopus.jst
index 5275b5d6..f1b78a76 100644
--- a/modules/somatic/octopus.jst
+++ b/modules/somatic/octopus.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro octopus_somatic(pair, aligner='bwa') %}
{% do pair.callers.append('octopus') %}
@@ -27,7 +28,7 @@
set -o pipefail
module load {{ constants.tools.octopus.module }}
-
+
mkdir -p "{{ temp_dir }}"
{# Write out the regions in this batch to a bed file #}
@@ -37,7 +38,7 @@
{{ interval.contig }}${TAB}{{ interval.start - 1 }}${TAB}{{ interval.stop }}
{% endfor %}
EOF
-
+
{# Somatic calling with octopus #}
octopus \
--caller cancer \
@@ -58,7 +59,7 @@
--temp-directory-prefix "{{ loop.index }}" \
--bamout "{{ loop.index }}.realigned.bam" \
--output "{{ loop.index }}.octopus.vcf"
-
+
{% endfor %}
@@ -128,12 +129,13 @@
{% endif %}
"{{ all_vcf }}" \
> "{{ pass_vcf }}"
-
+
bcftools index --tbi --force "{{ pass_vcf }}"
- {% set task %}octopus_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %}
+ {% set task %}octopus_merge_chunks_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
-
+ {{- vcf_stats(pass_vcf, results_dir) }}
+ {{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/somatic/strelka2.jst b/modules/somatic/strelka2.jst
index 44a71900..563f8f16 100755
--- a/modules/somatic/strelka2.jst
+++ b/modules/somatic/strelka2.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro strelka2_somatic(pair, aligner='bwa') %}
{% do pair.callers.append('strelka2') %}
@@ -12,8 +13,8 @@
- name: strelka2_{{ pair.name }}_{{ aligner }}
tags: [{{ pair.gltype }}, somatic, snp_indel_caller, strelka2, {{ pair.name }}]
- methods: >
- Somatic variants for {{ pair.name }} ({{ aligner }}) were called with
+ methods: >
+ Somatic variants for {{ pair.name }} ({{ aligner }}) were called with
{{ constants.tools.strelka.verbose }}.
input:
- {{ normal_bam }}
@@ -166,6 +167,7 @@
{% set task %}strelka2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
-
+ {{- vcf_stats(pass_vcf, results_dir) }}
+ {{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/somatic/vardict.jst b/modules/somatic/vardict.jst
index 77eeb3b4..580858b0 100755
--- a/modules/somatic/vardict.jst
+++ b/modules/somatic/vardict.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro vardict_somatic(pair, aligner='bwa') %}
{% do pair.callers.append('vardict') %}
@@ -257,5 +258,7 @@
bcftools index --tbi --force "{{ pass_vcf }}"
-{% endmacro %}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+{% endmacro %}
diff --git a/modules/somatic/vcfmerger2.jst b/modules/somatic/vcfmerger2.jst
index 0526ab3c..0d97d6f3 100755
--- a/modules/somatic/vcfmerger2.jst
+++ b/modules/somatic/vcfmerger2.jst
@@ -3,6 +3,7 @@
{% from 'modules/somatic/rna_variant_check.jst' import add_rna_header_to_vcf with context %}
{% from 'modules/somatic/rna_variant_check.jst' import add_matched_rna with context %}
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro vcfmerger2(pair, aligner='bwa', taskPrefix='Genome') %}
@@ -155,5 +156,6 @@
{{- add_rna_header_to_vcf(merged_vcf_gz, output_vcf, pair, aligner , temp_dir) }}
{% endif %}
{{- annotate_vcfs(pair, temp_dir, results_dir, output_vcf, taskPrefix, aligner, 'merged', 'somatic', 'snp_indel_caller') }}
+ {{- vcf_stats(output_vcf, results_dir) }}
{{- collect_somatic_metrics(pair, normal_bam, tumor_bam, results_dir, taskPrefix, aligner, 'merged') }}
{% endmacro %}
diff --git a/modules/tumor_only/deepvariant.jst b/modules/tumor_only/deepvariant.jst
index 3f32e777..b12106ce 100755
--- a/modules/tumor_only/deepvariant.jst
+++ b/modules/tumor_only/deepvariant.jst
@@ -1,4 +1,5 @@
{% from 'modules/annotation/main.jst' import annotate_vcfs with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{%- macro deepvariant_tumor_only(sample, aligner='bwa', taskPrefix='Genome') %}
@@ -153,5 +154,7 @@
{# Skipping annotate vcfs step until requested #}
{# annotate_vcfs(sample, temp_dir, results_dir, pass_vcf, taskPrefix, aligner, 'deepvariant', 'constitutional', 'snp_indel_caller') #}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/tumor_only/lancet.jst b/modules/tumor_only/lancet.jst
index 96fba4d0..a51b949d 100755
--- a/modules/tumor_only/lancet.jst
+++ b/modules/tumor_only/lancet.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro lancet_tumor_only(pair, aligner='bwa') %}
{% do pair.callers.append('lancet') %}
@@ -139,4 +140,7 @@
bcftools index --tbi --force "{{ pass_vcf }}"
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+
{% endmacro %}
diff --git a/modules/tumor_only/mm_igtx_calling.jst b/modules/tumor_only/mm_igtx_calling.jst
index a5c969cb..dda43167 100644
--- a/modules/tumor_only/mm_igtx_calling.jst
+++ b/modules/tumor_only/mm_igtx_calling.jst
@@ -5,13 +5,6 @@
{% set tumor_bam %}{{ tumor.gltype }}/alignment/{{ aligner }}/{{ tumor.name }}/{{ tumor.name }}.{{ aligner }}.bam{% endset %}
{% set results_dir %}{{ tumor.gltype }}/tumor_only_structural_calls/mm_igtx_pairoscope/{{ tumor.name }}{% endset %}
-{% if controlDataFiles is defined %}
-{% set toAssayCodes = [] %}
-{% for file in controlDataFiles %}
- {% do toAssayCodes.append(file.assayCode) %}
-{% endfor %}
-{% endif %}
-
- name: mm_igtx_pairoscope_{{ tumor.name }}_{{ aligner }}
tags: [{{ tumor.gltype }}, structural_caller, pairoscope, {{ tumor.name }}]
tags: [pairoscope, mm_igtx_calling, {{ tumor.gltype }}]
@@ -63,18 +56,14 @@
echo $'Specimen\tChrA\tPositionA\tChrB\tPositionB' > {{ results_dir }}/{{ tumor.name }}.{{ aligner }}_pairoscope_igtx_discordantTable.txt
fi
- {% if controlDataFiles is defined and tumor.assayCode in toAssayCodes %}
- python ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.pairoscope_mm_igtx_calling_tumor_only_script }} \
- {% else %}
python ${JS_PIPELINE_PATH}/required_scripts/{{ constants.coyote.pairoscope_mm_igtx_calling_script }} \
- {% endif %}
--input_file {{ results_dir }}/{{ tumor.name }}.{{ aligner }}_pairoscope_igtx_discordantTable.txt \
--specimen {{ tumor.rgsm }} \
--output_file {{ results_dir }}/{{ tumor.name }}.{{ aligner }}_pairoscope_igtx_calls.txt \
--window 2000 \
--window_min 100 \
- {% if controlDataFiles is defined and tumor.assayCode in toAssayCodes %}
- --call_requirement 5
+ {% if pairoscope_call_requirement is defined %}
+ --call_requirement {{ pairoscope_call_requirement }}
{% else %}
--call_requirement 4
{% endif %}
@@ -188,6 +177,8 @@
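+      # -Y uses soft clipping for supplementary alignments; -K 100000000 fixes the per-batch input size so results are reproducible across thread counts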
bwa mem \
-t 4 \
+ -Y \
+ -K 100000000 \
-o {{ temp_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/trinity/trinbwafile.sam \
{{ constants.coyote.bwa_index }} \
{{ temp_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/trinity/Trinity.fasta
@@ -205,6 +196,8 @@
bwa index {{ results_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/Trinity.fasta
bwa mem \
-t 4 \
+ -Y \
+ -K 100000000 \
-o {{ temp_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/trinity/ReadstoContigs.sam \
{{ results_dir }}/{{ tumor.name }}_${chrRegion}_${cluster}/Trinity.fasta \
${leftfq} ${rightfq}
@@ -538,4 +531,4 @@
-c 2 \
-s {{ sample.name }}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/modules/tumor_only/mutect2.jst b/modules/tumor_only/mutect2.jst
index 45f2004b..1c0f9659 100755
--- a/modules/tumor_only/mutect2.jst
+++ b/modules/tumor_only/mutect2.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro mutect2_tumor_only(pair, aligner='bwa') %}
{% do pair.callers.append('mutect2') %}
@@ -351,5 +352,7 @@
{% set task %}tumor_only_mutect2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
+ {{- vcf_stats(pass_vcf, results_dir) }}
+ {{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/tumor_only/octopus.jst b/modules/tumor_only/octopus.jst
index b61a472e..93865bbc 100644
--- a/modules/tumor_only/octopus.jst
+++ b/modules/tumor_only/octopus.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro octopus_tumor_only(pair, aligner='bwa') %}
{% do pair.callers.append('octopus') %}
@@ -167,6 +168,7 @@
{% set task %}tumor_only_octopus_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
-
+ {{- vcf_stats(pass_vcf, results_dir) }}
+ {{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/tumor_only/rna_variant_check.jst b/modules/tumor_only/rna_variant_check.jst
index 48befbb3..1b391f8b 100644
--- a/modules/tumor_only/rna_variant_check.jst
+++ b/modules/tumor_only/rna_variant_check.jst
@@ -61,7 +61,7 @@
- {{ output_vcf }}
- {{ output_vcf }}.csi
- {{ output_vcf }}.tbi
- walltime: "8:00:00"
+ walltime: "24:00:00"
cpus: 2
mem: 4G
cmd: |
@@ -258,4 +258,4 @@
--force \
{{ output_vcf }}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/modules/tumor_only/strelka2.jst b/modules/tumor_only/strelka2.jst
index 9551b731..694860ff 100755
--- a/modules/tumor_only/strelka2.jst
+++ b/modules/tumor_only/strelka2.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro strelka2_tumor_only(pair, aligner='bwa') %}
{% do pair.callers.append('strelka2') %}
@@ -168,6 +169,7 @@
{% set task %}tumor_only_strelka2_filter_variants_{{ pair.name }}_{{ aligner }}{% endset %}
{% set directory %}{{ temp_dir }}{% endset %}
{{- remove_files(directory,none,task) }}
-
+ {{- vcf_stats(pass_vcf, results_dir) }}
+ {{- vcf_stats(all_vcf, results_dir) }}
{% endmacro %}
diff --git a/modules/tumor_only/vardict.jst b/modules/tumor_only/vardict.jst
index 8e39bfb7..ad8c5d7c 100755
--- a/modules/tumor_only/vardict.jst
+++ b/modules/tumor_only/vardict.jst
@@ -1,4 +1,5 @@
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro vardict_tumor_only(pair, aligner='bwa') %}
{% do pair.callers.append('vardict') %}
@@ -263,5 +264,7 @@
bcftools index --tbi --force "{{ pass_vcf }}"
-{% endmacro %}
+{{- vcf_stats(pass_vcf, results_dir) }}
+{{- vcf_stats(all_vcf, results_dir) }}
+{% endmacro %}
diff --git a/modules/tumor_only/vcfmerger2.jst b/modules/tumor_only/vcfmerger2.jst
index 827f01e1..4b08b3ed 100755
--- a/modules/tumor_only/vcfmerger2.jst
+++ b/modules/tumor_only/vcfmerger2.jst
@@ -3,6 +3,7 @@
{% from 'modules/tumor_only/rna_variant_check.jst' import add_rna_header_to_vcf with context %}
{% from 'modules/tumor_only/rna_variant_check.jst' import add_matched_rna with context %}
{% from 'utilities/remove_files.jst' import remove_files with context %}
+{% from 'utilities/vcf_stats.jst' import vcf_stats with context %}
{% macro vcfmerger2_tumor_only(pair, aligner='bwa', taskPrefix='Genome') %}
@@ -165,5 +166,6 @@
{{- add_rna_header_to_vcf(merged_vcf_gz, output_vcf, pair, aligner , temp_dir) }}
{% endif %}
{{- annotate_vcfs(pair, temp_dir, results_dir, output_vcf, taskPrefix, aligner, 'merged', 'tumor_only', 'snp_indel_caller') }}
+ {{- vcf_stats(output_vcf, results_dir) }}
{{- collect_somatic_metrics(pair, normal_bam, tumor_bam, results_dir, taskPrefix, aligner, 'merged') }}
{% endmacro %}
diff --git a/pipeline.yaml b/pipeline.yaml
index 3be119f7..133acf97 100644
--- a/pipeline.yaml
+++ b/pipeline.yaml
@@ -2,7 +2,7 @@ __pipeline__:
name: coyote
main: main.jst
description: Canine (canfam3.1) genomics suite
- version: v1.1.2
+ version: v1.2.1
constants:
tools:
bedtools:
@@ -37,10 +37,14 @@ constants:
citation: >
Li H. and Durbin R. (2009) Fast and accurate short read alignment with Burrows-Wheeler Transform.
Bioinformatics, 25:1754-60. [PMID: 19451168]
+ bwa_mem2:
+ module: bwa-mem2/2.2.1
+ version: "2.2.1"
+ verbose: Burrows-Wheeler Aligner v2.2.1
cellranger:
- module: cellranger/3.1.0
- version: "3.1.0"
- verbose: 10X CellRanger v3.1.0
+ module: cellranger/6.0.1
+ version: "6.0.1"
+ verbose: 10X CellRanger v6.0.1
website: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger
citation: >
TODO
@@ -73,6 +77,14 @@ constants:
module: DLE/0.1.5
version: "0.1.5"
verbose: Discordant Loci Extractor v0.1.5
+ expansion_hunter:
+ module: ExpansionHunter/4.0.2
+ version: "4.0.2"
+ verbose: Illumina ExpansionHunter v4.0.2
+ citation: >
+ Egor Dolzhenko, Viraj Deshpande, Felix Schlesinger, Peter Krusche, Roman Petrovski, and others,
+ ExpansionHunter: A sequence-graph based tool to analyze variation in short tandem repeat regions,
+ Bioinformatics 2019
freebayes:
module: freebayes/1.3.1-foss-2019a
version: "1.3.1"
@@ -126,7 +138,7 @@ constants:
HTSeq — A Python framework to work with high-throughput sequencing data
Bioinformatics (2014), in print, online at doi:10.1093/bioinformatics/btu638
ichorcna:
- module: R/3.6.1-phoenix
+ module: ichorCNA/0.2.0-coyote
launcher: required_scripts/runIchorCNA_47ce8db.R
version: "0.2.0"
verbose: ichorCNA v0.2.0
@@ -209,6 +221,10 @@ constants:
website: https://github.com/genome/pairoscope
citation: >
TBD
+ petagene:
+ module: petagene/1.3.0p7
+ version: 1.3.0p7
+ PetaLinkMode: "+fastq"
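+    # Templates export PetaLinkMode before loading this module when inputs are fileType "fasterq",
+    # so PetaLink can present the compressed fastqs to downstream tools (assumed behavior)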
phaser:
module: phASER/1.1.1-python-2.7.13
version: 1.1.1
@@ -239,6 +255,14 @@ constants:
citation: >
R Core Team (2017). R: A language and environment for statistical computing. R Foundation for
Statistical Computing, Vienna, Austria. URL https://www.R-project.org/.
+ rsem:
+ module: RSEM/1.3.3
+ version: "1.3.3"
+ verbose: rsem v1.3.3
+ website: https://github.com/deweylab/RSEM
+ citation: >
+ Li, B., Dewey, C.N. RSEM: accurate transcript quantification from RNA-Seq data with or without a reference
+ genome. BMC Bioinformatics 12, 323 (2011). https://doi.org/10.1186/1471-2105-12-323
salmon:
module: Salmon/1.2.1-gompi-2019a
version: "1.2.1"
@@ -268,6 +292,10 @@ constants:
module: sequenza/3.0.0
version: "3.0.0"
verbose: Sequenza v3.0.0
+ sigprofiler:
+ module: SigProfiler/1.1
+ version: "1.1"
+ verbose: SigProfiler v1.1
singularity:
module: singularity/3.5.2-phoenix
version: "3.5.2"
@@ -326,9 +354,13 @@ constants:
Liao Y, Smyth GK and Shi W. featureCounts: an efficient general-purpose program for assigning
sequence reads to genomic features. Bioinformatics, 30(7):923-30, 2014
tgen_mutation_burden:
- module: MutationBurden/1.2.1
- version: "1.2.1"
- verbose: TGen Mutation Burden Tool v1.2.1
+ module: MutationBurden/1.2.3
+ version: "1.2.3"
+ verbose: TGen Mutation Burden Tool v1.2.3
+ thred:
+ module: tHReD/1.1.0
+ version: "1.1.0"
+ verbose: tHReD v1.1.0
transparser:
module: transParser/1.0.1
version: "1.0.1"
@@ -398,44 +430,45 @@ constants:
delly_svtop_delly_sv_annotation_parellel_script: svtop.delly.sv_annotation.parallel_8820499.py
manta_prepare_sv_vcf: manta_prepare_sv_vcf_f94bcc1.py
manta_gene_annotation_parallel: manta_sv_annotation_parallel_8820499.py
- pairoscope_mm_igtx_calling_script: mm_igtx_pairoscope_calling_b38_f3010c3.py
- pairoscope_mm_igtx_calling_tumor_only_script: mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py
+ pairoscope_mm_igtx_calling_script: mm_igtx_pairoscope_calling_b38_356362b.py
cna_seg_extend: seg_extend_229b8c7.py
plotCNVplus_Rscript: plotCNVplus_4d89cb4.R
plotSamStats_Rscript: summarize_samstats_8c45d63.R
- summarize_mm_igtx_Rscript: summarize_Ig_4b93aee.R
- process_assembled_bam: Process_Assembled_BAM_eb25fca.py
+ summarize_mm_igtx_Rscript: summarize_Ig_875a823.R
+ process_assembled_bam: Process_Assembled_BAM_277eed7.py
stats2json: samStats2json_3a90a2f.py
stats2lims: uploadStats2Lims_1ace81f.py
- annotSeg_script: annotSeg_8820499.pl
- cellranger_reference: /home/tgenref/canis_familiaris/canfam3.1/tool_specific_resources/cellranger/refdata-cellranger-hg19-1.2.0
- cellranger_vdj_reference: /home/tgenref/canis_familiaris/canfam3.1/tool_specific_resources/cellranger/refdata-cellranger-hg19-1.2.0
+ sigprofiler: sigprofiler_d78cc9e.py
+ annotSeg_script: annotSeg_7102f1c.pl
+ cellranger_reference: /home/tgenref/homo_sapiens/grch38_hg38/hg38tgen/gene_model/ensembl_v98/tool_resources/cellranger_6.0.0/hg38tgen_ensembl_v98
+ cellranger_vdj_reference: /home/tgenref/homo_sapiens/grch38_hg38/tool_specific_resources/cellranger/refdata-cellranger-vdj-GRCh38-alts-ensembl-5.0.0
scrna_chemistry_options:
X3SCR:
chemistry_name: SC3Pv1
umi_length: 10
- cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-april-2014_rc.txt
+ cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-april-2014_rc.txt
XCSCR:
chemistry_name: SC3Pv2
umi_length: 10
- cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt
+ cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-august-2016.txt
X3SC3:
chemistry_name: SC3Pv3
umi_length: 12
- cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/3M-february-2018.txt.gz
+ cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/3M-february-2018.txt
X5SCR:
chemistry_name: SC5P-R2
umi_length: 10
- cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt
+ cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-august-2016.txt
unknown:
chemistry_name: auto
umi_lenth: 10
- cell_barcode_whitelist_file: /packages/cellranger/3.1.0/cellranger-cs/3.1.0/lib/python/cellranger/barcodes/737K-august-2016.txt
+ cell_barcode_whitelist_file: /packages/easy-build/software/CellRanger/6.0.0/lib/python/cellranger/barcodes/737K-august-2016.txt
gatk_known_sites:
- /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/public_databases/eva/GCA_000002285.2_current_ids_renamed.vcf.gz
gatk_cnn_resources:
- /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/public_databases/eva/GCA_000002285.2_current_ids_renamed.vcf.gz
bwa_index: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/tool_resources/bwa_0.7.17/Canis_familiaris.CanFam3.1.dna.toplevel.fa
+ bwa_mem2_index: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/tool_resources/bwa_2.2.1/Canis_familiaris.CanFam3.1.dna.toplevel.fa
gtf: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/Canis_familiaris.CanFam3.1.98.gtf
ref_flat: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/Canis_familiaris.CanFam3.1.98.refFlat.txt
ribo_locations: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/Canis_familiaris.CanFam3.1.98.ribo.interval_list
@@ -453,18 +486,21 @@ constants:
starfusion_index: /home/tgenref/canis_familiaris/canfam3.1/canfam3.1_tgen/gene_model/ensembl_v98/tool_resources/starFusion_ensembl_v98/starFusion_Resources/ctat_genome_lib_build_dir
strandedness_options:
inward-unstranded-notapplicable:
+ rsem: "none"
salmon: "IU"
htseq: "no"
featurecounts: "0"
tophat: "-fr-unstranded"
collectrnaseqmetrics: "NONE"
inward-stranded-forward:
+ rsem: "forward"
salmon: "ISF"
htseq: "yes"
featurecounts: "1"
tophat: "-fr-secondstrand"
collectrnaseqmetrics: "FIRST_READ_TRANSCRIPTION_STRAND"
inward-stranded-reverse:
+ rsem: "reverse"
salmon: "ISR"
htseq: "reverse"
featurecounts: "2"
diff --git a/required_scripts/Process_Assembled_BAM_eb25fca.py b/required_scripts/Process_Assembled_BAM_277eed7.py
similarity index 71%
rename from required_scripts/Process_Assembled_BAM_eb25fca.py
rename to required_scripts/Process_Assembled_BAM_277eed7.py
index e0ab0b3c..c63102de 100644
--- a/required_scripts/Process_Assembled_BAM_eb25fca.py
+++ b/required_scripts/Process_Assembled_BAM_277eed7.py
@@ -696,261 +696,157 @@ def check_gene_call(table, gene, nreads, min_con_len):
for t_elem in res_gene:
emp_list.append(t_elem)
return emp_list
- # other cases
- table_by_gene = table[(table.Gene_1 == gene)]
- table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)]
- # print(table_by_gene)
- for row in table_by_gene.index:
- print("In Table")
- print(table_by_gene.at[row, 'name'])
-
- if (table_by_gene.at[row, 'fragments_at_junc_1'] >= table_by_gene.at[row, 'fragments_at_junc_2']):
- count_gene = table_by_gene.at[row, 'fragments_at_junc_1']
- else:
- count_gene = table_by_gene.at[row, 'fragments_at_junc_2']
-
- contig_length = table_by_gene.at[row, 'Contig_length_1']
-
- if (table_by_gene.at[row, 'Contig_reversed2']):
- ig_bp2_tmp = table_by_gene.at[row, 'pos_2_end']
- else:
- ig_bp2_tmp = table_by_gene.at[row, 'pos_2_start']
-
- if (('Gene_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)):
- if ('IG' in str(table_by_gene.at[row, 'Gene_3'])):
- # if(table_by_gene.at[row,'Contig_reversed3']):
- # ig_bp3_tmp = table_by_gene.at[row,'pos_3_end']
- # else:
- ig_breakpoint3 = get_genomic_bp(table_by_gene.at[row, 'cigar_3'], table_by_gene.at[row, 'aligned_length_3'],
- table_by_gene.at[row, 'Contig_reversed3'], table_by_gene.at[row, 'pos_3_start'])
- ig_breakpoint2 = get_genomic_bp(table_by_gene.at[row, 'cigar_2'], table_by_gene.at[row, 'aligned_length_2'],
- table_by_gene.at[row, 'Contig_reversed2'], table_by_gene.at[row, 'pos_2_start'])
- ig_breakpoint = str(int(ig_breakpoint2)) + ";" + str(int(ig_breakpoint3))
- # ig_bp3_tmp = table_by_gene.at[row,'pos_3_start']
- # ig_gene_cigar = table_by_gene.at[row,'cigar_3']
- # matchlen = table_by_gene.at[row,'aligned_length_3']
- # ig_breakpoint = get_genomic_bp(ig_gene_cigar, matchlen, table_by_gene.at[row,'Contig_reversed3'],ig_bp3_tmp)
- # str(int(ig_bp2_tmp))+";"+str(int(ig_bp3_tmp))
- else:
- ig_bp2_tmp = table_by_gene.at[row, 'pos_2_start']
- ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_2'], table_by_gene.at[row, 'aligned_length_2'],
- table_by_gene.at[row, 'Contig_reversed2'], ig_bp2_tmp)
- # ig_breakpoint = table_by_gene.at[row,'pos_2_start']
- else:
- ig_breakpoint = table_by_gene.at[row, 'pos_2_start']
-
- if (('cigar_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)):
- if ('IG' in str(table_by_gene.at[row, 'Gene_3'])):
- ig_cigar = table_by_gene.at[row, 'cigar_2'] + ";" + table_by_gene.at[row, 'cigar_3']
- else:
- ig_cigar = table_by_gene.at[row, 'cigar_2']
- else:
- ig_cigar = table_by_gene.at[row, 'cigar_2']
-
- pos_start = table_by_gene.at[row, 'pos_1_start']
- gene_cigar = table_by_gene.at[row, 'cigar_1']
- matchlen = table_by_gene.at[row, 'aligned_length_1']
- breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, 'Contig_reversed1'], pos_start)
-
- gene_tmp = table_by_gene.at[row, 'Gene_2']
- source = ig_dict[gene_tmp]
- gene_overlap = table_by_gene.at[row, 'aligned_length_1']
- ig_overlap = table_by_gene.at[row, 'aligned_length_2']
-
- if (table_by_gene.at[row, 'aligned_length_1'] >= min_con_len and table_by_gene.at[
- row, 'aligned_length_2'] >= min_con_len and count_gene >= nreads):
- call = 1
- print("Tx pass")
- if (table_by_gene.at[row, 'strand_1'] == 'Pos'):
- print("Found pos")
- pos_strand_der = 1
- pos_strand_list = (
- pos_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint,
- count_gene)
- elif (table_by_gene.at[row, 'strand_1'] == 'Neg'):
- neg_strand_der = 1
- print("found neg")
- neg_strand_list = (
- neg_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint,
- count_gene)
- # Case:2 Gene 2 is Target Gene
- # This is the case when IG is Gene 1 and Target Gene is gene2
- if ('Gene_2' in table):
- table_by_gene = table[(table.Gene_2 == gene)]
- table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)]
- else:
- table_by_gene = pd.DataFrame()
+    #updated code
+    #start with the query gene
+ gg = 'Gene_'+str(1)
- # print(table_by_gene)
+ table_by_gene = table #_c[(table_c.Gene_1 == gene)]
+ table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)]
+            #loop over all rows (one assembled contig per row)
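+            # The loop below replaces the previous hard-coded cases (target gene in Gene_1,
+            # Gene_2 or Gene_3) with a single pass over every Gene_N split column: splits that
+            # match the target gene feed the gene-side breakpoint, CIGAR, overlap and fragment
+            # count, while splits that map to an IG locus feed the IG-side values.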
for row in table_by_gene.index:
- print("In Table")
+ print("In Table Gene1 is target")
print(table_by_gene.at[row, 'name'])
-
- if (table_by_gene.at[row, 'fragments_at_junc_2'] >= table_by_gene.at[row, 'fragments_at_junc_1']):
- count_gene = table_by_gene.at[row, 'fragments_at_junc_2']
- else:
- count_gene = table_by_gene.at[row, 'fragments_at_junc_1']
-
- contig_length = table_by_gene.at[row, 'Contig_length_2']
-
- if (table_by_gene.at[row, 'Contig_reversed1']):
- ig_bp1_tmp = table_by_gene.at[row, 'pos_1_start'] + table_by_gene.at[row, 'aligned_length_1']
- else:
- ig_bp1_tmp = table_by_gene.at[row, 'pos_1_start'] + table_by_gene.at[row, 'aligned_length_1']
-
- # this case is highly unlikely as bam is sorted by posn.
- if (('Gene_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)):
-
- ig_bp3_tmp = table_by_gene.at[row, 'pos_3_start']
-
- if ('IG' in str(table_by_gene.at[row, 'Gene_3'])):
- ig_bp3_tmp = table_by_gene.at[row, 'pos_3_start']
- ig_breakpoint1 = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'],
- table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start'])
- ig_breakpoint3 = get_genomic_bp(table_by_gene.at[row, 'cigar_3'], table_by_gene.at[row, 'aligned_length_3'],
- table_by_gene.at[row, 'Contig_reversed3'], ig_bp3_tmp)
- ig_breakpoint = str(int(ig_breakpoint1)) + ";" + str(int(ig_breakpoint3))
- else:
- ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'],
- table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start'])
- else:
- ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'],
- table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start'])
-
- if (('cigar_3' in table) and (table_by_gene.at[row, 'Gene_3'] != 0)):
- if ('IG' in str(table_by_gene.at[row, 'Gene_3'])):
- ig_cigar = table_by_gene.at[row, 'cigar_1'] + ";" + table_by_gene.at[row, 'cigar_3']
- else:
- ig_cigar = table_by_gene.at[row, 'cigar_1']
- else:
- ig_cigar = table_by_gene.at[row, 'cigar_1']
-
- # if(table_by_gene.at[row,'Contig_reversed2']):
- # breakpoint = table_by_gene.at[row,'pos_2_end'] + table_by_gene.at[row,'aligned_length_2']
- # else:
- # breakpoint = table_by_gene.at[row,'pos_2_start'] + table_by_gene.at[row,'aligned_length_2']
-
- gene_cigar = table_by_gene.at[row, 'cigar_2']
- pos_start = table_by_gene.at[row, 'pos_2_start']
- matchlen = table_by_gene.at[row, 'aligned_length_2']
- breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, 'Contig_reversed2'], pos_start)
-
- gene_tmp = table_by_gene.at[row, 'Gene_1']
- source = ig_dict[gene_tmp]
- gene_overlap = table_by_gene.at[row, 'aligned_length_2']
- ig_overlap = table_by_gene.at[row, 'aligned_length_1']
-
- if (table_by_gene.at[row, 'aligned_length_2'] >= min_con_len and table_by_gene.at[
- row, 'aligned_length_1'] >= min_con_len and count_gene >= nreads):
+ print(table_by_gene.at[row, gg])
+
+ total_aligned_length = 0
+ ig_breakpoint = ''
+ ig_aligned_length = 0
+ gene_strand =''
+ all_gene_cigar = ''
+ all_ig_cigar = ''
+ gene_overlap = 0
+ ig_overlap = 0
+
+ #get max splits for contigs
+ max_contig_splits = 1
+ for col in table_by_gene.columns:
+ #print(col)
+                #assuming at most 9 splits per contig, which is highly unlikely
+ for tmp_idx in range(1,10):
+ tmp_gene = 'Gene_' + str(tmp_idx)
+ if(tmp_gene == col):
+ if(tmp_idx >= max_contig_splits):
+ max_contig_splits = tmp_idx
+            #add 1 so that range(1, max_contig_splits) below visits the highest Gene_N index
+ max_contig_splits = max_contig_splits + 1
+
+ print("We have total contig splits "+str(max_contig_splits))
+            #loop through each split alignment (Gene_1 .. Gene_N) of this contig
+ for gindex in range(1,max_contig_splits):
+ gg = 'Gene_'+str(gindex)
+
+ print("Gene is "+gg+" "+str(gindex))
+ gindex2 = gindex
+
+ #set indices
+ pos_col = 'pos_'+str(gindex)+'_start'
+ cigar_col= 'cigar_'+str(gindex)
+ align_col = 'aligned_length_'+str(gindex)
+ contig_flag_col = 'Contig_reversed'+str(gindex)
+ frag_col = 'fragments_at_junc_'+str(gindex)
+ contig_col = 'Contig_length_'+str(gindex)
+ strand_col = 'strand_'+str(gindex)
+
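+              # each split either matches the target gene (collect gene-side position, CIGAR,
+              # fragment count and overlap) or maps to an IG locus (collect IG-side breakpoint,
+              # CIGAR and overlap)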
+ if((table_by_gene.at[row,gg]==gene)):
+
+ #get values for target
+ print(pos_col+cigar_col)
+ pos_start = table_by_gene.at[row, pos_col]
+ gene_cigar = table_by_gene.at[row, cigar_col]
+ if(all_gene_cigar ==''):
+ all_gene_cigar = table_by_gene.at[row, cigar_col]
+ else:
+ all_gene_cigar = all_gene_cigar + table_by_gene.at[row, cigar_col]
+ matchlen = table_by_gene.at[row, align_col]
+ count_gene = table_by_gene.at[row, frag_col]
+ contig_length = table_by_gene.at[row, contig_col]
+ if(matchlen > gene_overlap):
+ gene_overlap = matchlen
+ total_aligned_length = total_aligned_length + matchlen
+ breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, contig_flag_col], pos_start)
+ gene_strand = table_by_gene.at[row, strand_col]
+ #print(" bb="+str(breakpoint)+" "+str(matchlen)+" "+gene_cigar+" "+str(pos_start))
+
+ #if gg is IG
+ elif('IG' in str(table_by_gene.at[row,gg])):
+ #print("Found IG at "+gg)
+ ig_breakpoint = get_genomic_bp(table_by_gene.at[row, cigar_col], table_by_gene.at[row, align_col],
+ table_by_gene.at[row, contig_flag_col], table_by_gene.at[row, pos_col])
+ gene_tmp = table_by_gene.at[row,gg]
+ if(table_by_gene.at[row, align_col] > ig_overlap):
+ ig_overlap = table_by_gene.at[row, align_col]
+ source = ig_dict[gene_tmp]
+ if(table_by_gene.at[row, align_col] > ig_aligned_length):
+ ig_aligned_length = table_by_gene.at[row, align_col]
+
+ ig_cigar = table_by_gene.at[row, cigar_col]
+ if(all_ig_cigar ==''):
+ all_ig_cigar = table_by_gene.at[row, cigar_col]
+ else:
+ all_ig_cigar = all_ig_cigar + table_by_gene.at[row, cigar_col]
+
+ #check all alignments for the contig
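+              # scan the remaining splits of this contig (gindex2 > gindex) and merge any that
+              # hit the same target gene or the same IG gene into the running totals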
+ while(gindex2 < max_contig_splits-1):
+ gindex2= gindex2 + 1
+ gg2 = 'Gene_'+str(gindex2)
+ #set indices
+ pos_col2 = 'pos_'+str(gindex2)+'_start'
+ cigar_col2 = 'cigar_'+str(gindex2)
+ align_col2 = 'aligned_length_'+str(gindex2)
+ contig_flag_col2 = 'Contig_reversed'+str(gindex2)
+ frag_col2 = 'fragments_at_junc_'+str(gindex2)
+
+ #if gene is same as target gene
+ if((table_by_gene.at[row,gg] == table_by_gene.at[row,gg2]) and (table_by_gene.at[row,gg]==gene)):
+ print("Found double match"+gg2)
+ align_col2 = 'aligned_length_'+str(gindex2)
+ total_aligned_length = total_aligned_length + table_by_gene.at[row, align_col2]
+ gene_cigar = gene_cigar+';'+table_by_gene.at[row, cigar_col2]
+ if(table_by_gene.at[row, frag_col2] > count_gene):
+ count_gene = table_by_gene.at[row, frag_col2]
+ if( table_by_gene.at[row, align_col2] > gene_overlap):
+ gene_overlap = table_by_gene.at[row, align_col2]
+
+ #multiple alignments of IG
+ if(('IG' in str(table_by_gene.at[row,gg2])) and (table_by_gene.at[row,gg]==table_by_gene.at[row,gg2])):
+ print("Found IG at "+gg2)
+ ig_breakpoint2 = get_genomic_bp(table_by_gene.at[row, cigar_col2], table_by_gene.at[row, align_col2],
+ table_by_gene.at[row, contig_flag_col2], table_by_gene.at[row, pos_col2])
+ if(table_by_gene.at[row, align_col2] > ig_aligned_length):
+ ig_aligned_length = table_by_gene.at[row, align_col2]
+ ig_breakpoint = str(int(ig_breakpoint))+';'+str(int(ig_breakpoint2))
+ ig_cigar = ig_cigar+';'+table_by_gene.at[row, cigar_col2]
+              #test only since we are missing fastq
+ print("************************\n"+str(total_aligned_length) + " "+str(ig_aligned_length)+"*****************\n")
+              #gene_overlap holds the longest single alignment; total_aligned_length holds the total aligned region
+              #switch total to the max so the filter below uses the longest alignment
+              #we keep both in case we want to report the total length in later revisions
+ total_aligned_length = gene_overlap
+ #ig_overlap = ig_aligned_length
+ if (total_aligned_length >= min_con_len and ig_aligned_length >= min_con_len and count_gene >= nreads):
call = 1
print("Tx pass")
- if (table_by_gene.at[row, 'strand_2'] == 'Pos'):
+ if (gene_strand == 'Pos'):
print("Found pos")
pos_strand_der = 1
pos_strand_list = (
- pos_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint,
+ pos_strand_der, contig_length, gene_overlap, all_gene_cigar, ig_overlap, all_ig_cigar, breakpoint, ig_breakpoint,
count_gene)
- elif (table_by_gene.at[row, 'strand_2'] == 'Neg'):
+ elif (gene_strand == 'Neg'):
neg_strand_der = 1
print("found neg")
neg_strand_list = (
- neg_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint,
+ neg_strand_der, contig_length, gene_overlap, all_gene_cigar, ig_overlap, all_ig_cigar, breakpoint, ig_breakpoint,
count_gene)
- ########
- # Case 3: Gene_3 is target gene when 1 and 2 match to IG and 3 is the gene. Case of IG multialignment
- ########
- if ('Gene_3' in table):
- table_by_gene = table[(table.Gene_3 == gene)]
- table_by_gene = table_by_gene[(table_by_gene.IgTxCalled == 1)]
- else:
- table_by_gene = pd.DataFrame()
- # print(table_by_gene)
- for row in table_by_gene.index:
- print("In Table")
- print(table_by_gene.at[row, 'name'])
-
- if (table_by_gene.at[row, 'fragments_at_junc_3'] >= table_by_gene.at[row, 'fragments_at_junc_1']):
- count_gene = table_by_gene.at[row, 'fragments_at_junc_3']
- else:
- count_gene = table_by_gene.at[row, 'fragments_at_junc_1']
-
- contig_length = table_by_gene.at[row, 'Contig_length_3']
-
- if (table_by_gene.at[row, 'Contig_reversed1']):
- ig_bp1_tmp = table_by_gene.at[row, 'pos_1_end']
- else:
- ig_bp1_tmp = table_by_gene.at[row, 'pos_1_start']
-
- if (('Gene_2' in table) and (table_by_gene.at[row, 'Gene_2'] != 0)):
- if (table_by_gene.at[row, 'Contig_reversed2']):
- ig_bp2_tmp = table_by_gene.at[row, 'pos_2_end']
- else:
- ig_bp2_tmp = table_by_gene.at[row, 'pos_2_start']
-
- if ('IG' in str(table_by_gene.at[row, 'Gene_2'])):
- ig_breakpoint1 = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'],
- table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start'])
- ig_breakpoint2 = get_genomic_bp(table_by_gene.at[row, 'cigar_2'], table_by_gene.at[row, 'aligned_length_2'],
- table_by_gene.at[row, 'Contig_reversed2'], table_by_gene.at[row, 'pos_2_start'])
- ig_breakpoint = str(int(ig_breakpoint1)) + ";" + str(int(ig_breakpoint2))
-
- # ig_breakpoint = str(int(ig_bp1_tmp))+";"+str(int(ig_bp2_tmp)) # str(int(table_by_gene.at[row,'pos_1_start']))+";"+str(int(table_by_gene.at[row,'pos_2_start']))
- else:
- ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'],
- table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start'])
- # ig_breakpoint =ig_bp1_tmp #table_by_gene.at[row,'pos_1_start']
- else:
- ig_breakpoint = get_genomic_bp(table_by_gene.at[row, 'cigar_1'], table_by_gene.at[row, 'aligned_length_1'],
- table_by_gene.at[row, 'Contig_reversed1'], table_by_gene.at[row, 'pos_1_start'])
- # ig_breakpoint = table_by_gene.at[row,'pos_1_start']
-
- if (('cigar_2' in table) and (table_by_gene.at[row, 'Gene_2'] != 0)):
- if ('IG' in str(table_by_gene.at[row, 'Gene_2'])):
- ig_cigar = table_by_gene.at[row, 'cigar_1'] + ";" + table_by_gene.at[row, 'cigar_2']
- else:
- ig_cigar = table_by_gene.at[row, 'cigar_1']
- else:
- ig_cigar = table_by_gene.at[row, 'cigar_1']
-
- # if(table_by_gene.at[row,'Contig_reversed3']):
- # breakpoint = table_by_gene.at[row,'pos_3_start'] + table_by_gene.at[row,'aligned_length_3']
- # else:
- # breakpoint = table_by_gene.at[row,'pos_3_start'] +table_by_gene.at[row,'aligned_length_3']
-
- gene_cigar = table_by_gene.at[row, 'cigar_3']
- pos_start = table_by_gene.at[row, 'pos_3_start']
- matchlen = table_by_gene.at[row, 'aligned_length_3']
- breakpoint = get_genomic_bp(gene_cigar, matchlen, table_by_gene.at[row, 'Contig_reversed3'], pos_start)
-
- gene_tmp = table_by_gene.at[row, 'Gene_1']
- source = ig_dict[gene_tmp]
- gene_overlap = table_by_gene.at[row, 'aligned_length_3']
- ig_overlap = table_by_gene.at[row, 'aligned_length_1']
-
- if (table_by_gene.at[row, 'aligned_length_3'] >= min_con_len and table_by_gene.at[
- row, 'aligned_length_1'] >= min_con_len and count_gene >= nreads):
- call = 1
- print("Tx pass")
- if (table_by_gene.at[row, 'strand_3'] == 'Pos'):
- print("Found pos")
- pos_strand_der = 1
- pos_strand_list = (
- pos_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint,
- count_gene)
- elif (table_by_gene.at[row, 'strand_3'] == 'Neg'):
- neg_strand_der = 1
- print("found neg")
- neg_strand_list = (
- neg_strand_der, contig_length, gene_overlap, gene_cigar, ig_overlap, ig_cigar, breakpoint, ig_breakpoint,
- count_gene)
+##end of updated code
print("call = " + str(call))
# Collapse into one list
if (call == 1):
- print(pos_strand_list)
- print(neg_strand_list)
+ #print(pos_strand_list)
+ #print(neg_strand_list)
res_gene = [call, source]
for x in pos_strand_list:
res_gene.append(x)
@@ -1002,7 +898,7 @@ def gen_summ_table(filt_table, results_table, nreads, min_con_len, sample):
results = ((sample,), nsd2_call, ccnd3_call, myc_call, mafa_call, ccnd1_call, ccnd2_call, maf_call, mafb_call)
- print(mafb_call)
+ #print(mafb_call)
con_results = []
con_results.append(sample)
# con_results=con_results+nsd2_call+ccnd3_call+myc_call+mafa_call+ccnd1_call+ccnd2_call+maf_call+mafb_call
@@ -1024,7 +920,7 @@ def gen_summ_table(filt_table, results_table, nreads, min_con_len, sample):
con_results.append(x)
print("full list")
- print(con_results)
+ #print(con_results)
translocationsTable = pd.DataFrame(columns=column_names)
translocationsTable.loc[len(translocationsTable)] = con_results
return translocationsTable
@@ -1101,7 +997,7 @@ def getFragsatJunction_samtools(location, contig, bam):
# generate summary table
summ_table = gen_summ_table(filt_table, results_table, min_reads, window_size, sample_name)
-print(summ_table)
+#print(summ_table)
out_file_summ = out_path + "/DEX_IgTx_GA_Summary.txt"
summ_table.to_csv(out_file_summ, sep="\t", index=False, na_rep=0, float_format='%.0f')
print("Test Done")
diff --git a/required_scripts/README.md b/required_scripts/README.md
index d41d8272..c72fb693 100644
--- a/required_scripts/README.md
+++ b/required_scripts/README.md
@@ -27,16 +27,17 @@ $ cp plotCNVplus.R /path/to/phoenix/required_scripts/plotCNVplus_4d89cb4.R
## Script Source Locations
[addRC_to_Delly_VCF_f4d178e.py](https://github.com/tgen/jetstream_resources/commit/f4d178e2b8982ff49025d42cb7c18d7b12053f42)
-[annotSeg_8820499.pl](https://github.com/tgen/jetstream_resources/commit/8820499e113a387fee98044112951fa534ad6f8e)
-[Process_Assembled_BAM_eb25fca.py](https://github.com/tgen/GaMMiT/commit/eb25fca1769e56048439efe80479759e164433cf)
+[annotSeg_7102f1c.pl](https://github.com/tgen/jetstream_resources/commit/7102f1c4fbdabe8aa16d714aa0cfb8787209df6f)
+[Process_Assembled_BAM_277eed7.py](https://github.com/tgen/GaMMiT/commit/277eed728712fa8e636858055ecbf1be270cc114)
[manta_prepare_sv_vcf_f94bcc1.py](https://github.com/tgen/jetstream_resources/commit/f94bcc13c826f7d5a4088347e305ffcb49ae6a8e)
[manta_sv_annotation_parallel_8820499.py](https://github.com/tgen/jetstream_resources/commit/8820499e113a387fee98044112951fa534ad6f8e)
-[mm_igtx_pairoscope_calling_b38_f3010c3.py](https://github.com/tgen/mm_IgTx_Calling/commit/f3010c358970f4c25cefddc824636f60a19842e1)
+[mm_igtx_pairoscope_calling_b38_356362b.py](https://github.com/tgen/mm_IgTx_Calling/commit/356362b03f13181f2762ab468f9b4f222439ea69)
[plotCNVplus_4d89cb4.R](https://github.com/tgen/plotCNVplus/commit/4d89cb4d8f35e48b916d660c82c52b8725ade16f)
[runIchorCNA_47ce8db.R](https://github.com/broadinstitute/ichorCNA/commit/47ce8db4d81ada2d3ce09280661d1240f3dcd530#diff-79cb887cc56cef135b77c5b7a725975c)
[samStats2json_3a90a2f.py](https://github.com/tgen/samStats2json/commit/3a90a2fefd8fc60a5ebd391dca6702fae419f32f)
+[sigprofiler_d78cc9e.py](https://github.com/tgen/jetstream_resources/commit/d78cc9e243e2fbfe4e7adae91f1ce70015e38131)
[seg_extend_229b8c7.py](https://github.com/tgen/jetstream_resources/commit/229b8c7641dd505789664aab88c1662d1f97e429)
[summarize_samstats_8c45d63.R](https://github.com/tgen/plot_samstats/commit/8c45d63dbd7f5037d7bb658ac91647898bf7509f)
-[summarize_Ig_4b93aee.R](https://github.com/tgen/jetstream_resources/commit/4b93aeea52ebc4721621a58a7520f676b4b97001)
+[summarize_Ig_875a823.R](https://github.com/tgen/jetstream_resources/commit/875a823202ba698d7adc1f25db86290b67d19028)
[svtop.delly.sv_annotation.parallel_8820499.py](https://github.com/tgen/jetstream_resources/commit/8820499e113a387fee98044112951fa534ad6f8e)
[uploadStats2Lims_1ace81f.py](https://github.com/tgen/uploadStats2Lims/pull/2/commits/1ace81faaea5f894b9f618d86b1d2d9b8149cdc6)
diff --git a/required_scripts/annotSeg_8820499.pl b/required_scripts/annotSeg_7102f1c.pl
similarity index 86%
rename from required_scripts/annotSeg_8820499.pl
rename to required_scripts/annotSeg_7102f1c.pl
index 56c938e2..139ff4bd 100755
--- a/required_scripts/annotSeg_8820499.pl
+++ b/required_scripts/annotSeg_7102f1c.pl
@@ -93,6 +93,7 @@
print OFILE "##INFO=\n";
print OFILE "##INFO=\n";
print OFILE "##INFO=\n";
+print OFILE "##INFO=\n";
print OFILE "##INFO=\n";
print OFILE "##INFO=\n";
print OFILE "##ALT=\n";
@@ -119,7 +120,10 @@
$qual=abs($temp[5]);
- if ($temp[0] =~/^\"ID\"/) {next LOOP;}
+  # Added ^Sample here to continue legacy support of this script with newer outputs.
+  # Since this is not a recommended script, we used a solution that required minimal
+  # development time.
+ if ($temp[0] =~/^\"ID\"/ || $temp[0] =~/^Sample/) {next LOOP;}
$st=int($temp[2]/100);
$en=int($temp[3]/100);
@@ -146,18 +150,20 @@
}
if (@gns){
- $vcfline="$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$temp[4];LOG2FC=$temp[5]";
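+            # SVLEN is now computed from the segment boundaries; the 5th seg column that was
+            # previously written to SVLEN is reported under the new MARKERS tag instead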
+ $svlen = $temp[3] - $temp[2];
+ $vcfline="$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$svlen;MARKERS=$temp[4];LOG2FC=$temp[5]";
$genes=join(",",@gns);
print OFILE "$vcfline;GENE=$genes\n";
}
else{
- print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$temp[4];LOG2FC=$temp[5]\n";
+ $svlen = $temp[3] - $temp[2];
+ print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\tPASS\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$svlen;MARKERS=$temp[4];LOG2FC=$temp[5]\n";
}
}else{
- print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\t.\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$temp[4];LOG2FC=$temp[5]\n";
+ $svlen = $temp[3] - $temp[2];
+ print OFILE "$temp[1]\t$temp[2]\t$temp[3]\tN\t$alt\t$qual\t.\tIMPRECISE;SVTYPE=$alt;END=$temp[3];SVLEN=$svlen;MARKERS=$temp[4];LOG2FC=$temp[5]\n";
}
}
close(OFILE);
close(FILE);
-
diff --git a/required_scripts/mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py b/required_scripts/mm_igtx_pairoscope_calling_b38_356362b.py
old mode 100644
new mode 100755
similarity index 98%
rename from required_scripts/mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py
rename to required_scripts/mm_igtx_pairoscope_calling_b38_356362b.py
index 4d185c50..932f0fe8
--- a/required_scripts/mm_igtx_pairoscope_calling_b38_tumor_only_39d1efa.py
+++ b/required_scripts/mm_igtx_pairoscope_calling_b38_356362b.py
@@ -139,19 +139,22 @@ def call_translocations(sample, gene, gene_chr, order, window_start=0, window_en
print("dreads.PositionA is: " + str(dreads.PositionA))
gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr14")
& (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)
- & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844745))]
+ & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844760))
+ & ((dreads.PositionA <= 143243200) | (dreads.PositionA >= 143244000))]
print(str(gene_igh.columns))
print(str(gene_igh))
count_gene_igh = len(gene_igh.index)
gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr)
& (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)
- & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844745))]
+ & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844760))
+ & ((dreads.PositionA <= 143243200) | (dreads.PositionA >= 143244000))]
count_gene_igk = len(gene_igk.index)
print(str(gene_igk.columns))
print(str(gene_igk))
gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22")
& (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)
- & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844745))]
+ & ((dreads.PositionA <= 143844450) | (dreads.PositionA >= 143844760))
+ & ((dreads.PositionA <= 143243200) | (dreads.PositionA >= 143244000))]
count_gene_igl = len(gene_igl.index)
print(str(gene_igl.columns))
print(str(gene_igl))
diff --git a/required_scripts/mm_igtx_pairoscope_calling_b38_f3010c3.py b/required_scripts/mm_igtx_pairoscope_calling_b38_f3010c3.py
deleted file mode 100755
index dd7d5046..00000000
--- a/required_scripts/mm_igtx_pairoscope_calling_b38_f3010c3.py
+++ /dev/null
@@ -1,386 +0,0 @@
-#!/usr/bin/env python3
-
-# MYC blacklist region chr8:129571461-129571635 (b37)
-# MYC blacklist region chr8:128559215-128559389 (b38)
-
-#################################################################
-# Configure the enviroment
-import pandas as pd
-import argparse
-from multiprocessing import Pool
-
-#################################################################
-
-parser = argparse.ArgumentParser(description='Process pairoscope discordant reads to make myeloma Ig translocation calls.')
-
-parser.add_argument('-i', '--input_file',
- required=True,
- metavar='File.tsv',
- dest="input_file",
- help='Discordant_read_table')
-parser.add_argument('-s', '--specimen',
- required=True,
- help='Specimen Name, must match discordant read table')
-parser.add_argument('-o', '--output_file',
- required=True,
- metavar='File.tsv',
- help='Output filename')
-parser.add_argument('-w', '--window',
- default=2000,
- type=int,
- metavar='INT',
- help='Genomic window size to query (recommend 2.5x insert size)')
-parser.add_argument('-m', '--window_min',
- default=100,
- type=int,
- metavar='INT',
- help='Required discordant read island size (recommend 0.1x insert size)')
-parser.add_argument('-c', '--call_requirement',
- default=3,
- type=int,
- metavar='INT',
- help='Required number of discordant read pairs meeting requirements to define a call (recommend >= 3)')
-
-args = parser.parse_args()
-
-#################################################################
-# Capture Run Parameters
-
-window = args.window
-call_requirement = args.call_requirement
-window_min = args.window_min
-sample = args.specimen
-
-#################################################################
-# Read in the discordant reads table
-
-dreads = pd.read_csv(args.input_file, sep="\t")
-
-#################################################################
-# Define Functions
-
-def call_translocations(sample, gene, gene_chr, order, window_start=0, window_end=300000000):
- # initialize variables
- gene_window = 0
- gene_maxwindowcount = 0
- gene_nextlargestwindowcount = 0
- gene_windowend = 0
- gene_maxwindowwidth = 0
- gene_maxwindowlocation = 0
- gene_nextwindowwidth = 0
- gene_nextwindowlocation = 0
- gene_ighsource = 0
-
- # Debug
- print('')
- print('------------------------------')
- print(sample)
- print(gene)
- print(gene_chr)
- print(order)
-
- # Determine which immunoglobulin loci is the most likely to contain a translocation
- # Watch ChrA and ChrB as they are in numeric order so dependign on Tx and igH region partner
- # and target can switch columns
- if order == "standard":
- print('1 - In Standard Loop')
- if gene != 'MYC':
- print('1 - In MYC Sub-loop')
- gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr14")
- & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)]
-
- count_gene_igh = len(gene_igh.index)
-
- gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr)
- & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)]
- count_gene_igk = len(gene_igk.index)
- gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22")
- & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)]
- count_gene_igl = len(gene_igl.index)
- else:
- print('1 - In Non-MYC Sub-loop')
- gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr14")
- & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)
- & ((dreads.PositionA <= 128559215) | (dreads.PositionA >= 128559389))]
- count_gene_igh = len(gene_igh.index)
- gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr)
- & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)
- & ((dreads.PositionB <= 128559215) | (dreads.PositionB >= 128559389))]
- count_gene_igk = len(gene_igk.index)
- gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22")
- & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)
- & ((dreads.PositionA <= 128559215) | (dreads.PositionA >= 128559389))]
- count_gene_igl = len(gene_igl.index)
- print(count_gene_igh)
- print(count_gene_igk)
- print(count_gene_igl)
- if count_gene_igh >= count_gene_igk and count_gene_igh >= count_gene_igl:
- print('2-In IgH Loop')
- # Suspect igH Translocation as igH count greater then igK and igL
- # if all three have zeros or the same number we default to the igH)
- table_gene = gene_igh
- count_gene = len(table_gene.index)
- gene_ighsource = 1
- elif count_gene_igk > count_gene_igh and count_gene_igk > count_gene_igl:
- print('2-In IgK Loop')
- # Suspect igK Translocation
- # WARNiNG THE CHROMOSOME ORDER BECOMES UNEXPECTED, HOW TO FiX?
- # Create Flipped Table for the kappa counts ( ChrA PositionA ChrB PositionB)
- # Update the column headers
- new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA']
- gene_igk.columns = new_cols
- # Create new GENE table with the relabelled columns in the expected order
- cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB']
- table_gene = gene_igk[cols]
- count_gene = len(table_gene.index)
- gene_ighsource = 2
- elif count_gene_igl > count_gene_igh and count_gene_igl > count_gene_igk:
- print('2-In IgL Loop')
- # Suspect igL Translocation
- table_gene = gene_igl
- count_gene = len(table_gene.index)
- gene_ighsource = 3
- else:
- # Error capture NOT AN EXPECTED EVENT
- # Could be 0 in all three potentially, then what?
- # What if two but not all three have the same number of counts?
- print('2 - Argh - WHAT CAUSES THiS TO HAPPEN')
- if count_gene_igh < 3 and count_gene_igk < 3 and count_gene_igl < 3:
- print('DEFAULTiNG TO iGH COUNTS BECAUSE ALL LESS THAN 3')
- table_gene = gene_igh
- count_gene = len(table_gene.index)
- gene_ighsource = 1
- elif count_gene_igh < count_gene_igk and count_gene_igh < count_gene_igl and count_gene_igk == count_gene_igl:
- print('DEFAULTiNG TO iGH COUNTS BECAUSE IgK and IgL ARE EQUAL EVEN THOUGH LESS')
- table_gene = gene_igh
- count_gene = len(table_gene.index)
- gene_ighsource = 1
-
- elif order == "reverse":
- print('1 - In Reverse Loop')
- gene_igh = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr14") & (dreads.ChrB == gene_chr)
- & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)]
- count_gene_igh = len(gene_igh.index)
- gene_igk = dreads[(dreads.Specimen == sample) & (dreads.ChrA == "chr2") & (dreads.ChrB == gene_chr)
- & (dreads.PositionB >= window_start) & (dreads.PositionB <= window_end)]
- count_gene_igk = len(gene_igk.index)
- gene_igl = dreads[(dreads.Specimen == sample) & (dreads.ChrA == gene_chr) & (dreads.ChrB == "chr22")
- & (dreads.PositionA >= window_start) & (dreads.PositionA <= window_end)]
- count_gene_igl = len(gene_igl.index)
-
- print(count_gene_igh)
- print(count_gene_igk)
- print(count_gene_igl)
-
- if count_gene_igh >= count_gene_igk and count_gene_igh >= count_gene_igl:
- print('2-In IgH Loop')
- # Suspect igH Translocation as igH count greater then igK and igL
- # if all three have zeros or the same number we default to the igH
- # Update the column headers
- new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA']
- gene_igh.columns = new_cols
- # Create new GENE table with the relabelled columns in the expected order
- cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB']
- table_gene = gene_igh[cols]
- count_gene = len(table_gene.index)
- gene_ighsource = 1
- elif count_gene_igk > count_gene_igh and count_gene_igk > count_gene_igl:
- print('2-In IgK Loop')
- # Suspect igK Translocation
- # WARNiNG THE CHROMOSOME ORDER BECOMES UNEXPECTED, HOW TO FiX?
- # Create Flipped Table for the kappa counts ( ChrA PositionA ChrB PositionB)
- # Update the column headers
- new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA']
- gene_igk.columns = new_cols
- # Create new GENE table with the relabelled columns in the expected order
- cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB']
- table_gene = gene_igk[cols]
- count_gene = len(table_gene.index)
- gene_ighsource = 2
- elif count_gene_igl > count_gene_igh and count_gene_igl > count_gene_igk:
- print('2-In IgL Loop')
- # Suspect igL Translocation
- table_gene = gene_igl
- count_gene = len(table_gene.index)
- gene_ighsource = 3
- else:
- # Error capture NOT AN EXPECTED EVENT
- # Could be 0 in all three potentially, then what?
- # What if two but not all three have the same number of counts?
- print('2 - Argh - WHAT CAUSES THiS TO HAPPEN')
- if count_gene_igh < 3 and count_gene_igk < 3 and count_gene_igl < 3:
- print('DEFAULTiNG TO iGH COUNTS BECAUSE ALL LESS THAN 3')
- # Update the column headers
- new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA']
- gene_igh.columns = new_cols
- # Create new GENE table with the relabelled columns in the expected order
- cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB']
- table_gene = gene_igh[cols]
- count_gene = len(table_gene.index)
- gene_ighsource = 1
- elif count_gene_igh < count_gene_igk and count_gene_igh < count_gene_igl and count_gene_igk == count_gene_igl:
- print('DEFAULTiNG TO iGH COUNTS BECAUSE IgK and IgL ARE EQUAL EVEN THOUGH LESS')
- # Update the column headers
- new_cols = ['Speciment', 'ChrB', 'PositionB', 'ChrA', 'PositionA']
- gene_igh.columns = new_cols
- # Create new GENE table with the relabelled columns in the expected order
- cols = ['Speciment', 'ChrA', 'PositionA', 'ChrB', 'PositionB']
- table_gene = gene_igh[cols]
- count_gene = len(table_gene.index)
- gene_ighsource = 1
- # Do Stuff
- if count_gene > 1:
- # Sort the table by PositionA
- gene_sorted = table_gene.sort_values(['PositionA'], ascending=True, inplace=False, kind='quicksort', na_position='last')
- gene_top = gene_sorted.head(1)
- gene_bottom = gene_sorted.tail(1)
- gene_top_position = gene_top.iat[0, 2]
- gene_bottom_position = gene_bottom.iat[0, 2]
- gene_window = gene_bottom_position - gene_top_position
- # There are likely some true breakpoints with two breakpoint clusters and these create large windows
- # To account for this issue we need to count discordant reads per window
- for row in gene_sorted.index:
- # Get PositionA value, calculate +/- window, capture rows within the +/- window
- position_current = gene_sorted.at[row, 'PositionA']
- position_negative = (position_current - window)
- position_positive = (position_current + window)
- position_table = gene_sorted[
- (gene_sorted.PositionA > position_negative) & (gene_sorted.PositionA < position_positive)]
- position_count = len(position_table.index)
- # Create Tracking Variables
- # if the current window cound is greater than existing Max, update Max and Next accordingly
- if position_count > gene_maxwindowcount:
- gene_nextlargestwindowcount = gene_maxwindowcount
- gene_maxwindowcount = position_count
- # NEW FEATURE START
- gene_nextwindowwidth = gene_maxwindowwidth
- gene_nextwindowlocation = gene_maxwindowlocation
- gene_max_window_top = position_table.head(1)
- gene_max_window_bottom = position_table.tail(1)
- gene_max_window_top_position = gene_max_window_top.iat[0, 2]
- gene_max_window_bottom_position = gene_max_window_bottom.iat[0, 2]
- gene_maxwindowwidth = gene_max_window_bottom_position - gene_max_window_top_position
- gene_maxwindowlocation = int( (gene_max_window_bottom_position + gene_max_window_top_position) / 2 )
- # NEW FEATURE END
- # if the two bundle counts will be greater than total count reset the NextLargest to 0
- if (gene_nextlargestwindowcount + gene_maxwindowcount) > count_gene:
- gene_nextlargestwindowcount = 0
- # NEW FEATURE START
- gene_nextwindowwidth = 0
- gene_nextwindowlocation = 0
- # NEW FEATURE END
- gene_windowend = position_positive
- # if the second bundle in order is greater than the current next but less than the max then capture current as a next
- elif position_count > gene_nextlargestwindowcount and position_count <= gene_maxwindowcount and position_negative >= gene_windowend:
- gene_nextlargestwindowcount = position_count
- # NEW FEATURE START
- # Reusing the Max calculation variables BUT only to calculate the Next window and location (KiSS)
- gene_max_window_top = position_table.head(1)
- gene_max_window_bottom = position_table.tail(1)
- gene_max_window_top_position = gene_max_window_top.iat[0, 2]
- gene_max_window_bottom_position = gene_max_window_bottom.iat[0, 2]
- gene_nextwindowwidth = gene_max_window_bottom_position - gene_max_window_top_position
- gene_nextwindowlocation = int( (gene_max_window_bottom_position + gene_max_window_top_position) / 2 )
- # NEW FEATURE END
-
- # Make result calls
- if gene_maxwindowcount >= call_requirement and gene_maxwindowwidth >= window_min and gene_nextlargestwindowcount >= call_requirement and gene_nextwindowwidth >= window_min:
- gene_call = 1
- gene_bundle_count = 2
- elif gene_maxwindowcount >= call_requirement and gene_maxwindowwidth >= window_min and (
- gene_nextlargestwindowcount < call_requirement or gene_nextwindowwidth < window_min):
- gene_call = 1
- gene_bundle_count = 1
- # remove the width and location calculations from the 1sies and 2sies (below call requirement)
- gene_nextwindowwidth = 0
- gene_nextwindowlocation = 0
- else:
- gene_call = 0
- gene_bundle_count = 0
- gene_maxwindowwidth = 0
- # remove the width and location calculations from the 1sies and 2sies (below call requirement)
- gene_maxwindowlocation = 0
- gene_nextwindowwidth = 0
- gene_nextwindowlocation = 0
- print('Window Count = ' + str(count_gene))
- return (count_gene, gene_window, gene_maxwindowcount, gene_maxwindowwidth, gene_maxwindowlocation,
- gene_nextlargestwindowcount, gene_nextwindowwidth, gene_nextwindowlocation, gene_call, gene_ighsource,
- gene_bundle_count)
-
-
-#################################################################
-# Make Column Names for Translocation Table
-list_of_genes = ['NSD2', 'CCND3', 'MYC', 'MAFA', 'CCND1', 'CCND2', 'MAF', 'MAFB']
-list_of_features = ['COUNT', 'WINDOW', 'MAXWINDOWCOUNT', 'MAXWINDOWWIDTH', 'MAXWINDOWLOCATION',
- 'NEXTWINDOWCOUNT', 'NEXTWINDOWWIDTH', 'NEXTWINDOWLOCATION', 'CALL', 'IGSOURCE', 'BUNDLECOUNT']
-
-column_names = ['SAMPLE']
-# Make list of column_names:
-for gene in list_of_genes:
- for feature in list_of_features:
- header = '_'.join([gene, feature])
- column_names.append(header)
-
-
-#################################################################
-# Function to call all functions for a sample
-
-def make_sample_calls(sample):
- sample_id = sample
-
- # MMSET
- mmset_call = call_translocations(gene="NSD2", gene_chr="chr4", order="standard", sample=sample_id)
-
- # CCND3
- ccnd3_call = call_translocations(gene="CCND3", gene_chr="chr6", order="standard", sample=sample_id)
-
- # MYC
- myc_call = call_translocations(gene="MYC", gene_chr="chr8", order="standard", window_start=124987758,
- window_end=129487754, sample=sample_id)
-
- # MAFA
- mafa_call = call_translocations(gene="MAFA", gene_chr="chr8", order="standard", window_start=142918584,
- window_end=143925832, sample=sample_id)
-
- # CCND1
- ccnd1_call = call_translocations(gene="CCND1", gene_chr="chr11", order="standard", sample=sample_id)
-
- # CCND2
- ccnd2_call = call_translocations(gene="CCND2", gene_chr="chr12", order="standard", sample=sample_id)
-
- # MAF
- maf_call = call_translocations(gene="MAF", gene_chr="chr16", order="reverse", sample=sample_id)
-
- # MAFB
- mafb_call = call_translocations(gene="MAFB", gene_chr="chr20", order="reverse", sample=sample_id)
-
- results = ((sample,), mmset_call, ccnd3_call, myc_call, mafa_call, ccnd1_call, ccnd2_call, maf_call, mafb_call)
- return (sum(results, ()))
-
-
-#################################################################
-# Create pool for multithreading
-
-pool = Pool(processes=2)
-
-jobs = []
-job = pool.apply_async(make_sample_calls, [sample])
-jobs.append(job)
-
-results = []
-for job in jobs:
- print(job)
- result = job.get()
- print(result)
- results.append(result)
-
-
-translocationsTable = pd.DataFrame(results, columns=column_names)
-
-# Save results
-translocationsTable.to_csv(args.output_file, sep="\t", index=False, na_rep="NaN")
-
-#################################################################
-
-# Message all done
-print('ALL PROCESSES COMPLETE')
diff --git a/required_scripts/sigprofiler_d78cc9e.py b/required_scripts/sigprofiler_d78cc9e.py
new file mode 100644
index 00000000..9a2f9ca2
--- /dev/null
+++ b/required_scripts/sigprofiler_d78cc9e.py
@@ -0,0 +1,146 @@
+#!/usr/bin/env python3
+
+import os
+import sys
+import argparse
+from concurrent.futures import ThreadPoolExecutor
+from SigProfilerMatrixGenerator.scripts import SigProfilerMatrixGeneratorFunc as matGen
+from SigProfilerExtractor import sigpro as sig
+
+currentDirectory = os.getcwd()
+
+def parse_arguments():
+ """Parse arguments, validate and return the args"""
+
+ parser = argparse.ArgumentParser(
+ description='Compute mutational signatures.',
+ formatter_class=argparse.RawTextHelpFormatter)
+
+ parser.add_argument('-o', '--output', default=currentDirectory,
+ help='Path to and name of output directory - Can be relative or full path')
+
+ parser.add_argument('-i', '--vcfpath',
+ help='Path to directory containing vcf file(s) - Can be relative or full path')
+
+ parser.add_argument('-g', '--genome', default='GRCh38',
+ help='Optional definition of genome, defaults to GRCh38')
+
+ parser.add_argument('-p', '--project', metavar="",
+ help='Name of the project')
+
+ parser.add_argument('-e', '--exome', default=False, action="store_true",
+ help='Set if input is from exome')
+
+ parser.add_argument('-t', '--threads', default=-1, type=int,
+ help='Set number of threads, default will use all threads')
+
+ parser.add_argument('--matrix_only', default=False, action="store_true",
+ help='Stop after mutational matrix generation')
+
+ parser.add_argument('--extract_only', default=False, action="store_true",
+ help='Stop after SigProfilerExtractor')
+
+ # prints help message when 0 arguments are entered
+ if len(sys.argv) == 1:
+ parser.print_help(sys.stderr)
+ sys.exit(1)
+
+ parser_args = parser.parse_args()
+
+ return parser_args
+
+def extractSignatures(output,
+ vcfpath,
+ genome,
+ project,
+ sigClass='SBS',
+ sigContext='96',
+ exome=False,
+ threads=-1,
+ extract_only=False):
+ """Extracts signatures from a matrix generated by SigProfilerMatrixGenerator"""
+
+ sigType = sigClass + sigContext
+
+ if exome:
+ input_data = vcfpath+"/output/"+sigClass+"/"+project+"."+sigType+".exome"
+ else:
+ input_data = vcfpath+"/output/"+sigClass+"/"+project+"."+sigType+".all"
+
+ sig.sigProfilerExtractor("matrix", output, input_data, reference_genome=genome, opportunity_genome = genome, context_type=sigType ,minimum_signatures=1, maximum_signatures=10, cpu=threads)
+
+ if not extract_only:
+ from SigProfilerExtractor import decomposition as decomp
+ signatures = output+"/"+sigType+"/Suggested_Solution/"+sigType+"_De-Novo_Solution/Signatures/"+sigType+"_De-Novo_Signatures.txt"
+ activities = output+"/"+sigType+"/Suggested_Solution/"+sigType+"_De-Novo_Solution/Activities/"+sigType+"_De-Novo_Activities_refit.txt"
+ samples = output+"/"+sigType+"/Samples.txt"
+
+ #to get all cosmic signatures without filtering
+ decomp.decompose(signatures, activities, samples, output, genome_build=genome, verbose=False, nnls_add_penalty=0.0, nnls_remove_penalty=0.0, initial_remove_penalty=0.0, de_novo_fit_penalty=0.02)
+
+def main():
+ # Parse and validate arguments
+ args = parse_arguments()
+
+ matrices = matGen.SigProfilerMatrixGeneratorFunc(args.project, args.genome, args.vcfpath, plot=True, exome=args.exome, bed_file=None, chrom_based=False, tsb_stat=False, seqInfo=False, cushion=100)
+
+ if args.matrix_only:
+ print("User requested matrix generation only, exiting before submitting to SigProfilerExtractor...\n")
+ sys.exit(0)
+
+ num_tasks = 0
+ sig_list = []
+
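+    # each mutation class (SBS, DBS, ID) is queued for SigProfilerExtractor only when its
+    # matrix contains at least one mutation; missing or empty classes have their output
+    # directory removed instead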
+ try:
+ if matrices['96'][args.project].sum() > 0:
+ num_tasks = num_tasks + 1
+ sig_list.append(('SBS', '96'))
+ else:
+ if os.path.exists(args.vcfpath+"/output/SBS"):
+ for f in os.listdir(args.vcfpath+"/output/SBS"):
+ os.remove(os.path.join(args.vcfpath+"/output/SBS", f))
+ os.rmdir(args.vcfpath+"/output/SBS")
+ except:
+ if os.path.exists(args.vcfpath+"/output/SBS"):
+ for f in os.listdir(args.vcfpath+"/output/SBS"):
+ os.remove(os.path.join(args.vcfpath+"/output/SBS", f))
+ os.rmdir(args.vcfpath+"/output/SBS")
+
+ try:
+ if matrices['DINUC'][args.project].sum() > 0:
+ num_tasks = num_tasks + 1
+ sig_list.append(('DBS', '78'))
+ else:
+ if os.path.exists(args.vcfpath+"/output/DBS"):
+ for f in os.listdir(args.vcfpath+"/output/DBS"):
+ os.remove(os.path.join(args.vcfpath+"/output/DBS", f))
+ os.rmdir(args.vcfpath+"/output/DBS")
+ except:
+ if os.path.exists(args.vcfpath+"/output/DBS"):
+ for f in os.listdir(args.vcfpath+"/output/DBS"):
+ os.remove(os.path.join(args.vcfpath+"/output/DBS", f))
+ os.rmdir(args.vcfpath+"/output/DBS")
+
+ try:
+ if matrices['ID'][args.project].sum() > 0:
+ num_tasks = num_tasks + 1
+ sig_list.append(('ID', '83'))
+ else:
+ if os.path.exists(args.vcfpath+"/output/ID"):
+ for f in os.listdir(args.vcfpath+"/output/ID"):
+ os.remove(os.path.join(args.vcfpath+"/output/ID", f))
+ os.rmdir(args.vcfpath+"/output/ID")
+ except:
+ if os.path.exists(args.vcfpath+"/output/ID"):
+ for f in os.listdir(args.vcfpath+"/output/ID"):
+ os.remove(os.path.join(args.vcfpath+"/output/ID", f))
+ os.rmdir(args.vcfpath+"/output/ID")
+
+ if num_tasks > 0:
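+        # divide the requested threads evenly across the signature classes extracted in parallel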
+ cpus_per_task = max(int(args.threads/num_tasks),1)
+ with ThreadPoolExecutor(max_workers=3) as e:
+ for sigClass, sigContext in sig_list:
+ e.submit(extractSignatures, args.output, args.vcfpath, args.genome, args.project, sigClass, sigContext, args.exome, cpus_per_task, args.extract_only)
+
+if __name__ == '__main__':
+ main()
diff --git a/required_scripts/summarize_Ig_4b93aee.R b/required_scripts/summarize_Ig_875a823.R
similarity index 82%
rename from required_scripts/summarize_Ig_4b93aee.R
rename to required_scripts/summarize_Ig_875a823.R
index e1b23334..9ed707a8 100644
--- a/required_scripts/summarize_Ig_4b93aee.R
+++ b/required_scripts/summarize_Ig_875a823.R
@@ -3,7 +3,10 @@ library(tidyverse)
suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("stats"))
-#Read in the individual flatFiles
+# Initialize the gene list, caller list and header used to build the summary matrix
+genes <- c("NSD2", "CCND3", "MYC", "MAFA", "CCND1", "CCND2", "MAF", "MAFB")
+callers <- c()
+header <- c("Specimen")
option_list <- list(
make_option(c("-v", "--verbose"), action="store_true", default=TRUE,
@@ -21,25 +24,60 @@ option_list <- list(
help="Flat file containing Ig Calls using Gammit"),
make_option(c("-c", "--count"), action="store",type="integer", default=2,
help="Minimum caller count [default %default]",
- metavar="number")
+ metavar="number"),
+ make_option(c("--genes"), action="store",
+ help="Flat file containing genes of interest (not required)")
)
opt <- parse_args(OptionParser(option_list=option_list))
write("Check input files...\n", stderr())
-if(!is.null(opt$pairoscope_file) && !file.exists(as.character(opt$pairoscope_file)))
+if(!is.null(opt$pairoscope_file))
{
- write("Pairoscope Ig Tx file not found...\n", stderr())
+ callers <- append(callers, "Pairoscope")
+ if(!file.exists(as.character(opt$pairoscope_file)))
+ {
+ write("Pairoscope Ig Tx file not found...\n", stderr())
+ }
}
-if (!is.null(opt$manta_file) && !file.exists(as.character(opt$manta_file)))
+
+if (!is.null(opt$manta_file))
{
- write("Manta Ig Tx file not found...\n", stderr())
+ callers <- append(callers, "Manta")
+ if (!file.exists(as.character(opt$manta_file)))
+ {
+ write("Manta Ig Tx file not found...\n", stderr())
+ }
}
-if (!is.null(opt$gammit_file) && !file.exists(as.character(opt$gammit_file)))
+
+if (!is.null(opt$gammit_file))
+{
+ callers <- append(callers, "Gammit")
+ if (!file.exists(as.character(opt$gammit_file)))
+ {
+ write("Gammit Ig Tx file not found...\n", stderr())
+ }
+}
+
+if (!is.null(opt$genes))
{
- write("Gammit Ig Tx file not found...\n", stderr())
+ genes = read.table(file=opt$genes)
}
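+# build the output header: Summary_CALL, IgSource and CALLER_COUNT columns for every gene,
+# plus one CALL column per caller that was provided on the command line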
+for (gene in genes) {
+ header <- append(header, paste(gene, "Summary_CALL", sep = "_"))
+ header <- append(header, paste(gene, "IgSource", sep = "_"))
+ header <- append(header, paste(gene, "CALLER_COUNT", sep = "_"))
+ for (caller in callers) {
+ header <- append(header, paste(gene, "CALL", caller, sep = "_"))
+ }
+}
+
+init_matrix <- matrix(0, ncol = length(header), nrow = 1)
+colnames(init_matrix) <- header
+init_table <- as_tibble(init_matrix)
+init_table <- init_table %>% mutate_at(1, as.character)
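+# this all-zero row guarantees that every expected column exists; it is joined with the
+# observed calls below and any remaining NAs are replaced with 0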
+
write("Processing Data...\n", stderr())
specimen = tibble(Specimen=opt$specimen)
combined_calls<-NULL
@@ -153,9 +191,15 @@ combined_calls$MAF_IgSource = ifelse(length(unique(MAF_IgSource))==1, unique(MAF
combined_calls$MAFA_IgSource = ifelse(length(unique(MAFA_IgSource))==1, unique(MAFA_IgSource),0)
combined_calls$MAFB_IgSource = ifelse(length(unique(MAFB_IgSource))==1, unique(MAFB_IgSource),0)
-combined_calls=combined_calls[,order(colnames(combined_calls), decreasing = TRUE)]
+#combined_calls=combined_calls[,order(colnames(combined_calls), decreasing = TRUE)]
+
+summary_table <- init_table %>%
+ right_join(combined_calls) %>%
+ replace(is.na(.), 0)
+
+summary_table=summary_table[,order(colnames(summary_table), decreasing = TRUE)]
# combined_calls=combined_calls %>% relocate(Specimen)
write("Save results...\n", stderr())
-write_tsv(combined_calls, opt$outfile, append = FALSE, na="NA")
+write_tsv(summary_table, opt$outfile, append = FALSE, na="NA")
write("Done.\n", stderr())
diff --git a/utilities/copy_fastq.jst b/utilities/copy_fastq.jst
index 9adcfedb..879860c7 100644
--- a/utilities/copy_fastq.jst
+++ b/utilities/copy_fastq.jst
@@ -12,6 +12,10 @@
mkdir -p temp/fastqs/
+ {% if fastq.fileType == "fasterq" %}
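+  {# fasterq inputs are expected at the fastq path with the .fastq.gz extension swapped for .fasterq #}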
+ rsync "{{ fastq.fastqPath|replace(".fastq.gz",".fasterq") }}" "temp/fastqs/"
+ {% else %}
rsync "{{ fastq.fastqPath }}" "temp/fastqs/"
+ {% endif %}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/utilities/finalize.jst b/utilities/finalize.jst
index 9dcb445e..e4bbcdc1 100644
--- a/utilities/finalize.jst
+++ b/utilities/finalize.jst
@@ -41,6 +41,13 @@
ln --symbolic --force ${line}
done
{% endif %}
+
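+  {# also link any GATK constitutional copy-number segmentation (.re_centered.cr.igv.seg) results #}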
+ {% if tasks[taskPrefix+"_constitutional_cna_caller_gatk"]|default(true) %}
+ for line in `find ../{{ general_library_type }}/constitutional_copy_number/gatk -name "*.re_centered.cr.igv.seg"`
+ do
+ ln --symbolic --force ${line}
+ done
+ {% endif %}
{% elif general_library_type in 'genome' %}
{% set taskPrefix = 'Genome' %}
@@ -73,6 +80,13 @@
ln --symbolic --force ${line}
done
{% endif %}
+
+ {% if tasks[taskPrefix+"_constitutional_cna_caller_gatk"]|default(true) %}
+ for line in `find ../{{ general_library_type }}/constitutional_copy_number/gatk -name "*.re_centered.cr.igv.seg"`
+ do
+ ln --symbolic --force ${line}
+ done
+ {% endif %}
{% elif general_library_type in 'rna' %}
{% set taskPrefix = 'RNA' %}
@@ -121,4 +135,4 @@
{% endfor %}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/utilities/md5sum_bam_cram.jst b/utilities/md5sum_bam_cram.jst
index ca98c140..bcc7ece6 100644
--- a/utilities/md5sum_bam_cram.jst
+++ b/utilities/md5sum_bam_cram.jst
@@ -1,24 +1,25 @@
{% macro md5sum_bam_cram(sample, aligner='bwa') %}
{% if cram|default(true) %}
-{% set bam_path %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.bam{% endset %}
-{% set cram_path %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}/{{ sample.name }}.{{ aligner }}.cram{% endset %}
+{% set file_path %}{{ sample.gltype }}/alignment/{{ aligner }}/{{ sample.name }}{% endset %}
- name: md5sum_bam_cram_{{ sample.name }}_{{ aligner }}
tags: [{{ sample.gltype }}, {{ sample.name }}, md5sum ]
input:
{% if cram|default(true) %}
- - {{ cram_path }}
+ - {{ file_path }}/{{ sample.name }}.{{ aligner }}.cram
+ - {{ file_path }}/{{ sample.name }}.{{ aligner }}.cram.crai
{% else %}
- - {{ bam_path }}
+ - {{ file_path }}/{{ sample.name }}.{{ aligner }}.bam
+ - {{ file_path }}/{{ sample.name }}.{{ aligner }}.bam.bai
{% endif %}
output:
{% if cram|default(true) %}
- - {{ cram_path }}.md5
- - {{ cram_path }}.crai.md5
+ - {{ sample.name }}.{{ aligner }}.cram.md5
+ - {{ sample.name }}.{{ aligner }}.cram.crai.md5
{% else %}
- - {{ bam_path }}.md5
- - {{ bam_path }}.bai.md5
+ - {{ sample.name }}.{{ aligner }}.bam.md5
+ - {{ sample.name }}.{{ aligner }}.bam.bai.md5
{% endif %}
walltime: "1:00:00"
cpus: 1
@@ -27,14 +28,17 @@
set -eu
set -o pipefail
+ {# cd to the directory of the files for proper md5sum generation #}
+ cd {{ file_path }}
+
{% if cram|default(true) %}
- md5sum {{ cram_path }} > {{ cram_path }}.md5
- md5sum {{ cram_path }}.crai > {{ cram_path }}.crai.md5
+ md5sum {{ sample.name }}.{{ aligner }}.cram > {{ sample.name }}.{{ aligner }}.cram.md5
+ md5sum {{ sample.name }}.{{ aligner }}.cram.crai > {{ sample.name }}.{{ aligner }}.cram.crai.md5
{% else %}
- md5sum {{ bam_path }} > {{ bam_path }}.md5
- md5sum {{ bam_path }}.bai > {{ bam_path }}.bai.md5
+ md5sum {{ sample.name }}.{{ aligner }}.bam > {{ sample.name }}.{{ aligner }}.bam.md5
+ md5sum {{ sample.name }}.{{ aligner }}.bam.bai > {{ sample.name }}.{{ aligner }}.bam.bai.md5
{% endif %}
{% endif %}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
diff --git a/utilities/remove_files.jst b/utilities/remove_files.jst
index b74600c5..dda6969a 100644
--- a/utilities/remove_files.jst
+++ b/utilities/remove_files.jst
@@ -1,9 +1,13 @@
-{% macro remove_files(directory, before, after) %}
+{% macro remove_files(directory, before, after, name='null') %}
-{% if after is string %}
- {% set task_name %}{{ after }}{% endset %}
+{% if name != 'null' %}
+ {% set task_name %}{{ name }}{% endset %}
{% else %}
+ {% if after is string %}
+ {% set task_name %}{{ after }}{% endset %}
+ {% else %}
{% set task_name %}{{ after[0] }}{% endset %}
+ {% endif %}
{% endif %}
- name: removing_files_{{ task_name }}
diff --git a/utilities/variant_filtering.jst b/utilities/variant_filtering.jst
index 784ac299..fa0edc8f 100644
--- a/utilities/variant_filtering.jst
+++ b/utilities/variant_filtering.jst
@@ -69,4 +69,56 @@
bcftools index --threads 4 --tbi --force {{ output_vcf }}
bcftools index --threads 4 --force {{ output_vcf }}
-{% endmacro %}
\ No newline at end of file
+{% endmacro %}
+
+{% macro filter_variants(pair, input_vcf, output_dir, output_vcf, task) %}
+
+{# Checking if CC_filter is already defined, in preparation for allowing user defined filters #}
+{% if CC_filter is not defined %}
+ {% if pair.callers is defined and pair.callers|length > 1 %}
+ {% if pair.callers | length > 3 %}
+ {% set CC_filter %}INFO/CC>={{ pair.callers | length - 2 }}{% endset %}
+ {% else %}
+ {% set CC_filter %}INFO/CC>={{ pair.callers | length - 1 }}{% endset %}
+ {% endif %}
+ {% endif %}
+{% endif %}
+
+{# Checking for output_vcf type, e.g. vcf.gz, vcf, bcf #}
+{% if output_vcf.endswith('vcf') %}
+ {% set output_type = 'v' %}
+{% elif output_vcf.endswith('bcf') %}
+ {% set output_type = 'b' %}
+{% else %}
+ {% set output_type = 'z' %}
+{% endif %}
+
+- name: variant_filtering_{{ pair.name }}_{{ task }}
+ tags: [variant_filter]
+ input: {{ input_vcf }}
+ output: {{ output_vcf }}
+ cpus: 1
+ mem: 4G
+ walltime: "4:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.bcftools.module }}
+
+ mkdir -p {{ output_dir }}
+
+ bcftools view \
+ {% if CC_filter is defined %}
+ --include "{{ CC_filter }}" \
+ {% endif %}
+ --output-file {{ output_vcf }} \
+ --output-type {{ output_type }} \
+ {{ input_vcf }}
+
+ {# No need to index vcf if it isn't compressed #}
+ {%- if output_type != 'v' %}
+ bcftools index --force {{ output_vcf }}
+ {% endif %}
+
+{% endmacro %}
diff --git a/utilities/vcf_stats.jst b/utilities/vcf_stats.jst
new file mode 100644
index 00000000..0ec7d2c1
--- /dev/null
+++ b/utilities/vcf_stats.jst
@@ -0,0 +1,21 @@
+{% macro vcf_stats(input_vcf, results_dir) %}
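+{# runs bcftools stats on the input VCF and writes the report to <results_dir>/stats/<vcf basename>.stats.txt #}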
+
+- name: bcftools_stats_{{ input_vcf | basename | replace(".", "_") }}
+ tags: [bcftools, stats, vcf]
+ input: {{ input_vcf }}
+ output: {{ results_dir }}/stats/{{ input_vcf | basename }}.stats.txt
+ cpus: 1
+ walltime: "4:00:00"
+ cmd: |
+ set -eu
+ set -o pipefail
+
+ module load {{ constants.tools.bcftools.module }}
+
+ mkdir -p {{ results_dir }}/stats
+
+ bcftools stats \
+ {{ input_vcf }} \
+ > {{ results_dir }}/stats/{{ input_vcf | basename }}.stats.txt
+
+{% endmacro %}