From 733d7e5f65c5b459bb3a1bcc8e5e70931088e83b Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Fri, 22 Feb 2019 21:25:06 +0000
Subject: [PATCH 01/24] Updated to gatk4.1, Update VariantRecal syntax

---
 README.md                                     |  22 ++-
 haplotypecaller-gvcf-gatk4-nio.wdl            |   8 +-
 haplotypecaller-gvcf-gatk4.wdl                |   8 +-
 ...discovery-gatk4-local.hg38.wgs.inputs.json |  14 +-
 joint-discovery-gatk4-local.wdl               |  77 ++++-----
 joint-discovery-gatk4.hg38.wgs.inputs.json    |  29 +---
 joint-discovery-gatk4.wdl                     | 153 +++++++++++-------
 7 files changed, 175 insertions(+), 136 deletions(-)

diff --git a/README.md b/README.md
index 0e72c07..2daca5a 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ Workflows for germline short variant discovery with GATK4.
 
 ### haplotypecaller-gvcf-gatk :
 The haplotypecaller-gvcf-gatk4 workflow runs HaplotypeCaller
-from GATK4 in GVCF mode on a single sample according to the GATK Best Practices (June 2016),
+from GATK4 in GVCF mode on a single sample according to the GATK Best Practices,
 scattered across intervals.
 
 #### Requirements/expectations
@@ -39,13 +39,11 @@ discovery in human whole-genome sequencing (WGS) and exome sequencing data.
 - GATK 4 or later
 - Samtools (see gotc docker)
 - Python 2.7
-
-Cromwell version support
-- Successfully tested on v31
-- Does not work on versions < v23 due to output syntax
+- Cromwell version support
+  - Successfully tested on v31
+  - Does not work on versions < v23 due to output syntax
 
 ### IMPORTANT NOTE :
-- Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
 - VQSR wiring. The SNP and INDEL models are built in parallel, but then the corresponding
   recalibrations are applied in series. Because the INDEL model is generally ready
   first (because there are fewer indels than SNPs) we set INDEL recalibration to
@@ -69,3 +67,15 @@ Cromwell version support
 The dynamic scatter interval creation was optimized for genomes. The scattered SNP
 VariantRecalibration may fail because of too few "bad" variants to build the negative model.
 Also, apologies that the logging for SNP recalibration is overly verbose.
+- The provided JSON is meant to be a ready-to-use example JSON template of the workflow. It is the user’s responsibility to correctly set the reference and resource input variables using the [GATK Tool and Tutorial Documentation](https://software.broadinstitute.org/gatk/documentation/).
+- Relevant reference and resource bundles can be accessed in [Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle).
+- Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
+- For help running workflows on the Google Cloud Platform or locally please
+view the following tutorial [(How to) Execute Workflows from the gatk-workflows Git Organization](https://software.broadinstitute.org/gatk/documentation/article?id=12521).
+- The following material is provided by the GATK Team. Please post any questions or concerns to one of our forum sites: [GATK](https://gatkforums.broadinstitute.org/gatk/categories/ask-the-team/), [FireCloud](https://gatkforums.broadinstitute.org/firecloud/categories/ask-the-firecloud-team), [Terra](https://broadinstitute.zendesk.com/hc/en-us/community/topics/360000500432-General-Discussion), or [WDL/Cromwell](https://gatkforums.broadinstitute.org/wdl/categories/ask-the-wdl-team).
+- Please visit the [User Guide](https://software.broadinstitute.org/gatk/documentation/) site for further documentation on our workflows and tools.
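+- As a minimal illustration of the template's key format (the values below are placeholders, not recommendations), inputs and optional overrides are set in the JSON with `WorkflowName.input_name` keys:
+
+```json
+{
+  "JointGenotyping.callset_name": "my_callset",
+  "JointGenotyping.gatk_docker_override": "broadinstitute/gatk:4.1.0.0"
+}
+```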
+ +### LICENSING : +Copyright Broad Institute, 2019 | BSD-3 +This script is released under the WDL open source code license (BSD-3) (full license text at https://github.com/openwdl/wdl/blob/master/LICENSE). Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script. + diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl index 0d6faef..2edc471 100644 --- a/haplotypecaller-gvcf-gatk4-nio.wdl +++ b/haplotypecaller-gvcf-gatk4-nio.wdl @@ -1,4 +1,4 @@ -## Copyright Broad Institute, 2017 +## Copyright Broad Institute, 2019 ## ## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample ## according to the GATK Best Practices (June 2016), scattered across intervals. @@ -11,7 +11,7 @@ ## - One GVCF file and its index ## ## Cromwell version support -## - Successfully tested on v31 +## - Successfully tested on v37 ## - Does not work on versions < v23 due to output syntax ## ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. @@ -36,7 +36,7 @@ workflow HaplotypeCallerGvcf_GATK4 { Boolean making_gvcf = select_first([make_gvcf,true]) String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"]) + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) String? gatk_path_override String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) String? gitc_docker_override @@ -142,7 +142,7 @@ task CramToBamTask { docker: docker memory: select_first([machine_mem_gb, 15]) + " GB" disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" - preemptible: preemptible_attempts + preemptible: select_first([preemptible_attempts, 3]) } output { File output_bam = "${sample_name}.bam" diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index f5eb32c..d53cfac 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -1,4 +1,4 @@ -## Copyright Broad Institute, 2017 +## Copyright Broad Institute, 2019 ## ## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample ## according to the GATK Best Practices (June 2016), scattered across intervals. @@ -11,7 +11,7 @@ ## - One GVCF file and its index ## ## Cromwell version support -## - Successfully tested on v31 +## - Successfully tested on v37 ## - Does not work on versions < v23 due to output syntax ## ## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. @@ -36,7 +36,7 @@ workflow HaplotypeCallerGvcf_GATK4 { Boolean making_gvcf = select_first([make_gvcf,true]) String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.0.6.0"]) + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) String? gatk_path_override String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) String? 
gitc_docker_override
@@ -134,7 +134,7 @@ task CramToBamTask {
     docker: docker
     memory: select_first([machine_mem_gb, 15]) + " GB"
     disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD"
-    preemptibe: preemptible_attempts
+    preemptible: select_first([preemptible_attempts, 3])
   }
   output {
     File output_bam = "${sample_name}.bam"
diff --git a/joint-discovery-gatk4-local.hg38.wgs.inputs.json b/joint-discovery-gatk4-local.hg38.wgs.inputs.json
index cf620c8..bd0c045 100644
--- a/joint-discovery-gatk4-local.hg38.wgs.inputs.json
+++ b/joint-discovery-gatk4-local.hg38.wgs.inputs.json
@@ -38,15 +38,15 @@
   "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"],
 
   "##_COMMENT4": "DOCKERS",
-  "JointGenotyping.python_docker": "python:2.7",
-  "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.6.0",
+  "#JointGenotyping.gatk_docker_override": "String? (optional)",
 
   "##_COMMENT5": "PATHS",
-  "JointGenotyping.gatk_path": "/gatk/gatk",
+  "#JointGenotyping.gatk_path_override": "String? (optional)",
 
   "##_COMMENT8": "DISK SIZE ALLOCATION",
-  "JointGenotyping.small_disk": 100,
-  "JointGenotyping.medium_disk": 200,
-  "JointGenotyping.large_disk": 300,
-  "JointGenotyping.huge_disk": 400
+  "#JointGenotyping.small_disk_override": "Int? (optional)",
+  "#JointGenotyping.medium_disk_override": "Int? (optional)",
+  "#JointGenotyping.large_disk_override": "Int? (optional)",
+  "#JointGenotyping.huge_disk_override": "Int? (optional)"
+
 }
diff --git a/joint-discovery-gatk4-local.wdl b/joint-discovery-gatk4-local.wdl
index 06028fa..c41d7a6 100644
--- a/joint-discovery-gatk4-local.wdl
+++ b/joint-discovery-gatk4-local.wdl
@@ -42,29 +42,20 @@
 ## licensing information pertaining to the included programs.
 
 workflow JointGenotyping {
-  File unpadded_intervals_file
-
+  # Input Sample
   String callset_name
-
+  Array[String] sample_names
+  Array[File] input_gvcfs
+  Array[File] input_gvcfs_indices
+
+  # Reference and Resources
   File ref_fasta
   File ref_fasta_index
   File ref_dict
 
   File dbsnp_vcf
   File dbsnp_vcf_index
-
-  Array[String] sample_names
-  Array[File] input_gvcfs
-  Array[File] input_gvcfs_indices
-
-  String gatk_docker
-  String gatk_path
-
-  Int small_disk
-  Int medium_disk
-  Int large_disk
-  Int huge_disk
-
+
   Array[String] snp_recalibration_tranche_values
   Array[String] snp_recalibration_annotation_values
   Array[String] indel_recalibration_tranche_values
@@ -83,6 +74,23 @@ workflow JointGenotyping {
   File axiomPoly_resource_vcf_index
   File dbsnp_resource_vcf = dbsnp_vcf
   File dbsnp_resource_vcf_index = dbsnp_vcf_index
+
+  File unpadded_intervals_file
+
+  # Runtime attributes
+  String? gatk_docker_override
+  String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"])
+  String? gatk_path_override
+  String gatk_path = select_first([gatk_path_override, "/gatk/gatk"])
+
+  Int? small_disk_override
+  Int small_disk = select_first([small_disk_override, "100"])
+  Int? medium_disk_override
+  Int medium_disk = select_first([medium_disk_override, "200"])
+  Int? large_disk_override
+  Int large_disk = select_first([large_disk_override, "300"])
+  Int? huge_disk_override
+  Int huge_disk = select_first([huge_disk_override, "400"])
 
   # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme
   # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69
@@ -344,18 +352,15 @@ workflow JointGenotyping {
 
   output {
     # outputs from the small callset path through the wdl
-    FinalGatherVcf.output_vcf
-    FinalGatherVcf.output_vcf_index
-    CollectMetricsOnFullVcf.detail_metrics_file
-    CollectMetricsOnFullVcf.summary_metrics_file
+    File? output_vcf = FinalGatherVcf.output_vcf
+    File? output_vcf_index = FinalGatherVcf.output_vcf_index
 
-    # outputs from the large callset path through the wdl
-    # (note that we do not list ApplyRecalibration here because it is run in both paths)
-    GatherMetrics.detail_metrics_file
-    GatherMetrics.summary_metrics_file
+    # select metrics from the small callset path and the large callset path
+    File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file])
+    File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file])
 
     # output the interval list generated/used by this run workflow
-    DynamicallyCombineIntervals.output_intervals
+    File output_intervals = DynamicallyCombineIntervals.output_intervals
   }
 }
 
@@ -556,9 +561,9 @@ task IndelsVariantRecalibrator {
       -an ${sep=' -an ' recalibration_annotation_values} \
       -mode INDEL \
       --max-gaussians 4 \
-      -resource mills,known=false,training=true,truth=true,prior=12:${mills_resource_vcf} \
-      -resource axiomPoly,known=false,training=true,truth=false,prior=10:${axiomPoly_resource_vcf} \
-      -resource dbsnp,known=true,training=false,truth=false,prior=2:${dbsnp_resource_vcf}
+      --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \
+      --resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \
+      --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf}
   }
   runtime {
     docker: docker
@@ -612,10 +617,10 @@ task SNPsVariantRecalibratorCreateModel {
      --sample-every-Nth-variant ${downsampleFactor} \
      --output-model ${model_report_filename} \
      --max-gaussians 6 \
-      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
-      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
-      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
-      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}
+      --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \
+      --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \
+      --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \
+      --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf}
   }
   runtime {
     docker: docker
@@ -665,10 +670,10 @@ task SNPsVariantRecalibrator {
      -mode SNP \
       ${"--input-model " + model_report + " --output-tranches-for-scatter "} \
       --max-gaussians 6 \
-      -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \
-      -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \
-      -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \
-      -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf}
+      --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \
+      --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \
+      --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \
+      --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf}
  }
  runtime {
    docker: docker
diff --git a/joint-discovery-gatk4.hg38.wgs.inputs.json b/joint-discovery-gatk4.hg38.wgs.inputs.json
index 8610b56..cc45b76 100644
--- a/joint-discovery-gatk4.hg38.wgs.inputs.json
+++ b/joint-discovery-gatk4.hg38.wgs.inputs.json
@@ -36,11 +36,10 @@
   "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"],
 
   "##_COMMENT4": "DOCKERS",
-  "JointGenotyping.python_docker": "python:2.7",
-  "JointGenotyping.gatk_docker": "broadinstitute/gatk:4.0.6.0",
+  "#JointGenotyping.gatk_docker_override": "String? (optional)",
 
   "##_COMMENT5": "PATHS",
-  "JointGenotyping.gatk_path": "/gatk/gatk",
+  "#JointGenotyping.gatk_path_override": "String? (optional)",
 
   "##_COMMENT6": "JAVA OPTIONS",
   "JointGenotyping.SNPsVariantRecalibratorScattered.java_opt": "-Xmx3g -Xms3g",
@@ -74,24 +73,12 @@
   "JointGenotyping.SNPGatherTranches.mem_size": "7 GB",
 
   "##_COMMENT8": "DISK SIZE ALLOCATION",
-  "JointGenotyping.small_disk": 100,
-  "JointGenotyping.medium_disk": 200,
-  "JointGenotyping.large_disk": 300,
-  "JointGenotyping.huge_disk": 400,
+  "#JointGenotyping.small_disk_override": "Int? (optional)",
+  "#JointGenotyping.medium_disk_override": "Int? (optional)",
+  "#JointGenotyping.large_disk_override": "Int? (optional)",
+  "#JointGenotyping.huge_disk_override": "Int? (optional)",
 
   "##_COMMENT9": "PREEMPTIBLES",
-  "JointGenotyping.FinalGatherVcf.preemptibles": 5,
-  "JointGenotyping.IndelsVariantRecalibrator.preemptibles": 5,
-  "JointGenotyping.SNPsVariantRecalibratorScattered.preemptibles": 5,
-  "JointGenotyping.HardFilterAndMakeSitesOnlyVcf.preemptibles": 5,
-  "JointGenotyping.CollectMetricsOnFullVcf.preemptibles": 5,
-  "JointGenotyping.GatherMetrics.preemptibles": 5,
-  "JointGenotyping.SNPGatherTranches.preemptibles": 5,
-  "JointGenotyping.SitesOnlyGatherVcf.preemptibles": 5,
-  "JointGenotyping.CollectMetricsSharded.preemptibles": 5,
-  "JointGenotyping.ApplyRecalibration.preemptibles": 5,
-  "JointGenotyping.ImportGVCFs.preemptibles": 5,
-  "JointGenotyping.SNPsVariantRecalibratorCreateModel.preemptibles": 5,
-  "JointGenotyping.GenotypeGVCFs.preemptibles": 5,
-  "JointGenotyping.DynamicallyCombineIntervals.preemptibles": 5
+  "#JointGenotyping.preemptible_tries_override": "Int? (optional)"
 }
+
diff --git a/joint-discovery-gatk4.wdl b/joint-discovery-gatk4.wdl
index 12ac9fe..6e81df3 100644
--- a/joint-discovery-gatk4.wdl
+++ b/joint-discovery-gatk4.wdl
@@ -42,11 +42,11 @@
 ## licensing information pertaining to the included programs.
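+##
+## As a worked note on the ExcessHet cutoff used below: phred(p) = -10 * log10(p),
+## and -10 * log10(3.4e-06) = 54.69, which is why excess_het_threshold is set to
+## 54.69 for the z-score -4.5 cutoff described in the comment above the constant.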
workflow JointGenotyping { - File unpadded_intervals_file - + # Input Sample String callset_name File sample_name_map + # Reference and Resources File ref_fasta File ref_fasta_index File ref_dict @@ -54,14 +54,6 @@ workflow JointGenotyping { File dbsnp_vcf File dbsnp_vcf_index - String gatk_docker - String gatk_path - - Int small_disk - Int medium_disk - Int large_disk - Int huge_disk - Array[String] snp_recalibration_tranche_values Array[String] snp_recalibration_annotation_values Array[String] indel_recalibration_tranche_values @@ -80,6 +72,26 @@ workflow JointGenotyping { File axiomPoly_resource_vcf_index File dbsnp_resource_vcf = dbsnp_vcf File dbsnp_resource_vcf_index = dbsnp_vcf_index + + File unpadded_intervals_file + + # Runtime attributes + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + + Int? small_disk_override + Int small_disk = select_first([small_disk_override, "100"]) + Int? medium_disk_override + Int medium_disk = select_first([medium_disk_override, "200"]) + Int? large_disk_override + Int large_disk = select_first([large_disk_override, "300"]) + Int? huge_disk_override + Int huge_disk = select_first([huge_disk_override, "400"]) + + String? preemptible_tries_override + Int preemptible_tries = select_first([preemptible_tries_override, "3"]) # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 @@ -98,7 +110,8 @@ workflow JointGenotyping { call DynamicallyCombineIntervals { input: intervals = unpadded_intervals_file, - merge_count = merge_count + merge_count = merge_count, + preemptible_tries = preemptible_tries } Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) @@ -116,7 +129,8 @@ workflow JointGenotyping { disk_size = medium_disk, batch_size = 50, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } call GenotypeGVCFs { @@ -130,7 +144,8 @@ workflow JointGenotyping { dbsnp_vcf = dbsnp_vcf, disk_size = medium_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } call HardFilterAndMakeSitesOnlyVcf { @@ -142,7 +157,8 @@ workflow JointGenotyping { sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", disk_size = medium_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } @@ -152,7 +168,8 @@ workflow JointGenotyping { output_vcf_name = callset_name + ".sites_only.vcf.gz", disk_size = medium_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } call IndelsVariantRecalibrator { @@ -171,7 +188,8 @@ workflow JointGenotyping { dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, disk_size = small_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } if (num_gvcfs > 10000) { @@ -195,7 +213,8 @@ workflow JointGenotyping { dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, disk_size = small_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { @@ -218,7 +237,8 @@ workflow JointGenotyping { dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, disk_size = small_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } call GatherTranches as SNPGatherTranches { @@ -227,7 +247,8 @@ workflow JointGenotyping { output_filename = callset_name + ".snps.gathered.tranches", disk_size = small_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } @@ -251,7 +272,8 @@ workflow JointGenotyping { dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, disk_size = small_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } @@ -275,7 +297,8 @@ workflow JointGenotyping { snp_filter_level = snp_filter_level, disk_size = medium_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } # for large callsets we need to collect metrics from the shards and gather them later @@ -291,7 +314,8 @@ workflow JointGenotyping { ref_dict = ref_dict, disk_size = medium_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } } @@ -304,7 +328,8 @@ workflow JointGenotyping { output_vcf_name = callset_name + ".vcf.gz", disk_size = huge_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { @@ -318,7 +343,8 @@ workflow JointGenotyping { ref_dict = ref_dict, disk_size = large_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } @@ -331,37 +357,36 @@ workflow JointGenotyping { output_prefix = callset_name, disk_size = medium_disk, docker = gatk_docker, - gatk_path = gatk_path + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } output { # outputs from the small callset path through the wdl - FinalGatherVcf.output_vcf - FinalGatherVcf.output_vcf_index - CollectMetricsOnFullVcf.detail_metrics_file - CollectMetricsOnFullVcf.summary_metrics_file + File? output_vcf = FinalGatherVcf.output_vcf + File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - # outputs from the large callset path through the wdl - # (note that we do not list ApplyRecalibration here because it is run in both paths) - GatherMetrics.detail_metrics_file - GatherMetrics.summary_metrics_file + # select metrics from the small callset path and the large callset path + File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) + File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) # output the interval list generated/used by this run workflow - DynamicallyCombineIntervals.output_intervals + File output_intervals = DynamicallyCombineIntervals.output_intervals } } task GetNumberOfSamples { File sample_name_map String docker + Int preemptible_tries command <<< wc -l ${sample_name_map} | awk '{print $1}' >>> runtime { docker: docker memory: "1 GB" - preemptible: 5 + preemptible: preemptible_tries } output { Int sample_count = read_int(stdout()) @@ -377,6 +402,7 @@ task ImportGVCFs { String gatk_path String docker Int disk_size + Int preemptible_tries Int batch_size command <<< @@ -406,7 +432,7 @@ task ImportGVCFs { memory: "7 GB" cpu: "2" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File output_genomicsdb = "${workspace_dir_name}.tar" @@ -428,6 +454,7 @@ task GenotypeGVCFs { String dbsnp_vcf String docker Int disk_size + Int preemptible_tries command <<< set -e @@ -451,7 +478,7 @@ task GenotypeGVCFs { memory: "7 GB" cpu: "2" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File output_vcf = "${output_vcf_filename}" @@ -470,6 +497,7 @@ task HardFilterAndMakeSitesOnlyVcf { String docker Int disk_size + Int preemptible_tries command { set -e @@ -492,7 +520,7 @@ task HardFilterAndMakeSitesOnlyVcf { memory: "3.5 GB" cpu: "1" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File variant_filtered_vcf = "${variant_filtered_vcf_filename}" @@ -522,6 +550,7 @@ task IndelsVariantRecalibrator { String gatk_path String docker Int disk_size + Int preemptible_tries command { ${gatk_path} --java-options "-Xmx24g -Xms24g" \ @@ -534,16 +563,16 @@ task IndelsVariantRecalibrator { -an ${sep=' -an ' recalibration_annotation_values} \ -mode INDEL \ --max-gaussians 4 \ - -resource mills,known=false,training=true,truth=true,prior=12:${mills_resource_vcf} \ - -resource axiomPoly,known=false,training=true,truth=false,prior=10:${axiomPoly_resource_vcf} \ - -resource dbsnp,known=true,training=false,truth=false,prior=2:${dbsnp_resource_vcf} + --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ + --resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} } runtime { docker: docker memory: "26 GB" cpu: "2" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File recalibration = "${recalibration_filename}" @@ -576,6 +605,7 @@ task SNPsVariantRecalibratorCreateModel { String gatk_path String docker Int disk_size + Int preemptible_tries command { ${gatk_path} --java-options "-Xmx100g -Xms100g" \ @@ -590,17 +620,17 @@ task SNPsVariantRecalibratorCreateModel { --sample-every-Nth-variant ${downsampleFactor} \ --output-model ${model_report_filename} \ 
--max-gaussians 6 \ - -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \ - -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \ - -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \ - -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf} + --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} } runtime { docker: docker memory: "104 GB" cpu: "2" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File model_report = "${model_report_filename}" @@ -630,6 +660,7 @@ task SNPsVariantRecalibrator { String gatk_path String docker Int disk_size + Int preemptible_tries command { ${gatk_path} --java-options "-Xmx3g -Xms3g" \ @@ -643,17 +674,17 @@ task SNPsVariantRecalibrator { -mode SNP \ ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ --max-gaussians 6 \ - -resource hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \ - -resource omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \ - -resource 1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \ - -resource dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf} + --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} } runtime { docker: docker memory: "3.5 GB" cpu: "2" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File recalibration = "${recalibration_filename}" @@ -670,6 +701,7 @@ task GatherTranches { String docker Int disk_size + Int preemptible_tries command <<< set -e @@ -702,7 +734,7 @@ task GatherTranches { memory: "7 GB" cpu: "2" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File tranches = "${output_filename}" @@ -726,6 +758,7 @@ task ApplyRecalibration { String gatk_path String docker Int disk_size + Int preemptible_tries command { set -e @@ -755,7 +788,7 @@ task ApplyRecalibration { memory: "7 GB" cpu: "1" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File recalibrated_vcf = "${recalibrated_vcf_filename}" @@ -770,6 +803,7 @@ task GatherVcfs { String docker Int disk_size + Int preemptible_tries command <<< set -e @@ -796,7 +830,7 @@ task GatherVcfs { memory: "7 GB" cpu: "1" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File output_vcf = "${output_vcf_name}" @@ -817,6 +851,7 @@ task CollectVariantCallingMetrics { String gatk_path String docker Int disk_size + Int preemptible_tries command { ${gatk_path} --java-options "-Xmx6g -Xms6g" \ @@ -837,7 +872,7 @@ task CollectVariantCallingMetrics { memory: "7 GB" cpu: 2 disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + 
preemptible: preemptible_tries } } @@ -850,6 +885,7 @@ task GatherMetrics { String gatk_path String docker Int disk_size + Int preemptible_tries command <<< set -e @@ -891,7 +927,7 @@ task GatherMetrics { memory: "3 GB" cpu: "1" disks: "local-disk " + disk_size + " HDD" - preemptible: 5 + preemptible: preemptible_tries } output { File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" @@ -902,6 +938,7 @@ task GatherMetrics { task DynamicallyCombineIntervals { File intervals Int merge_count + Int preemptible_tries command { python << CODE @@ -955,7 +992,7 @@ task DynamicallyCombineIntervals { runtime { memory: "3 GB" - preemptible: 5 + preemptible: preemptible_tries docker: "python:2.7" } From fe77d80edd397681950b8da5021e89b990e448e8 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 22 Feb 2019 21:43:18 +0000 Subject: [PATCH 02/24] added samtools path variable --- haplotypecaller-gvcf-gatk4-nio.wdl | 14 +++++++++----- haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json | 1 + haplotypecaller-gvcf-gatk4.wdl | 14 +++++++++----- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl index 2edc471..64cc986 100644 --- a/haplotypecaller-gvcf-gatk4-nio.wdl +++ b/haplotypecaller-gvcf-gatk4-nio.wdl @@ -41,7 +41,9 @@ workflow HaplotypeCallerGvcf_GATK4 { String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) String? gitc_docker_override String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - + String? samtools_path_override + String samtools_path = select_first([samtools_path_override, "samtools"]) + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) #is the input a cram file? @@ -67,7 +69,8 @@ workflow HaplotypeCallerGvcf_GATK4 { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - docker = gitc_docker + docker = gitc_docker, + samtools_path = samtools_path } } @@ -124,6 +127,7 @@ task CramToBamTask { Int? disk_space_gb Boolean use_ssd = false Int? preemptible_attempts + String samtools_path Float output_bam_size = size(input_cram, "GB") / 0.60 Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") @@ -133,9 +137,9 @@ task CramToBamTask { set -e set -o pipefail - samtools view -h -T ${ref_fasta} ${input_cram} | - samtools view -b -o ${sample_name}.bam - - samtools index -b ${sample_name}.bam + ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | + ${samtools_path} view -b -o ${sample_name}.bam - + ${samtools_path} index -b ${sample_name}.bam mv ${sample_name}.bam.bai ${sample_name}.bai } runtime { diff --git a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json index 0709c4e..0b5df9a 100644 --- a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json +++ b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json @@ -23,6 +23,7 @@ "##_COMMENT6": "PATHS", "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", + "#HaplotypeCallerGvcf_GATK4.samtools_path_override": "String? (optional)", "##_COMMENT7": "JAVA OPTIONS", "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? 
(optional)", diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index d53cfac..a8a292f 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -41,7 +41,9 @@ workflow HaplotypeCallerGvcf_GATK4 { String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) String? gitc_docker_override String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - + String? samtools_path_override + String samtools_path = select_first([samtools_path_override, "samtools"]) + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) #is the input a cram file? @@ -60,7 +62,8 @@ workflow HaplotypeCallerGvcf_GATK4 { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - docker = gitc_docker + docker = gitc_docker, + samtools_path = samtools_path } } @@ -116,6 +119,7 @@ task CramToBamTask { Int? disk_space_gb Boolean use_ssd = false Int? preemptible_attempts + String samtools_path Float output_bam_size = size(input_cram, "GB") / 0.60 Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") @@ -125,9 +129,9 @@ task CramToBamTask { set -e set -o pipefail - samtools view -h -T ${ref_fasta} ${input_cram} | - samtools view -b -o ${sample_name}.bam - - samtools index -b ${sample_name}.bam + ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | + ${samtools_path} view -b -o ${sample_name}.bam - + ${samtools_path} index -b ${sample_name}.bam mv ${sample_name}.bam.bai ${sample_name}.bai } runtime { From dd632f02ec03d941b89b53a37de44c3235d78d94 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Sat, 23 Feb 2019 22:44:07 +0000 Subject: [PATCH 03/24] Updated wording --- README.md | 29 +++++++++++++++++------------ haplotypecaller-gvcf-gatk4-nio.wdl | 7 +++++-- haplotypecaller-gvcf-gatk4.wdl | 7 +++++-- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 2daca5a..3d915c5 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,33 @@ # gatk4-germline-snps-indels ### Purpose : -Workflows for germline short variant discovery with GATK4. +Workflows for [germline short variant discovery](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145) with GATK4. ### haplotypecaller-gvcf-gatk : -The haplotypecaller-gvcf-gatk4 workflow runs HaplotypeCaller -from GATK4 in GVCF mode on a single sample according to the GATK Best Practices, -scattered across intervals. +The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool +from GATK4 in GVCF mode on a single sample according to GATK Best Practices. +When executed the workflow scatters the HaplotypeCaller tool over a sample +using an intervals list file. The output file produced will be a +single gvcf file which can be used by the joint-discovery workflow. #### Requirements/expectations - One analysis-ready BAM file for a single sample (as identified in RG:SM) -- Set of variant calling intervals lists for the scatter, provided in a file +- A file containing a set of variant calling interval list for the scatter + #### Outputs - One GVCF file and its index ### joint-discovery-gatk : -The second WDL implements the joint discovery and VQSR -filtering portion of the GATK Best Practices (June 2016) for germline SNP and Indel -discovery in human whole-genome sequencing (WGS) and exome sequencing data. 
+This WDL implements the joint calling and VQSR filtering portion of the
+GATK Best Practices for germline SNP and Indel discovery
+in human whole-genome sequencing (WGS).
 
 *NOTE: joint-discovery-gatk4-local.wdl is a slightly modified version of the original to support users interested in running the workflow locally.*
 
 #### Requirements/expectations
 - One or more GVCFs produced by HaplotypeCaller in GVCF mode
 - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported.
-- When deteriming disk size in the json, use the guideline below
+- When determining disk size in the JSON, use the guideline below
   - small_disk = (num_gvcfs / 10) + 10
   - medium_disk = (num_gvcfs * 15) + 10
   - huge_disk = num_gvcfs + 10
@@ -36,11 +39,11 @@ discovery in human whole-genome sequencing (WGS) and exome sequencing data.
   in the FILTER field.
 
 ### Software version requirements :
-- GATK 4 or later
-- Samtools (see gotc docker)
+- GATK 4.1
+- Samtools 1.3.1
 - Python 2.7
 - Cromwell version support
-  - Successfully tested on v31
+  - Successfully tested on v37
   - Does not work on versions < v23 due to output syntax
 
 ### IMPORTANT NOTE :
@@ -78,4 +81,6 @@ view the following tutorial [(How to) Execute Workflows from the gatk-workflows
 ### LICENSING :
 Copyright Broad Institute, 2019 | BSD-3
 This script is released under the WDL open source code license (BSD-3) (full license text at https://github.com/openwdl/wdl/blob/master/LICENSE). Note however that the programs it calls may be subject to different licenses. Users are responsible for checking that they are authorized to run all programs before running this script.
+- [GATK](https://software.broadinstitute.org/gatk/download/licensing.php)
+- [Samtools](http://www.htslib.org/terms/)
 
diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl
index 64cc986..afcbbfb 100644
--- a/haplotypecaller-gvcf-gatk4-nio.wdl
+++ b/haplotypecaller-gvcf-gatk4-nio.wdl
@@ -1,7 +1,10 @@
 ## Copyright Broad Institute, 2019
 ##
-## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample
-## according to the GATK Best Practices (June 2016), scattered across intervals.
+## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
+## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
+## When executed, the workflow scatters the HaplotypeCaller tool over a sample
+## using an intervals list file. The output file produced will be a
+## single GVCF file which can be used by the joint-discovery workflow.
 ##
 ## Requirements/expectations :
 ## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl
index a8a292f..0270187 100644
--- a/haplotypecaller-gvcf-gatk4.wdl
+++ b/haplotypecaller-gvcf-gatk4.wdl
@@ -1,7 +1,10 @@
 ## Copyright Broad Institute, 2019
 ##
-## This WDL workflow runs HaplotypeCaller from GATK4 in GVCF mode on a single sample
-## according to the GATK Best Practices (June 2016), scattered across intervals.
+## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
+## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
+## When executed, the workflow scatters the HaplotypeCaller tool over a sample
+## using an intervals list file. The output file produced will be a
+## single GVCF file which can be used by the joint-discovery workflow.
 ##
 ## Requirements/expectations :
 ## - One analysis-ready BAM file for a single sample (as identified in RG:SM)

From bff1267d7ec8e2b89107fe845c011748cb36ae0c Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Sun, 24 Feb 2019 00:44:24 +0000
Subject: [PATCH 04/24] add a version of joint-discovery that takes in an
 array for gvcfs

---
 README.md                    |    8 +-
 joint-discovery-gatk4-fc.wdl | 1024 ++++++++++++++++++++++++++++++++++
 2 files changed, 1031 insertions(+), 1 deletion(-)
 create mode 100644 joint-discovery-gatk4-fc.wdl

diff --git a/README.md b/README.md
index 3d915c5..7bcc7c1 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,13 @@ This WDL implements the joint calling and VQSR filtering portion of the
 GATK Best Practices for germline SNP and Indel discovery
 in human whole-genome sequencing (WGS).
 
-*NOTE: joint-discovery-gatk4-local.wdl is a slightly modified version of the original to support users interested in running the workflow locally.*
+*NOTE:
+- joint-discovery-gatk4-local.wdl is a slightly modified version of the
+original to support users interested in running the workflow locally.
+- joint-discovery-gatk4-fc.wdl is a slightly modified version of the
+original to support users interested in running the workflow on FireCloud
+using an array of gvcfs as input.*
+
 
 #### Requirements/expectations
 - One or more GVCFs produced by HaplotypeCaller in GVCF mode
diff --git a/joint-discovery-gatk4-fc.wdl b/joint-discovery-gatk4-fc.wdl
new file mode 100644
index 0000000..0d195cc
--- /dev/null
+++ b/joint-discovery-gatk4-fc.wdl
@@ -0,0 +1,1024 @@
+## Copyright Broad Institute, 2018
+##
+## This WDL implements the joint discovery and VQSR filtering portion of the GATK
+## Best Practices (June 2016) for germline SNP and Indel discovery in human
+## whole-genome sequencing (WGS) and exome sequencing data.
+##
+## Requirements/expectations :
+## - One or more GVCFs produced by HaplotypeCaller in GVCF mode
+## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported.
+##
+## Outputs :
+## - A VCF file and its index, filtered using variant quality score recalibration
+##   (VQSR) with genotypes for all samples present in the input VCF. All sites that
+##   are present in the input VCF are retained; filtered sites are annotated as such
+##   in the FILTER field.
+## - Note that sample_names determines what each sample will be called in the output, which is not necessarily what the sample is called in its input GVCF.
+##
+## Note about VQSR wiring :
+## The SNP and INDEL models are built in parallel, but then the corresponding
+## recalibrations are applied in series. Because the INDEL model is generally ready
+## first (because there are fewer indels than SNPs) we set INDEL recalibration to
+## be applied first to the input VCF, while the SNP model is still being built. By
+## the time the SNP model is available, the indel-recalibrated file is available to
+## serve as input to apply the SNP recalibration. If we did it the other way around,
+## we would have to wait until the SNP recal file was available despite the INDEL
+## recal file being there already, then apply SNP recalibration, then apply INDEL
+## recalibration. This would lead to a longer wall clock time for complete workflow
+## execution. Wiring the INDEL recalibration to be applied first solves the problem.
+##
+## Cromwell version support
+## - Successfully tested on v31
+## - Does not work on versions < v23 due to output syntax
+##
+## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
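+## Runtime inputs follow an optional-override pattern: each *_override input
+## left unset falls back to its default via select_first (e.g. preemptible_tries
+## defaults to 3 when preemptible_tries_override is not supplied).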
+## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +workflow JointGenotyping { + # Input Sample + String callset_name + + Array[String] sample_names + Array[File] input_gvcfs + Array[File] input_gvcfs_indices + + # Reference and Resources + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File eval_interval_list + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + File unpadded_intervals_file + # Runtime attributes + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + + Int? small_disk_override + Int small_disk = select_first([small_disk_override, "100"]) + Int? medium_disk_override + Int medium_disk = select_first([medium_disk_override, "200"]) + Int? large_disk_override + Int large_disk = select_first([large_disk_override, "300"]) + Int? huge_disk_override + Int huge_disk = select_first([huge_disk_override, "400"]) + + String? preemptible_tries_override + Int preemptible_tries = select_first([preemptible_tries_override, "3"]) + + # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + Float snp_filter_level + Float indel_filter_level + Int SNP_VQSR_downsampleFactor + + Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) + Int num_gvcfs = length(input_gvcfs) + + # Make a 2.5:1 interval number to samples in callset ratio interval list + Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) + Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 + + call DynamicallyCombineIntervals { + input: + intervals = unpadded_intervals_file, + merge_count = merge_count, + preemptible_tries = preemptible_tries + } + + Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) + + scatter (idx in range(length(unpadded_intervals))) { + # the batch_size value was carefully chosen here as it + # is the optimal value for the amount of memory allocated + # within the task; please do not change it without consulting + # the Hellbender (GATK engine) team! 
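+    # Each shard of this scatter imports every input gVCF restricted to one
+    # combined interval group, so the shard count follows the interval list
+    # produced by DynamicallyCombineIntervals above.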
+ call ImportGVCFs { + input: + sample_names = sample_names, + input_gvcfs = input_gvcfs, + input_gvcfs_indices = input_gvcfs_indices, + interval = unpadded_intervals[idx], + workspace_dir_name = "genomicsdb", + disk_size = medium_disk, + batch_size = 50, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call GenotypeGVCFs { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = unpadded_intervals[idx], + output_vcf_filename = "output.vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + disk_size = medium_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call HardFilterAndMakeSitesOnlyVcf { + input: + vcf = GenotypeGVCFs.output_vcf, + vcf_index = GenotypeGVCFs.output_vcf_index, + excess_het_threshold = excess_het_threshold, + variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", + sites_only_vcf_filename = callset_name + "." + idx + ".sites_only.variant_filtered.vcf.gz", + disk_size = medium_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + call GatherVcfs as SitesOnlyGatherVcf { + input: + input_vcfs_fofn = write_lines(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf), + output_vcf_name = callset_name + ".sites_only.vcf.gz", + disk_size = medium_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".indels.recal", + tranches_filename = callset_name + ".indels.tranches", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + disk_size = small_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + if (num_gvcfs > 10000) { + call SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + downsampleFactor = SNP_VQSR_downsampleFactor, + model_report_filename = callset_name + ".snps.model.report", + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + disk_size = small_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = 
preemptible_tries + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { + call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], + sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], + recalibration_filename = callset_name + ".snps." + idx + ".recal", + tranches_filename = callset_name + ".snps." + idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + disk_size = small_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + call GatherTranches as SNPGatherTranches { + input: + input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches), + output_filename = callset_name + ".snps.gathered.tranches", + disk_size = small_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + + if (num_gvcfs <= 10000){ + call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + disk_size = small_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + Boolean is_small_callset = num_gvcfs <= 1000 + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { + call ApplyRecalibration { + input: + recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", + input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], + input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], + indels_recalibration = IndelsVariantRecalibrator.recalibration, + indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, + indels_tranches = IndelsVariantRecalibrator.tranches, + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + indel_filter_level = indel_filter_level, + snp_filter_level = snp_filter_level, + disk_size = medium_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + # for large callsets we need to collect metrics from the shards and gather them later + if (!is_small_callset) { + call CollectVariantCallingMetrics as CollectMetricsSharded { + input: + input_vcf = ApplyRecalibration.recalibrated_vcf, + input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, + metrics_filename_prefix = callset_name + "." + idx, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = medium_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + } + + # for small callsets we can gather the VCF shards and then collect metrics on it + if (is_small_callset) { + call GatherVcfs as FinalGatherVcf { + input: + input_vcfs_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), + output_vcf_name = callset_name + ".vcf.gz", + disk_size = huge_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { + input: + input_vcf = FinalGatherVcf.output_vcf, + input_vcf_index = FinalGatherVcf.output_vcf_index, + metrics_filename_prefix = callset_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = large_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + # for large callsets we still need to gather the sharded metrics + if (!is_small_callset) { + call GatherMetrics { + input: + input_details_fofn = write_lines(select_all(CollectMetricsSharded.detail_metrics_file)), + input_summaries_fofn = write_lines(select_all(CollectMetricsSharded.summary_metrics_file)), + output_prefix = callset_name, + disk_size = medium_disk, + docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + output { + # outputs from the small callset path through the wdl + File? output_vcf = FinalGatherVcf.output_vcf + File? 
output_vcf_index = FinalGatherVcf.output_vcf_index + + # select metrics from the small callset path and the large callset path + File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) + File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) + + # output the interval list generated/used by this run workflow + File output_intervals = DynamicallyCombineIntervals.output_intervals + } +} + +task GetNumberOfSamples { + File sample_name_map + String docker + Int preemptible_tries + command <<< + wc -l ${sample_name_map} | awk '{print $1}' + >>> + runtime { + docker: docker + memory: "1 GB" + preemptible: preemptible_tries + } + output { + Int sample_count = read_int(stdout()) + } +} + +task ImportGVCFs { + Array[String] sample_names + Array[File] input_gvcfs + Array[File] input_gvcfs_indices + String interval + + String workspace_dir_name + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + Int batch_size + + command <<< + set -e + set -o pipefail + + python << CODE + gvcfs = ['${sep="','" input_gvcfs}'] + sample_names = ['${sep="','" sample_names}'] + + if len(gvcfs)!= len(sample_names): + exit(1) + + with open("inputs.list", "w") as fi: + for i in range(len(gvcfs)): + fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") + + CODE + + rm -rf ${workspace_dir_name} + + # The memory setting here is very important and must be several GB lower + # than the total memory allocated to the VM because this tool uses + # a significant amount of non-heap memory for native libraries. + # Also, testing has shown that the multithreaded reader initialization + # does not scale well beyond 5 threads, so don't increase beyond that. 
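+    # As a rough illustration of the sizing note above: this task requests a
+    # 7 GB VM (see runtime below), so -Xmx4g leaves about 3 GB of headroom for
+    # GenomicsDB's native (off-heap) memory.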
+ ${gatk_path} --java-options "-Xmx4g -Xms4g" \ + GenomicsDBImport \ + --genomicsdb-workspace-path ${workspace_dir_name} \ + --batch-size ${batch_size} \ + -L ${interval} \ + --sample-name-map inputs.list \ + --reader-threads 5 \ + -ip 500 + + tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} + + >>> + runtime { + docker: docker + memory: "7 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_genomicsdb = "${workspace_dir_name}.tar" + } +} + +task GenotypeGVCFs { + File workspace_tar + String interval + + String output_vcf_filename + + String gatk_path + + File ref_fasta + File ref_fasta_index + File ref_dict + + String dbsnp_vcf + String docker + Int disk_size + Int preemptible_tries + + command <<< + set -e + + tar -xf ${workspace_tar} + WORKSPACE=$( basename ${workspace_tar} .tar) + + ${gatk_path} --java-options "-Xmx5g -Xms5g" \ + GenotypeGVCFs \ + -R ${ref_fasta} \ + -O ${output_vcf_filename} \ + -D ${dbsnp_vcf} \ + -G StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + --use-new-qual-calculator \ + -V gendb://$WORKSPACE \ + -L ${interval} + >>> + runtime { + docker: docker + memory: "7 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_vcf = "${output_vcf_filename}" + File output_vcf_index = "${output_vcf_filename}.tbi" + } +} + +task HardFilterAndMakeSitesOnlyVcf { + File vcf + File vcf_index + Float excess_het_threshold + + String variant_filtered_vcf_filename + String sites_only_vcf_filename + String gatk_path + + String docker + Int disk_size + Int preemptible_tries + + command { + set -e + + ${gatk_path} --java-options "-Xmx3g -Xms3g" \ + VariantFiltration \ + --filter-expression "ExcessHet > ${excess_het_threshold}" \ + --filter-name ExcessHet \ + -O ${variant_filtered_vcf_filename} \ + -V ${vcf} + + ${gatk_path} --java-options "-Xmx3g -Xms3g" \ + MakeSitesOnlyVcf \ + --INPUT ${variant_filtered_vcf_filename} \ + --OUTPUT ${sites_only_vcf_filename} + + } + runtime { + docker: docker + memory: "3.5 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File variant_filtered_vcf = "${variant_filtered_vcf_filename}" + File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" + File sites_only_vcf = "${sites_only_vcf_filename}" + File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" + } +} + +task IndelsVariantRecalibrator { + String recalibration_filename + String tranches_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File mills_resource_vcf + File axiomPoly_resource_vcf + File dbsnp_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf_index + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + + command { + ${gatk_path} --java-options "-Xmx24g -Xms24g" \ + VariantRecalibrator \ + -V ${sites_only_variant_filtered_vcf} \ + -O ${recalibration_filename} \ + --tranches-file ${tranches_filename} \ + --trust-all-polymorphic \ + -tranche ${sep=' -tranche ' recalibration_tranche_values} \ + -an ${sep=' -an ' recalibration_annotation_values} \ + -mode INDEL \ + --max-gaussians 4 \ + --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ + 
--resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} + } + runtime { + docker: docker + memory: "26 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibration = "${recalibration_filename}" + File recalibration_index = "${recalibration_filename}.idx" + File tranches = "${tranches_filename}" + } +} + +task SNPsVariantRecalibratorCreateModel { + String recalibration_filename + String tranches_filename + Int downsampleFactor + String model_report_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + + command { + ${gatk_path} --java-options "-Xmx100g -Xms100g" \ + VariantRecalibrator \ + -V ${sites_only_variant_filtered_vcf} \ + -O ${recalibration_filename} \ + --tranches-file ${tranches_filename} \ + --trust-all-polymorphic \ + -tranche ${sep=' -tranche ' recalibration_tranche_values} \ + -an ${sep=' -an ' recalibration_annotation_values} \ + -mode SNP \ + --sample-every-Nth-variant ${downsampleFactor} \ + --output-model ${model_report_filename} \ + --max-gaussians 6 \ + --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} + } + runtime { + docker: docker + memory: "104 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File model_report = "${model_report_filename}" + } +} + +task SNPsVariantRecalibrator { + String recalibration_filename + String tranches_filename + File? 
model_report + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + + command { + ${gatk_path} --java-options "-Xmx3g -Xms3g" \ + VariantRecalibrator \ + -V ${sites_only_variant_filtered_vcf} \ + -O ${recalibration_filename} \ + --tranches-file ${tranches_filename} \ + --trust-all-polymorphic \ + -tranche ${sep=' -tranche ' recalibration_tranche_values} \ + -an ${sep=' -an ' recalibration_annotation_values} \ + -mode SNP \ + ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ + --max-gaussians 6 \ + --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} + } + runtime { + docker: docker + memory: "3.5 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibration = "${recalibration_filename}" + File recalibration_index = "${recalibration_filename}.idx" + File tranches = "${tranches_filename}" + } +} + +task GatherTranches { + File input_fofn + String output_filename + + String gatk_path + + String docker + Int disk_size + Int preemptible_tries + + command <<< + set -e + set -o pipefail + + # this is here to deal with the JES bug where commands may be run twice + rm -rf tranches + + mkdir tranches + RETRY_LIMIT=5 + + count=0 + until cat ${input_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the tranches from the cloud' && exit 1 + fi + + cat ${input_fofn} | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list + + ${gatk_path} --java-options "-Xmx6g -Xms6g" \ + GatherTranches \ + --input inputs.list \ + --output ${output_filename} + >>> + runtime { + docker: docker + memory: "7 GB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File tranches = "${output_filename}" + } +} + +task ApplyRecalibration { + String recalibrated_vcf_filename + File input_vcf + File input_vcf_index + File indels_recalibration + File indels_recalibration_index + File indels_tranches + File snps_recalibration + File snps_recalibration_index + File snps_tranches + + Float indel_filter_level + Float snp_filter_level + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + + command { + set -e + + ${gatk_path} --java-options "-Xmx5g -Xms5g" \ + ApplyVQSR \ + -O tmp.indel.recalibrated.vcf \ + -V ${input_vcf} \ + --recal-file ${indels_recalibration} \ + --tranches-file ${indels_tranches} \ + --truth-sensitivity-filter-level ${indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + ${gatk_path} --java-options "-Xmx5g -Xms5g" \ + ApplyVQSR \ + -O ${recalibrated_vcf_filename} \ + -V 
tmp.indel.recalibrated.vcf \ + --recal-file ${snps_recalibration} \ + --tranches-file ${snps_tranches} \ + --truth-sensitivity-filter-level ${snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + } + runtime { + docker: docker + memory: "7 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File recalibrated_vcf = "${recalibrated_vcf_filename}" + File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" + } +} + +task GatherVcfs { + File input_vcfs_fofn + String output_vcf_name + String gatk_path + + String docker + Int disk_size + Int preemptible_tries + + command <<< + set -e + + # Now using NIO to localize the vcfs but the input file must have a ".list" extension + mv ${input_vcfs_fofn} inputs.list + + # --ignore-safety-checks makes a big performance difference so we include it in our invocation. + # This argument disables expensive checks that the file headers contain the same set of + # genotyped samples and that files are in order by position of first record. + ${gatk_path} --java-options "-Xmx6g -Xms6g" \ + GatherVcfsCloud \ + --ignore-safety-checks \ + --gather-type BLOCK \ + --input inputs.list \ + --output ${output_vcf_name} + + ${gatk_path} --java-options "-Xmx6g -Xms6g" \ + IndexFeatureFile \ + --feature-file ${output_vcf_name} + >>> + runtime { + docker: docker + memory: "7 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File output_vcf = "${output_vcf_name}" + File output_vcf_index = "${output_vcf_name}.tbi" + } +} + +task CollectVariantCallingMetrics { + File input_vcf + File input_vcf_index + + String metrics_filename_prefix + File dbsnp_vcf + File dbsnp_vcf_index + File interval_list + File ref_dict + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + + command { + ${gatk_path} --java-options "-Xmx6g -Xms6g" \ + CollectVariantCallingMetrics \ + --INPUT ${input_vcf} \ + --DBSNP ${dbsnp_vcf} \ + --SEQUENCE_DICTIONARY ${ref_dict} \ + --OUTPUT ${metrics_filename_prefix} \ + --THREAD_COUNT 8 \ + --TARGET_INTERVALS ${interval_list} + } + output { + File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" + } + runtime { + docker: docker + memory: "7 GB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } +} + +task GatherMetrics { + File input_details_fofn + File input_summaries_fofn + + String output_prefix + + String gatk_path + String docker + Int disk_size + Int preemptible_tries + + command <<< + set -e + set -o pipefail + + # this is here to deal with the JES bug where commands may be run twice + rm -rf metrics + + mkdir metrics + RETRY_LIMIT=5 + + count=0 + until cat ${input_details_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + count=0 + until cat ${input_summaries_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + INPUT=`cat ${input_details_fofn} | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk 
'{printf("-I=metrics/%s ", $1)}'` + + ${gatk_path} --java-options "-Xmx2g -Xms2g" \ + AccumulateVariantCallingMetrics \ + $INPUT \ + -O ${output_prefix} + >>> + runtime { + docker: docker + memory: "3 GB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + } + output { + File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" + } +} + +task DynamicallyCombineIntervals { + File intervals + Int merge_count + Int preemptible_tries + + command { + python << CODE + def parse_interval(interval): + colon_split = interval.split(":") + chromosome = colon_split[0] + dash_split = colon_split[1].split("-") + start = int(dash_split[0]) + end = int(dash_split[1]) + return chromosome, start, end + + def add_interval(chr, start, end): + lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) + return chr, start, end + + count = 0 + chain_count = ${merge_count} + l_chr, l_start, l_end = "", 0, 0 + lines_to_write = [] + with open("${intervals}") as f: + with open("out.intervals", "w") as f1: + for line in f.readlines(): + # initialization + if count == 0: + w_chr, w_start, w_end = parse_interval(line) + count = 1 + continue + # reached number to combine, so spit out and start over + if count == chain_count: + l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) + w_chr, w_start, w_end = parse_interval(line) + count = 1 + continue + + c_chr, c_start, c_end = parse_interval(line) + # if adjacent keep the chain going + if c_chr == w_chr and c_start == w_end + 1: + w_end = c_end + count += 1 + continue + # not adjacent, end here and start a new chain + else: + l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) + w_chr, w_start, w_end = parse_interval(line) + count = 1 + if l_char != w_chr or l_start != w_start or l_end != w_end: + add_interval(w_chr, w_start, w_end) + f1.writelines("\n".join(lines_to_write)) + CODE + } + + runtime { + memory: "3 GB" + preemptible: preemptible_tries + docker: "python:2.7" + } + + output { + File output_intervals = "out.intervals" + } +} + From 082df3b8dc0dc38f5797c53c90eecda12903d7f0 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 28 Feb 2019 16:38:35 -0500 Subject: [PATCH 05/24] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7bcc7c1..9880957 100644 --- a/README.md +++ b/README.md @@ -22,10 +22,10 @@ This WDL implements the joint calling and VQSR filtering portion of the GATK Best Practices for germline SNP and Indel discovery in human whole-genome sequencing (WGS). -*NOTE: -- joint-discovery-gatk4-local.wdl is a slightly modified version of the -original to support users interested in running the workflow locally. 
-- joint-discovery-gatk4-fc.wdl is a slightly modified version of the
+*NOTE:*
+*- joint-discovery-gatk4-local.wdl is a slightly modified version of the
+original to support users interested in running the workflow locally.*
+*- joint-discovery-gatk4-fc.wdl is a slightly modified version of the
 original to support users interested in running the workflow on FireCloud
 using an array of GVCFs as input.*
 

From 093e47845fa3982b5e69a5497f43bb78879342ac Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Fri, 3 May 2019 18:30:46 +0000
Subject: [PATCH 06/24] increased mem for SNPsVariantRecalibrator for papiv2

---
 joint-discovery-gatk4.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/joint-discovery-gatk4.wdl b/joint-discovery-gatk4.wdl
index 6e81df3..02ccfbf 100644
--- a/joint-discovery-gatk4.wdl
+++ b/joint-discovery-gatk4.wdl
@@ -681,7 +681,7 @@ task SNPsVariantRecalibrator {
   }
   runtime {
     docker: docker
-    memory: "3.5 GB"
+    memory: "7.5 GB"
     cpu: "2"
     disks: "local-disk " + disk_size + " HDD"
     preemptible: preemptible_tries

From 329f152049067f722640164099575b49c131cf8d Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Fri, 3 May 2019 18:44:04 +0000
Subject: [PATCH 07/24] increased mem for SNPsVariantRecalibrator for papiv2

---
 joint-discovery-gatk4-fc.wdl | 2 +-
 joint-discovery-gatk4.wdl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/joint-discovery-gatk4-fc.wdl b/joint-discovery-gatk4-fc.wdl
index 0d195cc..a9bec80 100644
--- a/joint-discovery-gatk4-fc.wdl
+++ b/joint-discovery-gatk4-fc.wdl
@@ -702,7 +702,7 @@ task SNPsVariantRecalibrator {
   }
   runtime {
     docker: docker
-    memory: "3.5 GB"
+    memory: "7.5 GB"
     cpu: "2"
     disks: "local-disk " + disk_size + " HDD"
     preemptible: preemptible_tries
diff --git a/joint-discovery-gatk4.wdl b/joint-discovery-gatk4.wdl
index 02ccfbf..6e81df3 100644
--- a/joint-discovery-gatk4.wdl
+++ b/joint-discovery-gatk4.wdl
@@ -681,7 +681,7 @@ task SNPsVariantRecalibrator {
   }
   runtime {
     docker: docker
-    memory: "7.5 GB"
+    memory: "3.5 GB"
     cpu: "2"
     disks: "local-disk " + disk_size + " HDD"
     preemptible: preemptible_tries

From 7184b70b1344dd7f1133da8d7d3f7266147e82cc Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Mon, 6 May 2019 01:05:08 +0000
Subject: [PATCH 08/24] increased memory to run on papiv2

---
 joint-discovery-gatk4.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/joint-discovery-gatk4.wdl b/joint-discovery-gatk4.wdl
index 6e81df3..02ccfbf 100644
--- a/joint-discovery-gatk4.wdl
+++ b/joint-discovery-gatk4.wdl
@@ -681,7 +681,7 @@ task SNPsVariantRecalibrator {
   }
   runtime {
     docker: docker
-    memory: "3.5 GB"
+    memory: "7.5 GB"
     cpu: "2"
     disks: "local-disk " + disk_size + " HDD"
     preemptible: preemptible_tries

From 6304e56fbace8d34c9c3935cc89e29a0ece9e8b5 Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Tue, 16 Jul 2019 16:47:52 +0000
Subject: [PATCH 09/24] correction to variable names in json

---
 joint-discovery-gatk4-local.hg38.wgs.inputs.json | 4 ++--
 joint-discovery-gatk4.hg38.wgs.inputs.json | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/joint-discovery-gatk4-local.hg38.wgs.inputs.json b/joint-discovery-gatk4-local.hg38.wgs.inputs.json
index bd0c045..2742e5d 100644
--- a/joint-discovery-gatk4-local.hg38.wgs.inputs.json
+++ b/joint-discovery-gatk4-local.hg38.wgs.inputs.json
@@ -38,10 +38,10 @@
   "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"],
 
   "##_COMMENT4": "DOCKERS",
-  "#PreProcessingForVariantDiscovery_GATK4.gatk_docker_override": 
"String? (optional)", + "#JointGenotyping.gatk_docker_override": "String? (optional)", "##_COMMENT5": "PATHS", - "#PreProcessingForVariantDiscovery_GATK4.gatk_path_override": "String? (optional)", + "#JointGenotyping.gatk_path_override": "String? (optional)", "##_COMMENT8": "DISK SIZE ALLOCATION", "#JointGenotyping.small_disk_override": "Int? (optional)", diff --git a/joint-discovery-gatk4.hg38.wgs.inputs.json b/joint-discovery-gatk4.hg38.wgs.inputs.json index cc45b76..4bd2d8b 100644 --- a/joint-discovery-gatk4.hg38.wgs.inputs.json +++ b/joint-discovery-gatk4.hg38.wgs.inputs.json @@ -36,10 +36,10 @@ "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], "##_COMMENT4": "DOCKERS", - "#PreProcessingForVariantDiscovery_GATK4.gatk_docker_override": "String? (optional)", + "#JointGenotyping.gatk_docker_override": "String? (optional)", "##_COMMENT5": "PATHS", - "#PreProcessingForVariantDiscovery_GATK4.gatk_path_override": "String? (optional)", + "#JointGenotyping.gatk_path_override": "String? (optional)", "##_COMMENT6": "JAVA OPTIONS", "JointGenotyping.SNPsVariantRecalibratorScattered.java_opt": "-Xmx3g -Xms3g", @@ -79,6 +79,6 @@ "#JointGenotyping.huge_disk_override": "Int? (optional)", "##_COMMENT9": "PREEMPTIBLES", - "#PreProcessingForVariantDiscovery_GATK4.preemptible_tries_override": "Int? (optional)" + "#JointGenotyping.preemptible_tries_override": "Int? (optional)" } From 411c494901f13adbad939c78529893ce36ac39a9 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 30 Oct 2019 17:54:07 +0000 Subject: [PATCH 10/24] corrected parameter syntax for SNPsVariantRecalibrator task --- joint-discovery-gatk4-local.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/joint-discovery-gatk4-local.wdl b/joint-discovery-gatk4-local.wdl index c41d7a6..771debd 100644 --- a/joint-discovery-gatk4-local.wdl +++ b/joint-discovery-gatk4-local.wdl @@ -670,10 +670,10 @@ task SNPsVariantRecalibrator { -mode SNP \ ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15:${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12:${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10:${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7:${dbsnp_resource_vcf} + --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ + --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ + --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ + --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} } runtime { docker: docker From 043e443fa6ec05966e4b49c3de4b34424f2956ad Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 1 Nov 2019 13:28:56 +0000 Subject: [PATCH 11/24] Updated haplotypecaller to WDL 1.0, removed comments from haplotypecaller # --- README.md | 17 +- haplotypecaller-gvcf-gatk4-nio.wdl | 254 ------------------ ...typecaller-gvcf-gatk4.hg38.wgs.inputs.json | 37 +-- haplotypecaller-gvcf-gatk4.wdl | 222 ++++++++------- 4 files changed, 135 insertions(+), 395 deletions(-) delete mode 100644 haplotypecaller-gvcf-gatk4-nio.wdl diff --git a/README.md b/README.md index 9880957..ee9821c 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,16 @@ Workflows for [germline short variant 
discovery](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145) with GATK4.
 
 ### haplotypecaller-gvcf-gatk :
-The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
-from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
-When executed the workflow scatters the HaplotypeCaller tool over a sample
-using an intervals list file. The output file produced will be a
-single gvcf file which can be used by the joint-discovery workflow.
+The haplotypecaller-gvcf-gatk4 workflow runs the GATK4 HaplotypeCaller tool
+in GVCF mode on a single sample according to GATK Best Practices. When
+executed the workflow scatters the HaplotypeCaller tool over the input bam sample
+using an interval list file. The output produced by the workflow will be a single GVCF
+file which can then be provided to the joint-discovery workflow along with several other
+GVCF files to call variants simultaneously, producing a multisample VCF.
+The haplotypecaller-gvcf-gatk4 workflow's default GVCF mode is useful when calling variants
+for several samples efficiently. However, when calling variants for only one or a
+few samples, it is possible to have the workflow call variants directly and output a VCF
+file by setting the `make_gvcf` input variable to `false`.
 
 #### Requirements/expectations
 - One analysis-ready BAM file for a single sample (as identified in RG:SM)
@@ -45,7 +50,7 @@ using an array of GVCFs as input.*
 in the FILTER field.
 
 ### Software version requirements :
-- GATK 4.1
+- GATK 4.1.4.0
 - Samtools 1.3.1
 - Python 2.7
 - Cromwell version support
diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl
deleted file mode 100644
index afcbbfb..0000000
--- a/haplotypecaller-gvcf-gatk4-nio.wdl
+++ /dev/null
@@ -1,254 +0,0 @@
-## Copyright Broad Institute, 2019
-##
-## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
-## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
-## When executed the workflow scatters the HaplotypeCaller tool over a sample
-## using an intervals list file. The output file produced will be a
-## single gvcf file which can be used by the joint-discovery workflow.
-##
-## Requirements/expectations :
-## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
-## - Set of variant calling intervals lists for the scatter, provided in a file
-##
-## Outputs :
-## - One GVCF file and its index
-##
-## Cromwell version support
-## - Successfully tested on v37
-## - Does not work on versions < v23 due to output syntax
-##
-## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
-##
-## LICENSING :
-## This script is released under the WDL source code license (BSD-3) (see LICENSE in
-## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
-## be subject to different licenses. Users are responsible for checking that they are
-## authorized to run all programs before running this script. Please see the dockers
-## for detailed licensing information pertaining to the included programs.
-
-# WORKFLOW DEFINITION
-workflow HaplotypeCallerGvcf_GATK4 {
-  File input_bam
-  File input_bam_index
-  File ref_dict
-  File ref_fasta
-  File ref_fasta_index
-  File scattered_calling_intervals_list
-
-  Boolean? make_gvcf
-  Boolean making_gvcf = select_first([make_gvcf,true])
-
-  String? gatk_docker_override
-  String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"])
-  String? 
gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - String? gitc_docker_override - String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - String? samtools_path_override - String samtools_path = select_first([samtools_path_override, "samtools"]) - - Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) - - #is the input a cram file? - Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" - - String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") - String vcf_basename = sample_basename - String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" - String output_filename = vcf_basename + output_suffix - - # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. - # If we take the number we are scattering by and reduce by 20 we will have enough disk space - # to account for the fact that the data is quite uneven across the shards. - Int potential_hc_divisor = length(scattered_calling_intervals) - 20 - Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 - - - if ( is_cram ) { - call CramToBamTask { - input: - input_cram = input_bam, - sample_name = sample_basename, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - docker = gitc_docker, - samtools_path = samtools_path - } - } - - # Call variants in parallel over grouped calling intervals - scatter (interval_file in scattered_calling_intervals) { - - # Generate GVCF by interval - call HaplotypeCaller { - input: - input_bam = select_first([CramToBamTask.output_bam, input_bam]), - input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), - interval_list = interval_file, - output_filename = output_filename, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - hc_scatter = hc_divisor, - make_gvcf = making_gvcf, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - # Merge per-interval GVCFs - call MergeGVCFs { - input: - input_vcfs = HaplotypeCaller.output_vcf, - input_vcfs_indexes = HaplotypeCaller.output_vcf_index, - output_filename = output_filename, - docker = gatk_docker, - gatk_path = gatk_path - } - - # Outputs that will be retained when execution is complete - output { - File output_vcf = MergeGVCFs.output_vcf - File output_vcf_index = MergeGVCFs.output_vcf_index - } -} - -# TASK DEFINITIONS - -task CramToBamTask { - # Command parameters - File ref_fasta - File ref_fasta_index - File ref_dict - File input_cram - String sample_name - - # Runtime parameters - String docker - Int? machine_mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? 
preemptible_attempts - String samtools_path - - Float output_bam_size = size(input_cram, "GB") / 0.60 - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 - - command { - set -e - set -o pipefail - - ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | - ${samtools_path} view -b -o ${sample_name}.bam - - ${samtools_path} index -b ${sample_name}.bam - mv ${sample_name}.bam.bai ${sample_name}.bai - } - runtime { - docker: docker - memory: select_first([machine_mem_gb, 15]) + " GB" - disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - } - output { - File output_bam = "${sample_name}.bam" - File output_bai = "${sample_name}.bai" - } -} - -# HaplotypeCaller per-sample in GVCF mode -task HaplotypeCaller { - String input_bam - String input_bam_index - File interval_list - String output_filename - File ref_dict - File ref_fasta - File ref_fasta_index - Float? contamination - Boolean make_gvcf - Int hc_scatter - - String gatk_path - String? java_options - String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts - - Int machine_mem_gb = select_first([mem_gb, 7]) - Int command_mem_gb = machine_mem_gb - 1 - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 - - command <<< - set -e - - ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ - HaplotypeCaller \ - -R ${ref_fasta} \ - -I ${input_bam} \ - -L ${interval_list} \ - -O ${output_filename} \ - -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} - >>> - - runtime { - docker: docker - memory: machine_mem_gb + " GB" - disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - } - - output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" - } -} -# Merge GVCFs generated per-interval for the same sample -task MergeGVCFs { - Array[File] input_vcfs - Array[File] input_vcfs_indexes - String output_filename - - String gatk_path - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? 
preemptible_attempts - - Int machine_mem_gb = select_first([mem_gb, 3]) - Int command_mem_gb = machine_mem_gb - 1 - - command <<< - set -e - - ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ - MergeVcfs \ - --INPUT ${sep=' --INPUT ' input_vcfs} \ - --OUTPUT ${output_filename} - >>> - - runtime { - docker: docker - memory: machine_mem_gb + " GB" - disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - } - - - output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" - } -} - diff --git a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json index 0b5df9a..313ce6b 100644 --- a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json +++ b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json @@ -1,45 +1,10 @@ { - "##_COMMENT1": "INPUT BAM", - "#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", - "#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai", - "##_COMMENT2": "REFERENCE FILES", "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "##_COMMENT3": "INTERVALS", - "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", - - "##_COMMENT4": "MISCELLANEOUS PARAMETERS", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)", - - "##_COMMENT5": "DOCKERS", - "#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)", - "#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)", - - "##_COMMENT6": "PATHS", - "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", - "#HaplotypeCallerGvcf_GATK4.samtools_path_override": "String? (optional)", - - "##_COMMENT7": "JAVA OPTIONS", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)", - - "##_COMMENT8": "MEMORY ALLOCATION", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)", - - "##_COMMENT9": "DISK SIZE ALLOCATION", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? (optional)", - - "##_COMMENT10": "PREEMPTION", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? 
(optional)" + "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt" } diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index 0270187..abf9092 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2019 ## ## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool @@ -28,45 +30,53 @@ # WORKFLOW DEFINITION workflow HaplotypeCallerGvcf_GATK4 { - File input_bam - File input_bam_index - File ref_dict - File ref_fasta - File ref_fasta_index - File scattered_calling_intervals_list + input { + File input_bam + File input_bam_index + File ref_dict + File ref_fasta + File ref_fasta_index + File scattered_calling_intervals_list - Boolean? make_gvcf - Boolean making_gvcf = select_first([make_gvcf,true]) - - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - String? gitc_docker_override - String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - String? samtools_path_override - String samtools_path = select_first([samtools_path_override, "samtools"]) + Boolean? make_gvcf + Boolean making_gvcf = select_first([make_gvcf,true]) + + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.4.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + String? gitc_docker_override + String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) + String? samtools_path_override + String samtools_path = select_first([samtools_path_override, "samtools"]) - Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + #is the input a cram file? + Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" - #is the input a cram file? - Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" + String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") + String vcf_basename = sample_basename + String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_filename = vcf_basename + output_suffix - String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") - String vcf_basename = sample_basename - String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" - String output_filename = vcf_basename + output_suffix + # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. + # If we take the number we are scattering by and reduce by 20 we will have enough disk space + # to account for the fact that the data is quite uneven across the shards. 
+ Int potential_hc_divisor = length(scattered_calling_intervals) - 20 + Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 + } if ( is_cram ) { call CramToBamTask { - input: - input_cram = input_bam, - sample_name = sample_basename, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - docker = gitc_docker, - samtools_path = samtools_path + input: + input_cram = input_bam, + sample_name = sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + docker = gitc_docker, + samtools_path = samtools_path } } @@ -83,6 +93,7 @@ workflow HaplotypeCallerGvcf_GATK4 { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, + hc_scatter = hc_divisor, make_gvcf = making_gvcf, docker = gatk_docker, gatk_path = gatk_path @@ -109,25 +120,26 @@ workflow HaplotypeCallerGvcf_GATK4 { # TASK DEFINITIONS task CramToBamTask { - # Command parameters - File ref_fasta - File ref_fasta_index - File ref_dict - File input_cram - String sample_name - - # Runtime parameters - String docker - Int? machine_mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts - String samtools_path - - Float output_bam_size = size(input_cram, "GB") / 0.60 - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 - + input { + # Command parameters + File ref_fasta + File ref_fasta_index + File ref_dict + File input_cram + String sample_name + + # Runtime parameters + String docker + Int? machine_mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? preemptible_attempts + String samtools_path + } + Float output_bam_size = size(input_cram, "GB") / 0.60 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 + command { set -e set -o pipefail @@ -151,35 +163,50 @@ task CramToBamTask { # HaplotypeCaller per-sample in GVCF mode task HaplotypeCaller { - File input_bam - File input_bam_index - File interval_list - String output_filename - File ref_dict - File ref_fasta - File ref_fasta_index - Float? contamination - Boolean make_gvcf - - String gatk_path - String? java_options - String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts + input { + # Command parameters + File input_bam + File input_bam_index + File interval_list + String output_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Boolean make_gvcf + Int hc_scatter + + String gatk_path + String? java_options + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? 
preemptible_attempts + } + + String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) Int machine_mem_gb = select_first([mem_gb, 7]) Int command_mem_gb = machine_mem_gb - 1 Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command <<< - set -e + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 + + parameter_meta { + input_bam: { + description: "a bam file", + localization_optional: true + } + input_bam_index: { + description: "an index file for the bam input", + localization_optional: true + } + } + command { + set -e ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ HaplotypeCaller \ @@ -188,15 +215,13 @@ task HaplotypeCaller { -L ${interval_list} \ -O ${output_filename} \ -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} - >>> - + } runtime { docker: docker memory: machine_mem_gb + " GB" disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) } - output { File output_vcf = "${output_filename}" File output_vcf_index = "${output_filename}.tbi" @@ -204,39 +229,38 @@ task HaplotypeCaller { } # Merge GVCFs generated per-interval for the same sample task MergeGVCFs { - Array[File] input_vcfs - Array[File] input_vcfs_indexes - String output_filename - - String gatk_path - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts - - Int machine_mem_gb = select_first([mem_gb, 3]) - Int command_mem_gb = machine_mem_gb - 1 - - command <<< + input { + # Command parameters + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_filename + + String gatk_path + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? preemptible_attempts + } + Int machine_mem_gb = select_first([mem_gb, 3]) + Int command_mem_gb = machine_mem_gb - 1 + + command { set -e ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ MergeVcfs \ --INPUT ${sep=' --INPUT ' input_vcfs} \ --OUTPUT ${output_filename} - >>> - + } runtime { docker: docker memory: machine_mem_gb + " GB" disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) } - - output { File output_vcf = "${output_filename}" File output_vcf_index = "${output_filename}.tbi" From c4ce8f6376472a937dc226be11ef1e4f7e66b3de Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 1 Nov 2019 18:10:19 +0000 Subject: [PATCH 12/24] Added WDL 1.0 version of JointGenotyping to soon replace joint discovery wdl --- JointGenotyping.wdl | 498 +++++++++++++++ tasks/JointGenotypingTasks.wdl | 1069 ++++++++++++++++++++++++++++++++ 2 files changed, 1567 insertions(+) create mode 100644 JointGenotyping.wdl create mode 100644 tasks/JointGenotypingTasks.wdl diff --git a/JointGenotyping.wdl b/JointGenotyping.wdl new file mode 100644 index 0000000..58cf33c --- /dev/null +++ b/JointGenotyping.wdl @@ -0,0 +1,498 @@ +version 1.0 + +## Copyright Broad Institute, 2019 +## +## This WDL implements the joint discovery and VQSR filtering portion of the GATK +## Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) and exome sequencing data. 
+## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. +## +## Outputs : +## - A VCF file and its index, filtered using variant quality score recalibration +## (VQSR) with genotypes for all samples present in the input VCF. All sites that +## are present in the input VCF are retained; filtered sites are annotated as such +## in the FILTER field. +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v47 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# WORKFLOW DEFINITION + +import "./tasks/JointGenotypingTasks.wdl" as Tasks + + +# Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) +workflow JointGenotyping { + + String pipeline_version = "1.1" + + input { + File unpadded_intervals_file + + String callset_name + File sample_name_map + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + Int small_disk + Int medium_disk + Int large_disk + Int huge_disk + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File haplotype_database + + File eval_interval_list + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + Float snp_filter_level + Float indel_filter_level + Int SNP_VQSR_downsampleFactor + + Int? top_level_scatter_count + Boolean? gather_vcfs + Int snps_variant_recalibration_threshold = 500000 + Boolean rename_gvcf_samples = true + Float unbounded_scatter_count_scale_factor = 0.15 + Int gnarly_scatter_count = 10 + Boolean use_gnarly_genotyper = false + Boolean cross_check_fingerprints = true + Boolean scatter_cross_check_fingerprints = false + } + + Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) + Int num_gvcfs = length(sample_name_map_lines) + + # Make a 2.5:1 interval number to samples in callset ratio interval list. + # We allow overriding the behavior by specifying the desired number of vcfs + # to scatter over for testing / special requests. + # Zamboni notes say "WGS runs get 30x more scattering than Exome" and + # exome scatterCountPerSample is 0.05, min scatter 10, max 1000 + + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + # We allow overriding this default behavior for testing / special requests. + Boolean is_small_callset = select_first([gather_vcfs, num_gvcfs <= 1000]) + + Int unbounded_scatter_count = select_first([top_level_scatter_count, round(unbounded_scatter_count_scale_factor * num_gvcfs)]) + Int scatter_count = if unbounded_scatter_count > 2 then unbounded_scatter_count else 2 #I think weird things happen if scatterCount is 1 -- IntervalListTools is noop? + + call Tasks.CheckSamplesUnique { + input: + sample_name_map = sample_name_map + } + + call Tasks.SplitIntervalList { + input: + interval_list = unpadded_intervals_file, + scatter_count = scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique + } + + Array[File] unpadded_intervals = SplitIntervalList.output_intervals + + scatter (idx in range(length(unpadded_intervals))) { + # The batch_size value was carefully chosen here as it + # is the optimal value for the amount of memory allocated + # within the task; please do not change it without consulting + # the Hellbender (GATK engine) team! + call Tasks.ImportGVCFs { + input: + sample_name_map = sample_name_map, + interval = unpadded_intervals[idx], + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + workspace_dir_name = "genomicsdb", + disk_size = medium_disk, + batch_size = 50 + } + + if (use_gnarly_genotyper) { + + call Tasks.SplitIntervalList as GnarlyIntervalScatterDude { + input: + interval_list = unpadded_intervals[idx], + scatter_count = gnarly_scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique + } + + Array[File] gnarly_intervals = GnarlyIntervalScatterDude.output_intervals + + scatter (gnarly_idx in range(length(gnarly_intervals))) { + call Tasks.GnarlyGenotyper { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = gnarly_intervals[gnarly_idx], + output_vcf_filename = callset_name + "." + idx + "." 
+ gnarly_idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + } + } + + Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf + + call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { + input: + input_vcfs = gnarly_gvcfs, + output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", + disk_size = large_disk + } + } + + if (!use_gnarly_genotyper) { + call Tasks.GenotypeGVCFs { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = unpadded_intervals[idx], + output_vcf_filename = callset_name + "." + idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + disk_size = medium_disk + } + } + + File genotyped_vcf = select_first([TotallyRadicalGatherVcfs.output_vcf, GenotypeGVCFs.output_vcf]) + File genotyped_vcf_index = select_first([TotallyRadicalGatherVcfs.output_vcf_index, GenotypeGVCFs.output_vcf_index]) + + call Tasks.HardFilterAndMakeSitesOnlyVcf { + input: + vcf = genotyped_vcf, + vcf_index = genotyped_vcf_index, + excess_het_threshold = excess_het_threshold, + variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", + sites_only_vcf_filename = callset_name + "." + idx + ".sites_only.variant_filtered.vcf.gz", + disk_size = medium_disk + } + } + + call Tasks.GatherVcfs as SitesOnlyGatherVcf { + input: + input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, + output_vcf_name = callset_name + ".sites_only.vcf.gz", + disk_size = medium_disk + } + + call Tasks.IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".indels.recal", + tranches_filename = callset_name + ".indels.tranches", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = !use_gnarly_genotyper, + disk_size = small_disk + } + + if (num_gvcfs > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + downsampleFactor = SNP_VQSR_downsampleFactor, + model_report_filename = callset_name + ".snps.model.report", + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = 
!use_gnarly_genotyper, + disk_size = small_disk + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], + sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], + recalibration_filename = callset_name + ".snps." + idx + ".recal", + tranches_filename = callset_name + ".snps." + idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = !use_gnarly_genotyper, + disk_size = small_disk + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = callset_name + ".snps.gathered.tranches", + disk_size = small_disk + } + } + + if (num_gvcfs <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = !use_gnarly_genotyper, + disk_size = small_disk + } + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { + #for really large callsets we give to friends, just apply filters to the sites-only + call Tasks.ApplyRecalibration { + input: + recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", + input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], + input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], + indels_recalibration = IndelsVariantRecalibrator.recalibration, + indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, + indels_tranches = IndelsVariantRecalibrator.tranches, + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + indel_filter_level = indel_filter_level, + snp_filter_level = snp_filter_level, + use_allele_specific_annotations = !use_gnarly_genotyper, + disk_size = medium_disk + } + + # For large callsets we need to collect metrics from the shards and gather them later. + if (!is_small_callset) { + call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { + input: + input_vcf = ApplyRecalibration.recalibrated_vcf, + input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, + metrics_filename_prefix = callset_name + "." + idx, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = medium_disk + } + } + } + + # For small callsets we can gather the VCF shards and then collect metrics on it. + if (is_small_callset) { + call Tasks.GatherVcfs as FinalGatherVcf { + input: + input_vcfs = ApplyRecalibration.recalibrated_vcf, + output_vcf_name = callset_name + ".vcf.gz", + disk_size = huge_disk + } + + call Tasks.CollectVariantCallingMetrics as CollectMetricsOnFullVcf { + input: + input_vcf = FinalGatherVcf.output_vcf, + input_vcf_index = FinalGatherVcf.output_vcf_index, + metrics_filename_prefix = callset_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = large_disk + } + } + + if (!is_small_callset) { + # For large callsets we still need to gather the sharded metrics. + call Tasks.GatherVariantCallingMetrics { + input: + input_details = select_all(CollectMetricsSharded.detail_metrics_file), + input_summaries = select_all(CollectMetricsSharded.summary_metrics_file), + output_prefix = callset_name, + disk_size = medium_disk + } + } + + # CrossCheckFingerprints takes forever on large callsets. + # We scatter over the input GVCFs to make things faster. 
+ if (scatter_cross_check_fingerprints) { + call Tasks.GetFingerprintingIntervalIndices { + input: + unpadded_intervals = unpadded_intervals, + haplotype_database = haplotype_database + } + + Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint + + scatter (idx in fingerprinting_indices) { + File vcfs_to_fingerprint = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx] + } + + call Tasks.GatherVcfs as GatherFingerprintingVcfs { + input: + input_vcfs = vcfs_to_fingerprint, + output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", + disk_size = medium_disk + } + + call Tasks.SelectFingerprintSiteVariants { + input: + input_vcf = GatherFingerprintingVcfs.output_vcf, + base_output_name = callset_name + ".fingerprinting", + haplotype_database = haplotype_database, + disk_size = medium_disk + } + + call Tasks.PartitionSampleNameMap { + input: + sample_name_map = sample_name_map, + line_limit = 1000 + } + + scatter (idx in range(length(PartitionSampleNameMap.partitions))) { + + Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx]) + + call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered { + input: + gvcf_paths = files_in_partition, + vcf_paths = vcfs_to_fingerprint, + sample_name_map = sample_name_map, + haplotype_database = haplotype_database, + output_base_name = callset_name + "." + idx, + scattered = true + } + } + + call Tasks.GatherPicardMetrics as GatherFingerprintingMetrics { + input: + metrics_files = CrossCheckFingerprintsScattered.crosscheck_metrics, + output_file_name = callset_name + ".fingerprintcheck", + disk_size = small_disk + } + } + + if (!scatter_cross_check_fingerprints) { + + scatter (line in sample_name_map_lines) { + File gvcf_paths = line[1] + } + + call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo { + input: + gvcf_paths = gvcf_paths, + vcf_paths = ApplyRecalibration.recalibrated_vcf, + sample_name_map = sample_name_map, + haplotype_database = haplotype_database, + output_base_name = callset_name + } + } + + # Get the metrics from either code path + File output_detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherVariantCallingMetrics.detail_metrics_file]) + File output_summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherVariantCallingMetrics.summary_metrics_file]) + + # Get the VCFs from either code path + Array[File?] output_vcf_files = if defined(FinalGatherVcf.output_vcf) then [FinalGatherVcf.output_vcf] else ApplyRecalibration.recalibrated_vcf + Array[File?] output_vcf_index_files = if defined(FinalGatherVcf.output_vcf_index) then [FinalGatherVcf.output_vcf_index] else ApplyRecalibration.recalibrated_vcf_index + + output { + # Metrics from either the small or large callset + File detail_metrics_file = output_detail_metrics_file + File summary_metrics_file = output_summary_metrics_file + + # Outputs from the small callset path through the wdl. + Array[File] output_vcfs = select_all(output_vcf_files) + Array[File] output_vcf_indices = select_all(output_vcf_index_files) + + # Output the interval list generated/used by this run workflow. + Array[File] output_intervals = SplitIntervalList.output_intervals + + # Output the metrics from crosschecking fingerprints. 
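# (Exactly one of the scattered / solo fingerprinting branches above ran, so the
# select_first below resolves to whichever optional output is actually defined.)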
+ File crosscheck_fingerprint_check = select_first([CrossCheckFingerprintSolo.crosscheck_metrics, GatherFingerprintingMetrics.gathered_metrics]) + } +} diff --git a/tasks/JointGenotypingTasks.wdl b/tasks/JointGenotypingTasks.wdl new file mode 100644 index 0000000..f847e5d --- /dev/null +++ b/tasks/JointGenotypingTasks.wdl @@ -0,0 +1,1069 @@ +version 1.0 + + +task CheckSamplesUnique { + input { + File sample_name_map + } + + command { + set -euo pipefail + if [[ $(cut -f 1 ~{sample_name_map} | wc -l) -ne $(cut -f 1 ~{sample_name_map} | sort | uniq | wc -l) ]] + then + echo "Samples in the sample_name_map are not unique" 1>&2 + exit 1 + elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt 50 ]] + then + echo "There are less than 50 samples in the sample_name_map" 1>&2 + echo "Having less than 50 samples means there likely isn't enough data to complete joint calling" 1>&2 + exit 1 + else + echo true + fi + } + + output { + Boolean samples_unique = read_boolean(stdout()) + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task SplitIntervalList { + + input { + File interval_list + Int scatter_count + File ref_fasta + File ref_fasta_index + File ref_dict + Boolean sample_names_unique_done + Int disk_size + String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + parameter_meta { + interval_list: { + localization_optional: true + } + } + + command <<< + gatk --java-options -Xms3g SplitIntervals \ + -L ~{interval_list} -O scatterDir -scatter ~{scatter_count} -R ~{ref_fasta} \ + -mode ~{scatter_mode} + >>> + + runtime { + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk " + disk_size + " HDD" + docker: gatk_docker + } + + output { + Array[File] output_intervals = glob("scatterDir/*") + } +} + +task ImportGVCFs { + + input { + File sample_name_map + File interval + File ref_fasta + File ref_fasta_index + File ref_dict + + String workspace_dir_name + + Int disk_size + Int batch_size + + # Using a nightly version of GATK containing fixes for GenomicsDB + # https://github.com/broadinstitute/gatk/pull/5899 + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk-nightly:2019-05-07-4.1.2.0-5-g53d015e4f-NIGHTLY-SNAPSHOT" + } + + command <<< + set -euo pipefail + + rm -rf ~{workspace_dir_name} + + # We've seen some GenomicsDB performance regressions related to intervals, so we're going to pretend we only have a single interval + # using the --merge-input-intervals arg + # There's no data in between since we didn't run HaplotypeCaller over those loci so we're not wasting any compute + + # The memory setting here is very important and must be several GiB lower + # than the total memory allocated to the VM because this tool uses + # a significant amount of non-heap memory for native libraries. + # Also, testing has shown that the multithreaded reader initialization + # does not scale well beyond 5 threads, so don't increase beyond that. 
+ gatk --java-options -Xms8g \ + GenomicsDBImport \ + --genomicsdb-workspace-path ~{workspace_dir_name} \ + --batch-size ~{batch_size} \ + -L ~{interval} \ + --sample-name-map ~{sample_name_map} \ + --reader-threads 5 \ + --merge-input-intervals \ + --consolidate + + tar -cf ~{workspace_dir_name}.tar ~{workspace_dir_name} + >>> + + runtime { + memory: "26 GiB" + cpu: 4 + disks: "local-disk " + disk_size + " HDD" + docker: gatk_docker + preemptible: 1 + } + + output { + File output_genomicsdb = "~{workspace_dir_name}.tar" + } +} + +task GenotypeGVCFs { + + input { + File workspace_tar + File interval + + String output_vcf_filename + + File ref_fasta + File ref_fasta_index + File ref_dict + + String dbsnp_vcf + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.4.0" + } + + parameter_meta { + interval: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + tar -xf ~{workspace_tar} + WORKSPACE=$(basename ~{workspace_tar} .tar) + + gatk --java-options -Xms8g \ + GenotypeGVCFs \ + -R ~{ref_fasta} \ + -O ~{output_vcf_filename} \ + -D ~{dbsnp_vcf} \ + -G StandardAnnotation -G AS_StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + --use-new-qual-calculator \ + -V gendb://$WORKSPACE \ + -L ~{interval} \ + --merge-input-intervals + >>> + + runtime { + memory: "26 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_filename}" + File output_vcf_index = "~{output_vcf_filename}.tbi" + } +} + +task GnarlyGenotyper { + + input { + File workspace_tar + File interval + String output_vcf_filename + File ref_fasta + File ref_fasta_index + File ref_dict + String dbsnp_vcf + + String gatk_docker = "us.gcr.io/broad-gotc-prod/gnarly_genotyper:fixNegativeRefCount" + } + + parameter_meta { + interval: { + localization_optional: true + } + } + + Int disk_size = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3) + + command <<< + set -e + + tar -xf ~{workspace_tar} + WORKSPACE=$( basename ~{workspace_tar} .tar) + + # use a query.json to set some params that aren't exposed -- ewwwww + cat <<EOF > $WORKSPACE/query.json + { + "scan_full": true, + "workspace": "genomicsdb", + "array": "genomicsdb_array", + "vid_mapping_file": "genomicsdb/vidmap.json", + "callset_mapping_file": "genomicsdb/callset.json", + "reference_genome": "/cromwell_root/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "max_diploid_alt_alleles_that_can_be_genotyped": 6, + "produce_GT_field": true + } + EOF + + gatk --java-options -Xms8g \ + GnarlyGenotyper \ + -R ~{ref_fasta} \ + -O ~{output_vcf_filename} \ + --output-database-name annotationDB.vcf.gz \ + -D ~{dbsnp_vcf} \ + --only-output-calls-starting-in-intervals \ + --use-new-qual-calculator \ + -V gendb://$WORKSPACE \ + -L ~{interval} \ + -stand-call-conf 10 \ + --merge-input-intervals + >>> + + runtime { + memory: "26 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_filename}" + File output_vcf_index = "~{output_vcf_filename}.tbi" + File output_database = "annotationDB.vcf.gz" + File output_database_index = "annotationDB.vcf.gz.tbi" + } +} + +task HardFilterAndMakeSitesOnlyVcf { + + input { + File vcf + File vcf_index + Float excess_het_threshold + + String variant_filtered_vcf_filename + String sites_only_vcf_filename + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } +
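# The command runs two steps: VariantFiltration only annotates the FILTER column
# (ExcessHet sites are marked, not removed), and MakeSitesOnlyVcf then drops the
# genotype columns so downstream VQSR reads a much smaller sites-only VCF.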
command <<< + set -euo pipefail + + gatk --java-options -Xms3g \ + VariantFiltration \ + --filter-expression "ExcessHet > ~{excess_het_threshold}" \ + --filter-name ExcessHet \ + -O ~{variant_filtered_vcf_filename} \ + -V ~{vcf} + + gatk --java-options -Xms3g \ + MakeSitesOnlyVcf \ + -I ~{variant_filtered_vcf_filename} \ + -O ~{sites_only_vcf_filename} + >>> + + runtime { + memory: "3.75 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File variant_filtered_vcf = "~{variant_filtered_vcf_filename}" + File variant_filtered_vcf_index = "~{variant_filtered_vcf_filename}.tbi" + File sites_only_vcf = "~{sites_only_vcf_filename}" + File sites_only_vcf_index = "~{sites_only_vcf_filename}.tbi" + } +} + +task IndelsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File mills_resource_vcf + File axiomPoly_resource_vcf + File dbsnp_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 4 + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms24g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode INDEL \ + --max-gaussians ~{max_gaussians} \ + -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \ + -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "26 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task SNPsVariantRecalibratorCreateModel { + + input { + String recalibration_filename + String tranches_filename + Int downsampleFactor + String model_report_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms100g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' 
recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + --sample-every-Nth-variant ~{downsampleFactor} \ + --output-model ~{model_report_filename} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "104 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File model_report = "~{model_report_filename}" + } +} + +task SNPsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + File? model_report + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + Int? machine_mem_gb + + } + + Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, + hapmap_resource_vcf, + omni_resource_vcf, + one_thousand_genomes_resource_vcf, + dbsnp_resource_vcf], + "GiB")) + Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) + Int java_mem = machine_mem - 1 + + + String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else "" + + command <<< + set -euo pipefail + + MODEL_REPORT=~{model_report} + + gatk --java-options -Xms~{java_mem}g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + ~{model_report_arg} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "~{machine_mem} GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task GatherTranches { + + input { + Array[File] tranches + String output_filename + Int disk_size + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" + } + + parameter_meta { + tranches: { + localization_optional: true + } + } + + command <<< + 
set -euo pipefail + + tranches_fofn=~{write_lines(tranches)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. + + # This is here to deal with the JES bug where commands may be run twice + rm -rf tranches + mkdir tranches + RETRY_LIMIT=5 + + count=0 + until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the tranches from the cloud' && exit 1 + fi + + cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list + + /usr/gitc/gatk --java-options -Xms6g \ + GatherTranches \ + --input inputs.list \ + --output ~{output_filename} + >>> + + runtime { + memory: "7.5 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File tranches = "~{output_filename}" + } +} + +task ApplyRecalibration { + + input { + String recalibrated_vcf_filename + File input_vcf + File input_vcf_index + File indels_recalibration + File indels_recalibration_index + File indels_tranches + File snps_recalibration + File snps_recalibration_index + File snps_tranches + Float indel_filter_level + Float snp_filter_level + Boolean use_allele_specific_annotations + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms5g \ + ApplyVQSR \ + -O tmp.indel.recalibrated.vcf \ + -V ~{input_vcf} \ + --recal-file ~{indels_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{indels_tranches} \ + --truth-sensitivity-filter-level ~{indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + gatk --java-options -Xms5g \ + ApplyVQSR \ + -O ~{recalibrated_vcf_filename} \ + -V tmp.indel.recalibrated.vcf \ + --recal-file ~{snps_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{snps_tranches} \ + --truth-sensitivity-filter-level ~{snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File recalibrated_vcf = "~{recalibrated_vcf_filename}" + File recalibrated_vcf_index = "~{recalibrated_vcf_filename}.tbi" + } +} + +task GatherVcfs { + + input { + Array[File] input_vcfs + String output_vcf_name + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + parameter_meta { + input_vcfs: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + # --ignore-safety-checks makes a big performance difference so we include it in our invocation. + # This argument disables expensive checks that the file headers contain the same set of + # genotyped samples and that files are in order by position of first record. 
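# (GatherVcfsCloud reads the inputs directly from GCS via NIO, which is why
# input_vcfs is marked localization_optional in parameter_meta above.)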
+ gatk --java-options -Xms6g \ + GatherVcfsCloud \ + --ignore-safety-checks \ + --gather-type BLOCK \ + --input ~{sep=" --input " input_vcfs} \ + --output ~{output_vcf_name} + + tabix ~{output_vcf_name} + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_name}" + File output_vcf_index = "~{output_vcf_name}.tbi" + } +} + +task SelectFingerprintSiteVariants { + + input { + File input_vcf + File haplotype_database + String base_output_name + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + parameter_meta { + input_vcf: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + function hdb_to_interval_list() { + input=$1 + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + gatk --java-options -Xms6g \ + SelectVariants \ + --variant ~{input_vcf} \ + --intervals hdb.interval_list \ + --output ~{base_output_name}.vcf.gz + >>> + + runtime { + memory: "7.5 GiB" + cpu: 1 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{base_output_name}.vcf.gz" + File output_vcf_index = "~{base_output_name}.vcf.gz.tbi" + } +} + +task CollectVariantCallingMetrics { + + input { + File input_vcf + File input_vcf_index + String metrics_filename_prefix + File dbsnp_vcf + File dbsnp_vcf_index + File interval_list + File ref_dict + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms6g \ + CollectVariantCallingMetrics \ + --INPUT ~{input_vcf} \ + --DBSNP ~{dbsnp_vcf} \ + --SEQUENCE_DICTIONARY ~{ref_dict} \ + --OUTPUT ~{metrics_filename_prefix} \ + --THREAD_COUNT 8 \ + --TARGET_INTERVALS ~{interval_list} + >>> + + output { + File detail_metrics_file = "~{metrics_filename_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{metrics_filename_prefix}.variant_calling_summary_metrics" + } + + runtime { + memory: "7.5 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } +} + +task GatherVariantCallingMetrics { + + input { + Array[File] input_details + Array[File] input_summaries + String output_prefix + Int disk_size + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" + } + + parameter_meta { + input_details: { + localization_optional: true + } + input_summaries: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + input_details_fofn=~{write_lines(input_details)} + input_summaries_fofn=~{write_lines(input_summaries)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
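# A note on the retry pattern used twice below: gsutil's -I reads the URI list from
# stdin, -c continues past individual failures, and -L cp.log writes a manifest so a
# retried run can skip files that already copied; the until/RETRY_LIMIT loop retries
# the whole batch up to 5 times.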
+ + # This is here to deal with the JES bug where commands may be run twice + rm -rf metrics + + mkdir metrics + RETRY_LIMIT=5 + + count=0 + until cat $input_details_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + count=0 + until cat $input_summaries_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + INPUT=$(cat $input_details_fofn | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("--INPUT metrics/%s ", $1)}') + + /usr/gitc/gatk --java-options -Xms2g \ + AccumulateVariantCallingMetrics \ + $INPUT \ + --OUTPUT ~{output_prefix} + >>> + + runtime { + memory: "3 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File detail_metrics_file = "~{output_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{output_prefix}.variant_calling_summary_metrics" + } +} + +task CrossCheckFingerprint { + + input { + Array[File] gvcf_paths + Array[File] vcf_paths + File sample_name_map + File haplotype_database + String output_base_name + Boolean scattered = false + Array[String] expected_inconclusive_samples = [] + String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" + } + + parameter_meta { + gvcf_paths: { + localization_optional: true + } + vcf_paths: { + localization_optional: true + } + } + + Int num_gvcfs = length(gvcf_paths) + Int cpu = if num_gvcfs < 32 then num_gvcfs else 32 + # Compute memory to use based on the CPU count, following the pattern of + # 3.75GiB / cpu used by GCP's pricing: https://cloud.google.com/compute/pricing + Int memMb = round(cpu * 3.75 * 1024) + Int disk = 100 + + String output_name = output_base_name + ".fingerprintcheck" + + command <<< + set -eu + + gvcfInputsList=~{write_lines(gvcf_paths)} + vcfInputsList=~{write_lines(vcf_paths)} + + cp $gvcfInputsList gvcf_inputs.list + cp $vcfInputsList vcf_inputs.list + + java -Dpicard.useLegacyParser=false -Xms~{memMb - 512}m \ + -jar /usr/gitc/PicardPublicWithCrosscheckNIOandSampleMapping.jar \ + CrosscheckFingerprints \ + --INPUT gvcf_inputs.list \ + --SECOND_INPUT vcf_inputs.list \ + --HAPLOTYPE_MAP ~{haplotype_database} \ + --INPUT_SAMPLE_FILE_MAP ~{sample_name_map} \ + --CROSSCHECK_BY SAMPLE \ + --CROSSCHECK_MODE CHECK_SAME_SAMPLE \ + --NUM_THREADS ~{cpu} \ + --SKIP_INPUT_READABLITY_TEST \ + ~{true='--EXIT_CODE_WHEN_MISMATCH 0' false='' scattered} \ + --OUTPUT ~{output_name} + + if ~{scattered}; then + # UNEXPECTED_MATCH is not possible with CHECK_SAME_SAMPLE + matches=$(grep "EXPECTED_MATCH" ~{output_name} | wc -l) + + # check inconclusive samples + expectedInconclusiveSamples=("~{sep='" "' expected_inconclusive_samples}") + inconclusiveSamplesCount=0 + inconclusiveSamples=($(grep 'INCONCLUSIVE' ~{output_name} | cut -f 1)) + for sample in ${inconclusiveSamples[@]}; do + if printf '%s\n' ${expectedInconclusiveSamples[@]} | grep -P '^'${sample}'$'; then + inconclusiveSamplesCount=$((inconclusiveSamplesCount+1)) + fi + done + + total_matches=$((inconclusiveSamplesCount + matches)) + if [[ ${total_matches} -eq 
~{num_gvcfs} ]]; then + >&2 echo "Found the correct number of matches (~{num_gvcfs}) for this shard" + else + >&2 echo "ERROR: Found $total_matches 'EXPECTED_MATCH' records, but expected ~{num_gvcfs}" + exit 1 + fi + fi + >>> + + runtime { + memory: memMb + " MiB" + disks: "local-disk " + disk + " HDD" + preemptible: 0 + docker: picard_docker + } + + output { + File crosscheck_metrics = output_name + } +} + +task GatherPicardMetrics { + + input { + Array[File] metrics_files + String output_file_name + Int disk_size + } + + command { + # Don't use this task to gather tens of thousands of files. + # Cromwell can't handle it. + + # This cannot gather metrics with histograms + + head -n 7 ~{metrics_files[0]} > ~{output_file_name} + + for metrics_file in ~{sep=' ' metrics_files}; do + sed -n '1,7d;p' $metrics_file | grep -v '^$' >> ~{output_file_name} + done + } + + output { + File gathered_metrics = "~{output_file_name}" + } + + runtime { + cpu: 1 + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk " + disk_size + " HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task GetFingerprintingIntervalIndices { + + input { + Array[File] unpadded_intervals + File haplotype_database + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -xeo pipefail + + function rename_intervals(){ + interval_list=$1 + name=$2 + + awk 'BEGIN{FS=IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {$5="'$name'"; print}' $interval_list + } + export -f rename_intervals + + function hdb_to_interval_list(){ + input=$1 + + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + function rename_scatter(){ + file=$1 + number=$(echo $file | sed -E 's|([0-9]+)-scattered\.interval.*|\1|') + rename_intervals $file $number > scattered.renamed.$number.interval_list + } + export -f rename_scatter + + # rename the intervals within each interval_list according to the number in the name of the list + + cp ~{sep=' ' unpadded_intervals} ./ + + cat ~{write_lines(unpadded_intervals)} | xargs -n1 basename | xargs -I{} bash -c 'rename_scatter $@' _ {} + + #find the first header + find . -name "scattered.renamed.*.interval_list" | head -n1 | xargs cat | grep '^@' > all.interval_list + + # concatenate the resulting intervals (with no header) + find . 
-name "scattered.renamed.*.interval_list" | xargs cat | grep -v '^@' >> all.interval_list + + # convert the Haplotype_database to an interval_list + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + # find the intervals that overlap the haplotype_database + gatk IntervalListTools \ + -ACTION OVERLAPS \ + -O all.sorted.interval_list \ + -I all.interval_list \ + -SI hdb.interval_list + + if grep -v '^@' all.sorted.interval_list; then + grep -v '^@' all.sorted.interval_list | awk '{FS="\t"; print $5}' | uniq > indices.out + else + touch indices.out + fi + >>> + + output { + Array[String] indices_to_fingerprint = read_lines("indices.out") + File all_sorted_interval_list = "all.sorted.interval_list" + File all_interval_list = "all.interval_list" + File hdb_interval_list = "hdb.interval_list" + } + + runtime { + cpu: 2 + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: gatk_docker + } +} + +task PartitionSampleNameMap { + + input { + File sample_name_map + Int line_limit + } + + command { + + cut -f 2 ~{sample_name_map} > sample_paths + split -l ~{line_limit} -d sample_paths partition_ + + # Let the OS catch up with creation of files for glob command + sleep 1 + } + + output { + Array[File] partitions = glob("partition_*") + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} From e189a70e4773227515a047916ac467fd94b36ec6 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 2 Dec 2019 11:42:26 -0500 Subject: [PATCH 13/24] Update genotype2develop (#44) * added updated joint-discovery wdl, needs testing * Added place holder for terra verion of JointGenotyping wdl * Replaced joint-discovery with JointGenotyping workflow, added place holders for terra version for JointGenotyping workflow * updated the terra version of the JointGenotyping * added urls for imports for JointGenotyping workflows * Updated ReadMe --- JointGenotyping-terra.wdl | 564 +++++++++ JointGenotyping.hg38.wgs.inputs.json | 41 + JointGenotyping.wdl | 18 +- README.md | 20 +- joint-discovery-gatk4-fc.wdl | 1026 --------------- ...discovery-gatk4-local.hg38.wgs.inputs.json | 52 - joint-discovery-gatk4-local.wdl | 947 -------------- joint-discovery-gatk4.hg38.wgs.inputs.json | 84 -- joint-discovery-gatk4.wdl | 1018 --------------- tasks/JointGenotypingTasks-terra.wdl | 1099 +++++++++++++++++ tasks/JointGenotypingTasks.wdl | 10 +- 11 files changed, 1732 insertions(+), 3147 deletions(-) create mode 100644 JointGenotyping-terra.wdl create mode 100644 JointGenotyping.hg38.wgs.inputs.json delete mode 100644 joint-discovery-gatk4-fc.wdl delete mode 100644 joint-discovery-gatk4-local.hg38.wgs.inputs.json delete mode 100644 joint-discovery-gatk4-local.wdl delete mode 100644 joint-discovery-gatk4.hg38.wgs.inputs.json delete mode 100644 joint-discovery-gatk4.wdl create mode 100644 tasks/JointGenotypingTasks-terra.wdl diff --git a/JointGenotyping-terra.wdl b/JointGenotyping-terra.wdl new file mode 100644 index 0000000..754b3ff --- /dev/null +++ b/JointGenotyping-terra.wdl @@ -0,0 +1,564 @@ +version 1.0 + +## Copyright Broad Institute, 2019 +## +## This WDL implements the joint discovery and VQSR filtering portion of the GATK +## Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) and exome sequencing data. +## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## - Bare minimum 1 WGS sample or 30 Exome samples. 
Gene panels are not supported. +## +## Outputs : +## - A VCF file and its index, filtered using variant quality score recalibration +## (VQSR) with genotypes for all samples present in the input VCF. All sites that +## are present in the input VCF are retained; filtered sites are annotated as such +## in the FILTER field. +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v47 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# WORKFLOW DEFINITION + +#import "./tasks/JointGenotypingTasks-terra.wdl" as Tasks + +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/updateGenotype2develop/tasks/JointGenotypingTasks-terra.wdl" as Tasks + +# Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) +workflow JointGenotyping { + + String pipeline_version = "1.2" + + input { + File unpadded_intervals_file + + String callset_name + File sample_name_map + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File haplotype_database + + File eval_interval_list + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + # Runtime attributes + String? gatk_docker_override + String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.4.0"]) + String? gatk_path_override + String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) + String? 
picard_docker_override + String picard_docker = select_first([picard_docker_override, "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map"]) + + Int? small_disk_override + Int small_disk = select_first([small_disk_override, "100"]) + Int? medium_disk_override + Int medium_disk = select_first([medium_disk_override, "200"]) + Int? large_disk_override + Int large_disk = select_first([large_disk_override, "300"]) + Int? huge_disk_override + Int huge_disk = select_first([huge_disk_override, "400"]) + + String? preemptible_tries_override + Int preemptible_tries = select_first([preemptible_tries_override, "3"]) + + # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + Float snp_filter_level + Float indel_filter_level + Int SNP_VQSR_downsampleFactor + + Int? top_level_scatter_count + Boolean? gather_vcfs + Int snps_variant_recalibration_threshold = 500000 + Boolean rename_gvcf_samples = true + Float unbounded_scatter_count_scale_factor = 0.15 + Int gnarly_scatter_count = 10 + Boolean use_gnarly_genotyper = false + Boolean use_allele_specific_annotations = true + Boolean cross_check_fingerprints = true + Boolean scatter_cross_check_fingerprints = false + } + + Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations + + Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) + Int num_gvcfs = length(sample_name_map_lines) + + # Make a 2.5:1 interval number to samples in callset ratio interval list. + # We allow overriding the behavior by specifying the desired number of vcfs + # to scatter over for testing / special requests. + # Zamboni notes say "WGS runs get 30x more scattering than Exome" and + # exome scatterCountPerSample is 0.05, min scatter 10, max 1000 + + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + # We allow overriding this default behavior for testing / special requests. + Boolean is_small_callset = select_first([gather_vcfs, num_gvcfs <= 1000]) + + Int unbounded_scatter_count = select_first([top_level_scatter_count, round(unbounded_scatter_count_scale_factor * num_gvcfs)]) + Int scatter_count = if unbounded_scatter_count > 2 then unbounded_scatter_count else 2 #I think weird things happen if scatterCount is 1 -- IntervalListTools is noop? + + call Tasks.CheckSamplesUnique { + input: + sample_name_map = sample_name_map + } + + call Tasks.SplitIntervalList { + input: + interval_list = unpadded_intervals_file, + scatter_count = scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique + } + + Array[File] unpadded_intervals = SplitIntervalList.output_intervals + + scatter (idx in range(length(unpadded_intervals))) { + # The batch_size value was carefully chosen here as it + # is the optimal value for the amount of memory allocated + # within the task; please do not change it without consulting + # the Hellbender (GATK engine) team! 
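# Worked examples of the knobs above (illustrative only): the bundled 50-sample
# inputs JSON overrides unbounded_scatter_count_scale_factor to 2.5, giving
# round(2.5 * 50) = 125 interval shards, while the 0.15 default would give
# round(0.15 * 50) = 8 (floored at 2). The excess_het_threshold arithmetic checks
# out the same way: Phi(-4.5) ~= 3.4e-06 and -10 * log10(3.4e-06) ~= 54.69.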
+ call Tasks.ImportGVCFs { + input: + sample_name_map = sample_name_map, + interval = unpadded_intervals[idx], + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + workspace_dir_name = "genomicsdb", + disk_size = medium_disk, + batch_size = 50, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + if (use_gnarly_genotyper) { + + call Tasks.SplitIntervalList as GnarlyIntervalScatterDude { + input: + interval_list = unpadded_intervals[idx], + scatter_count = gnarly_scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + Array[File] gnarly_intervals = GnarlyIntervalScatterDude.output_intervals + + scatter (gnarly_idx in range(length(gnarly_intervals))) { + call Tasks.GnarlyGenotyper { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = gnarly_intervals[gnarly_idx], + output_vcf_filename = callset_name + "." + idx + "." + gnarly_idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + } + } + + Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf + + call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { + input: + input_vcfs = gnarly_gvcfs, + output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", + disk_size = large_disk + } + } + + if (!use_gnarly_genotyper) { + call Tasks.GenotypeGVCFs { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = unpadded_intervals[idx], + output_vcf_filename = callset_name + "." + idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + File genotyped_vcf = select_first([TotallyRadicalGatherVcfs.output_vcf, GenotypeGVCFs.output_vcf]) + File genotyped_vcf_index = select_first([TotallyRadicalGatherVcfs.output_vcf_index, GenotypeGVCFs.output_vcf_index]) + + call Tasks.HardFilterAndMakeSitesOnlyVcf { + input: + vcf = genotyped_vcf, + vcf_index = genotyped_vcf_index, + excess_het_threshold = excess_het_threshold, + variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", + sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + call Tasks.GatherVcfs as SitesOnlyGatherVcf { + input: + input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, + output_vcf_name = callset_name + ".sites_only.vcf.gz", + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call Tasks.IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".indels.recal", + tranches_filename = callset_name + ".indels.tranches", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + if (num_gvcfs > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + downsampleFactor = SNP_VQSR_downsampleFactor, + model_report_filename = callset_name + ".snps.model.report", + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], + sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], + recalibration_filename = callset_name + ".snps." + idx + ".recal", + tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = callset_name + ".snps.gathered.tranches", + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + if (num_gvcfs <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { + #for really large callsets we give to friends, just apply filters to the sites-only + call Tasks.ApplyRecalibration { + input: + recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", + input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], + input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], + indels_recalibration = IndelsVariantRecalibrator.recalibration, + indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, + indels_tranches = IndelsVariantRecalibrator.tranches, + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + indel_filter_level = indel_filter_level, + snp_filter_level = snp_filter_level, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + # For large callsets we need to collect metrics from the shards and gather them later. + if (!is_small_callset) { + call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { + input: + input_vcf = ApplyRecalibration.recalibrated_vcf, + input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, + metrics_filename_prefix = callset_name + "." + idx, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + } + + # For small callsets we can gather the VCF shards and then collect metrics on it. + if (is_small_callset) { + call Tasks.GatherVcfs as FinalGatherVcf { + input: + input_vcfs = ApplyRecalibration.recalibrated_vcf, + output_vcf_name = callset_name + ".vcf.gz", + disk_size = huge_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call Tasks.CollectVariantCallingMetrics as CollectMetricsOnFullVcf { + input: + input_vcf = FinalGatherVcf.output_vcf, + input_vcf_index = FinalGatherVcf.output_vcf_index, + metrics_filename_prefix = callset_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = large_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + if (!is_small_callset) { + # For large callsets we still need to gather the sharded metrics. + call Tasks.GatherVariantCallingMetrics { + input: + input_details = select_all(CollectMetricsSharded.detail_metrics_file), + input_summaries = select_all(CollectMetricsSharded.summary_metrics_file), + output_prefix = callset_name, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + # CrossCheckFingerprints takes forever on large callsets. + # We scatter over the input GVCFs to make things faster. 
+ if (scatter_cross_check_fingerprints) { + call Tasks.GetFingerprintingIntervalIndices { + input: + unpadded_intervals = unpadded_intervals, + haplotype_database = haplotype_database + } + + Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint + + scatter (idx in fingerprinting_indices) { + File vcfs_to_fingerprint = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx] + } + + call Tasks.GatherVcfs as GatherFingerprintingVcfs { + input: + input_vcfs = vcfs_to_fingerprint, + output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", + disk_size = medium_disk + } + + call Tasks.SelectFingerprintSiteVariants { + input: + input_vcf = GatherFingerprintingVcfs.output_vcf, + base_output_name = callset_name + ".fingerprinting", + haplotype_database = haplotype_database, + disk_size = medium_disk + } + + call Tasks.PartitionSampleNameMap { + input: + sample_name_map = sample_name_map, + line_limit = 1000 + } + + scatter (idx in range(length(PartitionSampleNameMap.partitions))) { + + Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx]) + + call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered { + input: + gvcf_paths = files_in_partition, + vcf_paths = vcfs_to_fingerprint, + sample_name_map = sample_name_map, + haplotype_database = haplotype_database, + output_base_name = callset_name + "." + idx, + scattered = true, + picard_docker = picard_docker + } + } + + call Tasks.GatherPicardMetrics as GatherFingerprintingMetrics { + input: + metrics_files = CrossCheckFingerprintsScattered.crosscheck_metrics, + output_file_name = callset_name + ".fingerprintcheck", + disk_size = small_disk + } + } + + if (!scatter_cross_check_fingerprints) { + + scatter (line in sample_name_map_lines) { + File gvcf_paths = line[1] + } + + call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo { + input: + gvcf_paths = gvcf_paths, + vcf_paths = ApplyRecalibration.recalibrated_vcf, + sample_name_map = sample_name_map, + haplotype_database = haplotype_database, + output_base_name = callset_name, + picard_docker = picard_docker + } + } + + # Get the metrics from either code path + File output_detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherVariantCallingMetrics.detail_metrics_file]) + File output_summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherVariantCallingMetrics.summary_metrics_file]) + + # Get the VCFs from either code path + Array[File?] output_vcf_files = if defined(FinalGatherVcf.output_vcf) then [FinalGatherVcf.output_vcf] else ApplyRecalibration.recalibrated_vcf + Array[File?] output_vcf_index_files = if defined(FinalGatherVcf.output_vcf_index) then [FinalGatherVcf.output_vcf_index] else ApplyRecalibration.recalibrated_vcf_index + + output { + # Metrics from either the small or large callset + File detail_metrics_file = output_detail_metrics_file + File summary_metrics_file = output_summary_metrics_file + + # Outputs from the small callset path through the wdl. + Array[File] output_vcfs = select_all(output_vcf_files) + Array[File] output_vcf_indices = select_all(output_vcf_index_files) + + # Output the interval list generated/used by this run workflow. + Array[File] output_intervals = SplitIntervalList.output_intervals + + # Output the metrics from crosschecking fingerprints. 
+ File crosscheck_fingerprint_check = select_first([CrossCheckFingerprintSolo.crosscheck_metrics, GatherFingerprintingMetrics.gathered_metrics]) + } +} diff --git a/JointGenotyping.hg38.wgs.inputs.json b/JointGenotyping.hg38.wgs.inputs.json new file mode 100644 index 0000000..47b1632 --- /dev/null +++ b/JointGenotyping.hg38.wgs.inputs.json @@ -0,0 +1,41 @@ +{ + "JointGenotyping.sample_name_map": "gs://gatk-test-data/joint_discovery/1kg_50_hg38/gvcf/hg38_1kg_50.sample_map", + "JointGenotyping.callset_name": "hg38_1kg_50", + "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5, + "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION", + + "JointGenotyping.unpadded_intervals_file": "gs://broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", + "JointGenotyping.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "JointGenotyping.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "JointGenotyping.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "JointGenotyping.eval_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", + "JointGenotyping.haplotype_database": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", + + "JointGenotyping.axiomPoly_resource_vcf": "gs://broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", + "JointGenotyping.axiomPoly_resource_vcf_index": "gs://broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", + "JointGenotyping.dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "JointGenotyping.dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "JointGenotyping.hapmap_resource_vcf": "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", + "JointGenotyping.hapmap_resource_vcf_index": "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", + "JointGenotyping.mills_resource_vcf": "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "JointGenotyping.mills_resource_vcf_index": "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "JointGenotyping.omni_resource_vcf": "gs://broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", + "JointGenotyping.omni_resource_vcf_index": "gs://broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", + "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + "JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", + + "JointGenotyping.SNP_VQSR_downsampleFactor": 10, + "JointGenotyping.snps_variant_recalibration_threshold": 20000, + "JointGenotyping.snp_filter_level": 99.7, + "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], + "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], + + "JointGenotyping.indel_filter_level": 99.0, + "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], + "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + + 
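+  "##_COMMENT_DISK": "Disk sizes below are in GB. For reference only, the README sizing guideline worked for this 50-gvcf example callset gives: small_disk = (50 / 10) + 10 = 15, medium_disk = (50 * 15) + 10 = 760.",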
"JointGenotyping.small_disk": 100, + "JointGenotyping.medium_disk": 200, + "JointGenotyping.large_disk": 1000, + "JointGenotyping.huge_disk": 2000 +} diff --git a/JointGenotyping.wdl b/JointGenotyping.wdl index 58cf33c..499f48e 100644 --- a/JointGenotyping.wdl +++ b/JointGenotyping.wdl @@ -45,13 +45,14 @@ version 1.0 # WORKFLOW DEFINITION -import "./tasks/JointGenotypingTasks.wdl" as Tasks +#import "./tasks/JointGenotypingTasks.wdl" as Tasks +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/updateGenotype2develop/tasks/JointGenotypingTasks.wdl" as Tasks # Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) workflow JointGenotyping { - String pipeline_version = "1.1" + String pipeline_version = "1.2" input { File unpadded_intervals_file @@ -106,10 +107,13 @@ workflow JointGenotyping { Float unbounded_scatter_count_scale_factor = 0.15 Int gnarly_scatter_count = 10 Boolean use_gnarly_genotyper = false + Boolean use_allele_specific_annotations = true Boolean cross_check_fingerprints = true Boolean scatter_cross_check_fingerprints = false } + Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations + Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) Int num_gvcfs = length(sample_name_map_lines) @@ -249,7 +253,7 @@ workflow JointGenotyping { axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, dbsnp_resource_vcf = dbsnp_resource_vcf, dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = !use_gnarly_genotyper, + use_allele_specific_annotations = allele_specific_annotations, disk_size = small_disk } @@ -272,7 +276,7 @@ workflow JointGenotyping { one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, dbsnp_resource_vcf = dbsnp_resource_vcf, dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = !use_gnarly_genotyper, + use_allele_specific_annotations = allele_specific_annotations, disk_size = small_disk } @@ -294,7 +298,7 @@ workflow JointGenotyping { one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, dbsnp_resource_vcf = dbsnp_resource_vcf, dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = !use_gnarly_genotyper, + use_allele_specific_annotations = allele_specific_annotations, disk_size = small_disk } } @@ -324,7 +328,7 @@ workflow JointGenotyping { one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, dbsnp_resource_vcf = dbsnp_resource_vcf, dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = !use_gnarly_genotyper, + use_allele_specific_annotations = allele_specific_annotations, disk_size = small_disk } } @@ -344,7 +348,7 @@ workflow JointGenotyping { snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), indel_filter_level = indel_filter_level, snp_filter_level = snp_filter_level, - use_allele_specific_annotations = !use_gnarly_genotyper, + use_allele_specific_annotations = allele_specific_annotations, disk_size = medium_disk } diff --git a/README.md b/README.md index ee9821c..8b252c4 100644 --- a/README.md +++ b/README.md @@ -8,12 +8,12 @@ The haplotypecaller-gvcf-gatk4 workflow runs the GATK4 HaplotypeCaller tool in GVCF mode on a single sample according to GATK Best Practices. When executed the workflow scatters the HaplotypeCaller tool over the input bam sample using an interval list file. 
The output produced by the workflow will be a single GVCF -file which can then be provided to the joint-discovery workflow along with several other +file which can then be provided to the JointGenotyping workflow along with several other GVCF files to call for variants simultaneously, producing a multisample VCF. The haplotypecaller-gvcf-gatk4 workflows default GVCF mode is useful when calling variants for several samples efficiently. However, for instances when calling variants for one or a few samples it is possible to have the workflow directly call variants and output a VCF file by -setting the `make_gvcf` input variable to `true`. +setting the `make_gvcf` input variable to `false`. #### Requirements/expectations - One analysis-ready BAM file for a single sample (as identified in RG:SM) @@ -22,22 +22,22 @@ setting the `make_gvcf` input variable to `true`. #### Outputs - One GVCF file and its index -### joint-discovery-gatk : +### JointGenotyping.wdl : This WDL implements the joint calling and VQSR filtering portion of the GATK Best Practices for germline SNP and Indel discovery -in human whole-genome sequencing (WGS). +in human whole-genome sequencing (WGS). The workflow accepts a sample map +file with 50 or more GVCFs and produces a multisample VCF. *NOTE:* -*- joint-discovery-gatk4-local.wdl is a slightly modified version of the -original to support users interested in running the workflow locally.* -*- joint-discovery-gatk4-fc.wdl is a slightly modified version of the -original to support users interested in running the workflow firecloud with and -using an array of gvcfs as input.* +*- JointGenotyping-terra.wdl is a slightly modified version of the +original workflow to support users interested in running the +workflow on Terra. The changes include variables for dockers and disk size, making +it easier to configure the workflow.* #### Requirements/expectations - One or more GVCFs produced by HaplotypeCaller in GVCF mode -- Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. +- Bare minimum 50 samples. Gene panels are not supported. - When determining disk size in the JSON, use the guideline below - small_disk = (num_gvcfs / 10) + 10 - medium_disk = (num_gvcfs * 15) + 10 diff --git a/joint-discovery-gatk4-fc.wdl b/joint-discovery-gatk4-fc.wdl deleted file mode 100644 index 8ac6542..0000000 --- a/joint-discovery-gatk4-fc.wdl +++ /dev/null @@ -1,1026 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## - Note that the sample_names is what the sample will be called in the output, but not necessarily what the sample name is called in its GVCF. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series.
Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v31 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -workflow JointGenotyping { - # Input Sample - String callset_name - - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - - # Reference and Resources - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - File unpadded_intervals_file - # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - String? preemptible_tries_override - Int preemptible_tries = select_first([preemptible_tries_override, "3"]) - - # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) - Int num_gvcfs = length(input_gvcfs) - - # Make a 2.5:1 interval number to samples in callset ratio interval list - Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) - Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 - - call DynamicallyCombineIntervals { - input: - intervals = unpadded_intervals_file, - merge_count = merge_count, - preemptible_tries = preemptible_tries - } - - Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) - - scatter (idx in range(length(unpadded_intervals))) { - # the batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call ImportGVCFs { - input: - sample_names = sample_names, - input_gvcfs = input_gvcfs, - input_gvcfs_indices = input_gvcfs_indices, - interval = unpadded_intervals[idx], - workspace_dir_name = "genomicsdb", - disk_size = medium_disk, - batch_size = 50, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = "output.vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call HardFilterAndMakeSitesOnlyVcf { - input: - vcf = GenotypeGVCFs.output_vcf, - vcf_index = GenotypeGVCFs.output_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs_fofn = write_lines(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf), - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (num_gvcfs > 10000) { - call SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - call GatherTranches as SNPGatherTranches { - input: - input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches), - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - - if (num_gvcfs <= 10000){ - call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - Boolean is_small_callset = num_gvcfs <= 1000 - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - call ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - # for large callsets we need to collect metrics from the shards and gather them later - if (!is_small_callset) { - call CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - } - - # for small callsets we can gather the VCF shards and then collect metrics on it - if (is_small_callset) { - call GatherVcfs as FinalGatherVcf { - input: - input_vcfs_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # for large callsets we still need to gather the sharded metrics - if (!is_small_callset) { - call GatherMetrics { - input: - input_details_fofn = write_lines(select_all(CollectMetricsSharded.detail_metrics_file)), - input_summaries_fofn = write_lines(select_all(CollectMetricsSharded.summary_metrics_file)), - output_prefix = callset_name, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - output { - # outputs from the small callset path through the wdl - File? output_vcf = FinalGatherVcf.output_vcf - File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - - # select metrics from the small callset path and the large callset path - File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) - File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) - - # output the interval list generated/used by this run workflow - File output_intervals = DynamicallyCombineIntervals.output_intervals - } -} - -task GetNumberOfSamples { - File sample_name_map - String docker - Int preemptible_tries - command <<< - wc -l ${sample_name_map} | awk '{print $1}' - >>> - runtime { - docker: docker - memory: "1 GB" - preemptible: preemptible_tries - } - output { - Int sample_count = read_int(stdout()) - } -} - -task ImportGVCFs { - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - String interval - - String workspace_dir_name - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - Int batch_size - - command <<< - set -e - set -o pipefail - - python << CODE - gvcfs = ['${sep="','" input_gvcfs}'] - sample_names = ['${sep="','" sample_names}'] - - if len(gvcfs)!= len(sample_names): - exit(1) - - with open("inputs.list", "w") as fi: - for i in range(len(gvcfs)): - fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") - - CODE - - rm -rf ${workspace_dir_name} - - # The memory setting here is very important and must be several GB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. 
- ${gatk_path} --java-options "-Xmx4g -Xms4g" \ - GenomicsDBImport \ - --genomicsdb-workspace-path ${workspace_dir_name} \ - --batch-size ${batch_size} \ - -L ${interval} \ - --sample-name-map inputs.list \ - --reader-threads 5 \ - -ip 500 - - tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} - - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_genomicsdb = "${workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - File workspace_tar - String interval - - String output_vcf_filename - - String gatk_path - - File ref_fasta - File ref_fasta_index - File ref_dict - - String dbsnp_vcf - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - tar -xf ${workspace_tar} - WORKSPACE=$( basename ${workspace_tar} .tar) - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - GenotypeGVCFs \ - -R ${ref_fasta} \ - -O ${output_vcf_filename} \ - -D ${dbsnp_vcf} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ${interval} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_filename}" - File output_vcf_index = "${output_vcf_filename}.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantFiltration \ - --filter-expression "ExcessHet > ${excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ${variant_filtered_vcf_filename} \ - -V ${vcf} - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - MakeSitesOnlyVcf \ - --INPUT ${variant_filtered_vcf_filename} \ - --OUTPUT ${sites_only_vcf_filename} - - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File variant_filtered_vcf = "${variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "${sites_only_vcf_filename}" - File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx24g -Xms24g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode INDEL \ - --max-gaussians 4 \ - --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ - 
--resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "26 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx100g -Xms100g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - --sample-every-Nth-variant ${downsampleFactor} \ - --output-model ${model_report_filename} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "104 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File model_report = "${model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int? 
machine_mem_gb - Int auto_mem = ceil(2*size(sites_only_variant_filtered_vcf, "GB" )) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: machine_mem + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task GatherTranches { - File input_fofn - String output_filename - - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf tranches - - mkdir tranches - RETRY_LIMIT=5 - - count=0 - until cat ${input_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the tranches from the cloud' && exit 1 - fi - - cat ${input_fofn} | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherTranches \ - --input inputs.list \ - --output ${output_filename} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File tranches = "${output_filename}" - } -} - -task ApplyRecalibration { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - - Float indel_filter_level - Float snp_filter_level - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ${input_vcf} \ - --recal-file ${indels_recalibration} \ - --tranches-file ${indels_tranches} \ - --truth-sensitivity-filter-level ${indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O ${recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ${snps_recalibration} \ - --tranches-file ${snps_tranches} \ - --truth-sensitivity-filter-level ${snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - } - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 
preemptible_tries - } - output { - File recalibrated_vcf = "${recalibrated_vcf_filename}" - File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - File input_vcfs_fofn - String output_vcf_name - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - # Now using NIO to localize the vcfs but the input file must have a ".list" extension - mv ${input_vcfs_fofn} inputs.list - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation. - # This argument disables expensive checks that the file headers contain the same set of - # genotyped samples and that files are in order by position of first record. - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input inputs.list \ - --output ${output_vcf_name} - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - IndexFeatureFile \ - --feature-file ${output_vcf_name} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task CollectVariantCallingMetrics { - File input_vcf - File input_vcf_index - - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - CollectVariantCallingMetrics \ - --INPUT ${input_vcf} \ - --DBSNP ${dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ${ref_dict} \ - --OUTPUT ${metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ${interval_list} - } - output { - File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" - } - runtime { - docker: docker - memory: "7 GB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } -} - -task GatherMetrics { - File input_details_fofn - File input_summaries_fofn - - String output_prefix - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf metrics - - mkdir metrics - RETRY_LIMIT=5 - - count=0 - until cat ${input_details_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - count=0 - until cat ${input_summaries_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - INPUT=`cat ${input_details_fofn} | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("-I=metrics/%s ", $1)}'` - - ${gatk_path} --java-options "-Xmx2g -Xms2g" \ - AccumulateVariantCallingMetrics \ - $INPUT \ - -O ${output_prefix} - >>> - runtime { - docker: docker - memory: "3 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File detail_metrics_file = 
"${output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" - } -} - -task DynamicallyCombineIntervals { - File intervals - Int merge_count - Int preemptible_tries - - command { - python << CODE - def parse_interval(interval): - colon_split = interval.split(":") - chromosome = colon_split[0] - dash_split = colon_split[1].split("-") - start = int(dash_split[0]) - end = int(dash_split[1]) - return chromosome, start, end - - def add_interval(chr, start, end): - lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) - return chr, start, end - - count = 0 - chain_count = ${merge_count} - l_chr, l_start, l_end = "", 0, 0 - lines_to_write = [] - with open("${intervals}") as f: - with open("out.intervals", "w") as f1: - for line in f.readlines(): - # initialization - if count == 0: - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - # reached number to combine, so spit out and start over - if count == chain_count: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - - c_chr, c_start, c_end = parse_interval(line) - # if adjacent keep the chain going - if c_chr == w_chr and c_start == w_end + 1: - w_end = c_end - count += 1 - continue - # not adjacent, end here and start a new chain - else: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - if l_char != w_chr or l_start != w_start or l_end != w_end: - add_interval(w_chr, w_start, w_end) - f1.writelines("\n".join(lines_to_write)) - CODE - } - - runtime { - memory: "3 GB" - preemptible: preemptible_tries - docker: "python:2.7" - } - - output { - File output_intervals = "out.intervals" - } -} - diff --git a/joint-discovery-gatk4-local.hg38.wgs.inputs.json b/joint-discovery-gatk4-local.hg38.wgs.inputs.json deleted file mode 100644 index 2742e5d..0000000 --- a/joint-discovery-gatk4-local.hg38.wgs.inputs.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "##_COMMENT1": "INPUT GVCFs & COHORT -- DATASET-SPECIFC, MUST BE ADAPTED", - "JointGenotyping.sample_names": ["NA12878"], - "JointGenotyping.input_gvcfs": ["/home/bshifaw/data/joint_discovery/NA12878.g.vcf.gz"], - "JointGenotyping.input_gvcfs_indices": ["/home/bshifaw/data/joint_discovery/NA12878.g.vcf.gz.tbi"], - "JointGenotyping.callset_name": "NA12878", - - "##_COMMENT2": "REFERENCE FILES", - "JointGenotyping.ref_fasta": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "JointGenotyping.ref_fasta_index": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "JointGenotyping.ref_dict": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - - "##_COMMENT3": "INTERVALS", - "JointGenotyping.eval_interval_list": "/home/bshifaw/broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", - "JointGenotyping.unpadded_intervals_file": "/home/bshifaw/broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", - - "##_COMMENT4": "RESOURCE FILES", - "JointGenotyping.dbsnp_vcf": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", - "JointGenotyping.dbsnp_vcf_index": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", - "JointGenotyping.one_thousand_genomes_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - "JointGenotyping.one_thousand_genomes_resource_vcf_index": 
"/home/bshifaw/broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", - "JointGenotyping.omni_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", - "JointGenotyping.omni_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", - "JointGenotyping.mills_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", - "JointGenotyping.mills_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", - "JointGenotyping.axiomPoly_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", - "JointGenotyping.axiomPoly_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", - "JointGenotyping.hapmap_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", - "JointGenotyping.hapmap_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", - - "##_COMMENT5": "VQSR PARAMETERS", - "JointGenotyping.SNP_VQSR_downsampleFactor": 10, - "JointGenotyping.snp_filter_level": 99.7, - "JointGenotyping.indel_filter_level": 99.7, - "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], - "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], - "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], - - "##_COMMENT4": "DOCKERS", - "#JointGenotyping.gatk_docker_override": "String? (optional)", - - "##_COMMENT5": "PATHS", - "#JointGenotyping.gatk_path_override": "String? (optional)", - - "##_COMMENT8": "DISK SIZE ALLOCATION", - "#JointGenotyping.small_disk_override": "Int? (optional)", - "#JointGenotyping.medium_disk_override": "Int? (optional)", - "#JointGenotyping.large_disk_override": "Int? (optional)", - "#JointGenotyping.huge_disk_override": "Int? (optional)" - -} diff --git a/joint-discovery-gatk4-local.wdl b/joint-discovery-gatk4-local.wdl deleted file mode 100644 index 771debd..0000000 --- a/joint-discovery-gatk4-local.wdl +++ /dev/null @@ -1,947 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. 
Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v31 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -workflow JointGenotyping { - # Input Sample - String callset_name - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - - # Reference and Resources - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - File unpadded_intervals_file - - # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) - Int num_gvcfs = length(input_gvcfs) - - # Make a 2.5:1 interval number to samples in callset ratio interval list - Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) - Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 - - call DynamicallyCombineIntervals { - input: - intervals = unpadded_intervals_file, - merge_count = merge_count - } - - Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) - - scatter (idx in range(length(unpadded_intervals))) { - # the batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call ImportGVCFs { - input: - sample_names = sample_names, - interval = unpadded_intervals[idx], - workspace_dir_name = "genomicsdb", - input_gvcfs = input_gvcfs, - input_gvcfs_indices = input_gvcfs_indices, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - batch_size = 50 - } - - call GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = "output.vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - call HardFilterAndMakeSitesOnlyVcf { - input: - vcf = GenotypeGVCFs.output_vcf, - vcf_index = GenotypeGVCFs.output_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - call GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, - input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index, - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - call IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - if (num_gvcfs > 10000) { - call SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - call GatherTranches as SNPGatherTranches { - input: - input_fofn = SNPsVariantRecalibratorScattered.tranches, - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - if (num_gvcfs <= 10000){ - call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - Boolean is_small_callset = num_gvcfs <= 1000 - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - call ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - # for large callsets we need to collect metrics from the shards and gather them later - if (!is_small_callset) { - call CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - } - - # for small callsets we can gather the VCF shards and then collect metrics on it - if (is_small_callset) { - call GatherVcfs as FinalGatherVcf { - input: - input_vcfs_fofn = ApplyRecalibration.recalibrated_vcf, - input_vcf_indexes_fofn = ApplyRecalibration.recalibrated_vcf_index, - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - # for large callsets we still need to gather the sharded metrics - if (!is_small_callset) { - call GatherMetrics { - input: - input_details_fofn = select_all(CollectMetricsSharded.detail_metrics_file), - input_summaries_fofn = select_all(CollectMetricsSharded.summary_metrics_file), - output_prefix = callset_name, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - output { - # outputs from the small callset path through the wdl - File? output_vcf = FinalGatherVcf.output_vcf - File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - - # select metrics from the small callset path and the large callset path - File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) - File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) - - # output the interval list generated/used by this run workflow - File output_intervals = DynamicallyCombineIntervals.output_intervals - } -} - -task GetNumberOfSamples { - File sample_name_map - String docker - command <<< - wc -l ${sample_name_map} | awk '{print $1}' - >>> - runtime { - docker: docker - memory: "1 GB" - preemptible: 5 - } - output { - Int sample_count = read_int(stdout()) - } -} - -task ImportGVCFs { - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - String interval - - String workspace_dir_name - - String gatk_path - String docker - Int disk_size - Int batch_size - - command <<< - set -e - set -o pipefail - - python << CODE - gvcfs = ['${sep="','" input_gvcfs}'] - sample_names = ['${sep="','" sample_names}'] - - if len(gvcfs)!= len(sample_names): - exit(1) - - with open("inputs.list", "w") as fi: - for i in range(len(gvcfs)): - fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") - - CODE - - # The memory setting here is very important and must be several GB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. - ${gatk_path} --java-options "-Xmx4g -Xms4g" \ - GenomicsDBImport \ - --genomicsdb-workspace-path ${workspace_dir_name} \ - --batch-size ${batch_size} \ - -L ${interval} \ - --sample-name-map inputs.list \ - --reader-threads 5 \ - -ip 500 - - tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} - - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File output_genomicsdb = "${workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - File workspace_tar - String interval - - String output_vcf_filename - - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - String gatk_path - String docker - Int disk_size - - command <<< - set -e - - tar -xf ${workspace_tar} - WORKSPACE=$( basename ${workspace_tar} .tar) - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - GenotypeGVCFs \ - -R ${ref_fasta} \ - -O ${output_vcf_filename} \ - -D ${dbsnp_vcf} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ${interval} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File output_vcf = "${output_vcf_filename}" - File output_vcf_index = "${output_vcf_filename}.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - String gatk_path - - String docker - Int disk_size - - command { - set -e - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantFiltration \ - --filter-expression "ExcessHet > ${excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ${variant_filtered_vcf_filename} \ - -V ${vcf} - - 
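      # The excess_het_threshold value of 54.69 used in the filter expression above is
      # the phred-scaled form of p = 3.4e-06 (a z-score of -4.5): phred = -10 * log10(p),
      # and -10 * log10(3.4e-06) ~= 54.69. A quick sanity check, assuming any python on
      # the PATH:
      #   python -c 'import math; print(-10 * math.log10(3.4e-06))'   # ~54.685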
${gatk_path} --java-options "-Xmx3g -Xms3g" \ - MakeSitesOnlyVcf \ - --INPUT ${variant_filtered_vcf_filename} \ - --OUTPUT ${sites_only_vcf_filename} - - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File variant_filtered_vcf = "${variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "${sites_only_vcf_filename}" - File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx24g -Xms24g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode INDEL \ - --max-gaussians 4 \ - --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ - --resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "26 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx100g -Xms100g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - --sample-every-Nth-variant ${downsampleFactor} \ - --output-model ${model_report_filename} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - 
docker: docker - memory: "104 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File model_report = "${model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task GatherTranches { - Array[File] input_fofn - String output_filename - - String gatk_path - - String docker - Int disk_size - - command <<< - set -e - set -o pipefail - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherTranches \ - --input ${sep=" --input " input_fofn} \ - --output ${output_filename} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File tranches = "${output_filename}" - } -} - -task ApplyRecalibration { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - - Float indel_filter_level - Float snp_filter_level - - String gatk_path - String docker - Int disk_size - - command { - set -e - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ${input_vcf} \ - --recal-file ${indels_recalibration} \ - --tranches-file ${indels_tranches} \ - --truth-sensitivity-filter-level ${indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O ${recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ${snps_recalibration} \ - --tranches-file ${snps_tranches} \ - --truth-sensitivity-filter-level ${snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - } - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - 
preemptible: 5 - } - output { - File recalibrated_vcf = "${recalibrated_vcf_filename}" - File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - Array[File] input_vcfs_fofn - Array[File] input_vcf_indexes_fofn - String output_vcf_name - - String gatk_path - String docker - Int disk_size - - command <<< - set -e - set -o pipefail - - # ignoreSafetyChecks make a big performance difference so we include it in our invocation - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input ${sep=" --input " input_vcfs_fofn} \ - --output ${output_vcf_name} - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - IndexFeatureFile \ - --feature-file ${output_vcf_name} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task CollectVariantCallingMetrics { - File input_vcf - File input_vcf_index - - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - CollectVariantCallingMetrics \ - --INPUT ${input_vcf} \ - --DBSNP ${dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ${ref_dict} \ - --OUTPUT ${metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ${interval_list} - } - output { - File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" - } - runtime { - docker: docker - memory: "7 GB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } -} - -task GatherMetrics { - Array[File] input_details_fofn - Array[File] input_summaries_fofn - - String output_prefix - - String gatk_path - String docker - Int disk_size - - command <<< - set -e - set -o pipefail - - - ${gatk_path} --java-options "-Xmx2g -Xms2g" \ - AccumulateVariantCallingMetrics \ - --INPUT ${sep=" --INPUT " input_details_fofn} \ - --OUTPUT ${output_prefix} - >>> - runtime { - docker: docker - memory: "3 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" - } -} - -task DynamicallyCombineIntervals { - File intervals - Int merge_count - - command { - python << CODE - def parse_interval(interval): - colon_split = interval.split(":") - chromosome = colon_split[0] - dash_split = colon_split[1].split("-") - start = int(dash_split[0]) - end = int(dash_split[1]) - return chromosome, start, end - - def add_interval(chr, start, end): - lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) - return chr, start, end - - count = 0 - chain_count = ${merge_count} - l_chr, l_start, l_end = "", 0, 0 - lines_to_write = [] - with open("${intervals}") as f: - with open("out.intervals", "w") as f1: - for line in f.readlines(): - # initialization - if count == 0: - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - # reached number to combine, so spit out and start over - if count == chain_count: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - - c_chr, c_start, c_end 
= parse_interval(line) - # if adjacent keep the chain going - if c_chr == w_chr and c_start == w_end + 1: - w_end = c_end - count += 1 - continue - # not adjacent, end here and start a new chain - else: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - if l_char != w_chr or l_start != w_start or l_end != w_end: - add_interval(w_chr, w_start, w_end) - f1.writelines("\n".join(lines_to_write)) - CODE - } - - runtime { - memory: "3 GB" - preemptible: 5 - docker: "python:2.7" - } - - output { - File output_intervals = "out.intervals" - } -} diff --git a/joint-discovery-gatk4.hg38.wgs.inputs.json b/joint-discovery-gatk4.hg38.wgs.inputs.json deleted file mode 100644 index 4bd2d8b..0000000 --- a/joint-discovery-gatk4.hg38.wgs.inputs.json +++ /dev/null @@ -1,84 +0,0 @@ -{ - "##_COMMENT1": "INPUT GVCFs & COHORT -- DATASET-SPECIFC, MUST BE ADAPTED", - "JointGenotyping.callset_name": "NA12878", - "JointGenotyping.sample_name_map": "gs://gatk-test-data/joint_discovery/NA12878.sample_map", - - "##_COMMENT2": "REFERENCE FILES", - "JointGenotyping.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "JointGenotyping.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "JointGenotyping.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - - "##_COMMENT3": "INTERVALS", - "JointGenotyping.eval_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", - "JointGenotyping.unpadded_intervals_file": "gs://gatk-test-data/intervals/hg38.even.handcurated.20k.intervals", - - "##_COMMENT4": "RESOURCE FILES", - "JointGenotyping.dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", - "JointGenotyping.dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", - "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - "JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", - "JointGenotyping.omni_resource_vcf": "gs://broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", - "JointGenotyping.omni_resource_vcf_index": "gs://broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", - "JointGenotyping.mills_resource_vcf": "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", - "JointGenotyping.mills_resource_vcf_index": "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", - "JointGenotyping.axiomPoly_resource_vcf": "gs://broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", - "JointGenotyping.axiomPoly_resource_vcf_index": "gs://broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", - "JointGenotyping.hapmap_resource_vcf": "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", - "JointGenotyping.hapmap_resource_vcf_index": "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", - - "##_COMMENT5": "VQSR PARAMETERS", - "JointGenotyping.SNP_VQSR_downsampleFactor": 10, - "JointGenotyping.snp_filter_level": 99.7, - "JointGenotyping.indel_filter_level": 99.7, - "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], - "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", 
"94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], - "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], - - "##_COMMENT4": "DOCKERS", - "#JointGenotyping.gatk_docker_override": "String? (optional)", - - "##_COMMENT5": "PATHS", - "#JointGenotyping.gatk_path_override": "String? (optional)", - - "##_COMMENT6": "JAVA OPTIONS", - "JointGenotyping.SNPsVariantRecalibratorScattered.java_opt": "-Xmx3g -Xms3g", - "JointGenotyping.CollectMetricsOnFullVcf.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.IndelsVariantRecalibrator.java_opt": "-Xmx24g -Xms24g", - "JointGenotyping.HardFilterAndMakeSitesOnlyVcf.java_opt": "-Xmx3g -Xms3g", - "JointGenotyping.SNPGatherTranches.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.CollectMetricsSharded.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.SitesOnlyGatherVcf.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.ApplyRecalibration.java_opt": "-Xmx5g -Xms5g", - "JointGenotyping.FinalGatherVcf.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.ImportGVCFs.java_opt": "-Xmx4g -Xms4g", - "JointGenotyping.SNPsVariantRecalibratorCreateModel.java_opt": "-Xmx100g -Xms100g", - "JointGenotyping.GatherMetrics.java_opt": "-Xmx2g -Xms2g", - "JointGenotyping.GenotypeGVCFs.java_opt": "-Xmx5g -Xms5g", - - "##_COMMENT7": "MEMORY ALLOCATION", - "JointGenotyping.CollectMetricsSharded.mem_size": "7 GB", - "JointGenotyping.ImportGVCFs.mem_size": "7 GB", - "JointGenotyping.IndelsVariantRecalibrator.mem_size": "26 GB", - "JointGenotyping.ApplyRecalibration.mem_size": "7 GB", - "JointGenotyping.CollectMetricsOnFullVcf.mem_size": "7 GB", - "JointGenotyping.GenotypeGVCFs.mem_size": "7 GB", - "JointGenotyping.FinalGatherVcf.mem_size": "7 GB", - "JointGenotyping.SitesOnlyGatherVcf.mem_size": "7 GB", - "JointGenotyping.SNPsVariantRecalibratorScattered.mem_size": "3.5 GB", - "JointGenotyping.SNPsVariantRecalibratorCreateModel.mem_size": "104 GB", - "JointGenotyping.DynamicallyCombineIntervals.mem_size": "3 GB", - "JointGenotyping.GatherMetrics.mem_size": "3 GB", - "JointGenotyping.HardFilterAndMakeSitesOnlyVcf.mem_size": "3.5 GB", - "JointGenotyping.SNPGatherTranches.mem_size": "7 GB", - - "##_COMMENT8": "DISK SIZE ALLOCATION", - "#JointGenotyping.small_disk_override": "Int? (optional)", - "#JointGenotyping.medium_disk_override": "Int? (optional)", - "#JointGenotyping.large_disk_override": "Int? (optional)", - "#JointGenotyping.huge_disk_override": "Int? (optional)", - - "##_COMMENT9": "PREEMPTIBLES", - "#JointGenotyping.preemptible_tries_override": "Int? (optional)" -} - diff --git a/joint-discovery-gatk4.wdl b/joint-discovery-gatk4.wdl deleted file mode 100644 index 2b2df1d..0000000 --- a/joint-discovery-gatk4.wdl +++ /dev/null @@ -1,1018 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. 
All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v31 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -workflow JointGenotyping { - # Input Sample - String callset_name - File sample_name_map - - # Reference and Resources - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - File unpadded_intervals_file - - # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - String? preemptible_tries_override - Int preemptible_tries = select_first([preemptible_tries_override, "3"]) - - # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) - Int num_gvcfs = length(read_lines(sample_name_map)) - - # Make a 2.5:1 interval number to samples in callset ratio interval list - Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) - Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 - - call DynamicallyCombineIntervals { - input: - intervals = unpadded_intervals_file, - merge_count = merge_count, - preemptible_tries = preemptible_tries - } - - Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) - - scatter (idx in range(length(unpadded_intervals))) { - # the batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call ImportGVCFs { - input: - sample_name_map = sample_name_map, - interval = unpadded_intervals[idx], - workspace_dir_name = "genomicsdb", - disk_size = medium_disk, - batch_size = 50, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = "output.vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call HardFilterAndMakeSitesOnlyVcf { - input: - vcf = GenotypeGVCFs.output_vcf, - vcf_index = GenotypeGVCFs.output_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs_fofn = write_lines(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf), - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (num_gvcfs > 10000) { - call SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - call GatherTranches as SNPGatherTranches { - input: - input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches), - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - - if (num_gvcfs <= 10000){ - call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - Boolean is_small_callset = num_gvcfs <= 1000 - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - call ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - # for large callsets we need to collect metrics from the shards and gather them later - if (!is_small_callset) { - call CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - } - - # for small callsets we can gather the VCF shards and then collect metrics on it - if (is_small_callset) { - call GatherVcfs as FinalGatherVcf { - input: - input_vcfs_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # for large callsets we still need to gather the sharded metrics - if (!is_small_callset) { - call GatherMetrics { - input: - input_details_fofn = write_lines(select_all(CollectMetricsSharded.detail_metrics_file)), - input_summaries_fofn = write_lines(select_all(CollectMetricsSharded.summary_metrics_file)), - output_prefix = callset_name, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - output { - # outputs from the small callset path through the wdl - File? output_vcf = FinalGatherVcf.output_vcf - File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - - # select metrics from the small callset path and the large callset path - File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) - File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) - - # output the interval list generated/used by this run workflow - File output_intervals = DynamicallyCombineIntervals.output_intervals - } -} - -task GetNumberOfSamples { - File sample_name_map - String docker - Int preemptible_tries - command <<< - wc -l ${sample_name_map} | awk '{print $1}' - >>> - runtime { - docker: docker - memory: "1 GB" - preemptible: preemptible_tries - } - output { - Int sample_count = read_int(stdout()) - } -} - -task ImportGVCFs { - File sample_name_map - String interval - - String workspace_dir_name - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - Int batch_size - - command <<< - set -e - - rm -rf ${workspace_dir_name} - - # The memory setting here is very important and must be several GB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. - ${gatk_path} --java-options "-Xmx4g -Xms4g" \ - GenomicsDBImport \ - --genomicsdb-workspace-path ${workspace_dir_name} \ - --batch-size ${batch_size} \ - -L ${interval} \ - --sample-name-map ${sample_name_map} \ - --reader-threads 5 \ - -ip 500 - - tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} - - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_genomicsdb = "${workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - File workspace_tar - String interval - - String output_vcf_filename - - String gatk_path - - File ref_fasta - File ref_fasta_index - File ref_dict - - String dbsnp_vcf - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - tar -xf ${workspace_tar} - WORKSPACE=$( basename ${workspace_tar} .tar) - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - GenotypeGVCFs \ - -R ${ref_fasta} \ - -O ${output_vcf_filename} \ - -D ${dbsnp_vcf} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ${interval} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_filename}" - File output_vcf_index = "${output_vcf_filename}.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantFiltration \ - --filter-expression "ExcessHet > ${excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ${variant_filtered_vcf_filename} \ - -V ${vcf} - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - MakeSitesOnlyVcf \ - --INPUT ${variant_filtered_vcf_filename} \ - --OUTPUT ${sites_only_vcf_filename} - - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "1" - 
disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File variant_filtered_vcf = "${variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "${sites_only_vcf_filename}" - File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - Int? machine_mem_gb - Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, - mills_resource_vcf, - axiomPoly_resource_vcf, - dbsnp_resource_vcf], - "GiB")) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int java_mem = machine_mem-2 - - command { - ${gatk_path} --java-options "-Xmx${java_mem}g -Xms${java_mem}g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode INDEL \ - --max-gaussians 4 \ - --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ - --resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: machine_mem_gb + " GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx100g -Xms100g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - --sample-every-Nth-variant ${downsampleFactor} \ - --output-model ${model_report_filename} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - 
--resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "104 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File model_report = "${model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int? machine_mem_gb - Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, - hapmap_resource_vcf, - omni_resource_vcf, - one_thousand_genomes_resource_vcf, - dbsnp_resource_vcf], - "GiB")) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int java_mem = machine_mem-2 - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx${java_mem}g -Xms${java_mem}g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: machine_mem + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task GatherTranches { - File input_fofn - String output_filename - - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf tranches - - mkdir tranches - RETRY_LIMIT=5 - - count=0 - until cat ${input_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the tranches from the cloud' && exit 1 - fi - - cat ${input_fofn} | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherTranches \ - --input inputs.list \ - --output ${output_filename} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File tranches = "${output_filename}" - } -} - 
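# Note on ordering: ApplyRecalibration below implements the VQSR wiring described
# in the header comment. The INDEL recalibration is applied to the input VCF first
# (writing the intermediate tmp.indel.recalibrated.vcf), and the SNP recalibration
# is then applied to that intermediate, so indel filtering never has to wait on the
# slower-to-build SNP model.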
-task ApplyRecalibration { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - - Float indel_filter_level - Float snp_filter_level - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ${input_vcf} \ - --recal-file ${indels_recalibration} \ - --tranches-file ${indels_tranches} \ - --truth-sensitivity-filter-level ${indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O ${recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ${snps_recalibration} \ - --tranches-file ${snps_tranches} \ - --truth-sensitivity-filter-level ${snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - } - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibrated_vcf = "${recalibrated_vcf_filename}" - File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - File input_vcfs_fofn - String output_vcf_name - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - # Now using NIO to localize the vcfs but the input file must have a ".list" extension - mv ${input_vcfs_fofn} inputs.list - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation. - # This argument disables expensive checks that the file headers contain the same set of - # genotyped samples and that files are in order by position of first record. 
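    # --gather-type BLOCK concatenates the bgzip blocks of the shard VCFs rather
    # than decoding and re-encoding each record, which is why the inputs must be
    # block-compressed .vcf.gz shards that are already in positional order.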
- ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input inputs.list \ - --output ${output_vcf_name} - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - IndexFeatureFile \ - --feature-file ${output_vcf_name} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task CollectVariantCallingMetrics { - File input_vcf - File input_vcf_index - - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - CollectVariantCallingMetrics \ - --INPUT ${input_vcf} \ - --DBSNP ${dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ${ref_dict} \ - --OUTPUT ${metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ${interval_list} - } - output { - File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" - } - runtime { - docker: docker - memory: "7 GB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } -} - -task GatherMetrics { - File input_details_fofn - File input_summaries_fofn - - String output_prefix - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf metrics - - mkdir metrics - RETRY_LIMIT=5 - - count=0 - until cat ${input_details_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - count=0 - until cat ${input_summaries_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - INPUT=`cat ${input_details_fofn} | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("-I=metrics/%s ", $1)}'` - - ${gatk_path} --java-options "-Xmx2g -Xms2g" \ - AccumulateVariantCallingMetrics \ - $INPUT \ - -O ${output_prefix} - >>> - runtime { - docker: docker - memory: "3 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" - } -} - -task DynamicallyCombineIntervals { - File intervals - Int merge_count - Int preemptible_tries - - command { - python << CODE - def parse_interval(interval): - colon_split = interval.split(":") - chromosome = colon_split[0] - dash_split = colon_split[1].split("-") - start = int(dash_split[0]) - end = int(dash_split[1]) - return chromosome, start, end - - def add_interval(chr, start, end): - lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) - return chr, start, end - - count = 0 - chain_count = ${merge_count} - l_chr, l_start, l_end = "", 0, 0 - lines_to_write = [] - with 
open("${intervals}") as f: - with open("out.intervals", "w") as f1: - for line in f.readlines(): - # initialization - if count == 0: - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - # reached number to combine, so spit out and start over - if count == chain_count: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - - c_chr, c_start, c_end = parse_interval(line) - # if adjacent keep the chain going - if c_chr == w_chr and c_start == w_end + 1: - w_end = c_end - count += 1 - continue - # not adjacent, end here and start a new chain - else: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - if l_char != w_chr or l_start != w_start or l_end != w_end: - add_interval(w_chr, w_start, w_end) - f1.writelines("\n".join(lines_to_write)) - CODE - } - - runtime { - memory: "3 GB" - preemptible: preemptible_tries - docker: "python:2.7" - } - - output { - File output_intervals = "out.intervals" - } -} diff --git a/tasks/JointGenotypingTasks-terra.wdl b/tasks/JointGenotypingTasks-terra.wdl new file mode 100644 index 0000000..78b062a --- /dev/null +++ b/tasks/JointGenotypingTasks-terra.wdl @@ -0,0 +1,1099 @@ +version 1.0 + + +task CheckSamplesUnique { + input { + File sample_name_map + } + + command { + set -euo pipefail + if [[ $(cut -f 1 ~{sample_name_map} | wc -l) -ne $(cut -f 1 ~{sample_name_map} | sort | uniq | wc -l) ]] + then + echo "Samples in the sample_name_map are not unique" 1>&2 + exit 1 + elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt 50 ]] + then + echo "There are less than 50 samples in the sample_name_map" 1>&2 + echo "Having less than 50 samples means there likely isn't enough data to complete joint calling" 1>&2 + exit 1 + else + echo true + fi + } + + output { + Boolean samples_unique = read_boolean(stdout()) + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task SplitIntervalList { + + input { + File interval_list + Int scatter_count + File ref_fasta + File ref_fasta_index + File ref_dict + Boolean sample_names_unique_done + Int disk_size + String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + interval_list: { + localization_optional: true + } + } + + command <<< + ~{gatk_path} --java-options -Xms3g SplitIntervals \ + -L ~{interval_list} -O scatterDir -scatter ~{scatter_count} -R ~{ref_fasta} \ + -mode ~{scatter_mode} + >>> + + runtime { + memory: "3.75 GiB" + preemptible: preemptible_tries + disks: "local-disk " + disk_size + " HDD" + docker: gatk_docker + } + + output { + Array[File] output_intervals = glob("scatterDir/*") + } +} + +task ImportGVCFs { + + input { + File sample_name_map + File interval + File ref_fasta + File ref_fasta_index + File ref_dict + + String workspace_dir_name + + Int disk_size + Int batch_size + + # Using a nightly version of GATK containing fixes for GenomicsDB + # https://github.com/broadinstitute/gatk/pull/5899 + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + rm -rf ~{workspace_dir_name} + + # We've seen some GenomicsDB performance regressions related to intervals, so we're going to pretend we only have a single interval + # using the --merge-input-intervals arg + # There's no data in between since we didn't 
run HaplotypeCaller over those loci so we're not wasting any compute
+
+    # The memory setting here is very important and must be several GiB lower
+    # than the total memory allocated to the VM because this tool uses
+    # a significant amount of non-heap memory for native libraries.
+    # Also, testing has shown that the multithreaded reader initialization
+    # does not scale well beyond 5 threads, so don't increase beyond that.
+    ~{gatk_path} --java-options -Xms8g \
+      GenomicsDBImport \
+      --genomicsdb-workspace-path ~{workspace_dir_name} \
+      --batch-size ~{batch_size} \
+      -L ~{interval} \
+      --sample-name-map ~{sample_name_map} \
+      --reader-threads 5 \
+      --merge-input-intervals \
+      --consolidate
+
+    tar -cf ~{workspace_dir_name}.tar ~{workspace_dir_name}
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 4
+    disks: "local-disk " + disk_size + " HDD"
+    docker: gatk_docker
+    preemptible: preemptible_tries
+  }
+
+  output {
+    File output_genomicsdb = "~{workspace_dir_name}.tar"
+  }
+}
+
+task GenotypeGVCFs {
+
+  input {
+    File workspace_tar
+    File interval
+
+    String output_vcf_filename
+
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+
+    String dbsnp_vcf
+
+    Int disk_size
+    String gatk_docker
+    String gatk_path
+    String preemptible_tries
+  }
+
+  parameter_meta {
+    interval: {
+      localization_optional: true
+    }
+  }
+
+  command <<<
+    set -euo pipefail
+
+    tar -xf ~{workspace_tar}
+    WORKSPACE=$(basename ~{workspace_tar} .tar)
+
+    ~{gatk_path} --java-options -Xms8g \
+      GenotypeGVCFs \
+      -R ~{ref_fasta} \
+      -O ~{output_vcf_filename} \
+      -D ~{dbsnp_vcf} \
+      -G StandardAnnotation -G AS_StandardAnnotation \
+      --only-output-calls-starting-in-intervals \
+      --use-new-qual-calculator \
+      -V gendb://$WORKSPACE \
+      -L ~{interval} \
+      --merge-input-intervals
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 2
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+    docker: gatk_docker
+  }
+
+  output {
+    File output_vcf = "~{output_vcf_filename}"
+    File output_vcf_index = "~{output_vcf_filename}.tbi"
+  }
+}
+
+task GnarlyGenotyper {
+
+  input {
+    File workspace_tar
+    File interval
+    String output_vcf_filename
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+    String dbsnp_vcf
+
+    String gatk_docker = "us.gcr.io/broad-gotc-prod/gnarly_genotyper:fixNegativeRefCount"
+    String preemptible_tries
+  }
+
+  parameter_meta {
+    interval: {
+      localization_optional: true
+    }
+  }
+
+  Int disk_size = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3)
+
+  command <<<
+    set -e
+
+    tar -xf ~{workspace_tar}
+    WORKSPACE=$( basename ~{workspace_tar} .tar)
+
+    # use a query.json to set some params that aren't exposed -- ewwwww
+    cat <<EOF > $WORKSPACE/query.json
+    {
+      "scan_full": true,
+      "workspace": "genomicsdb",
+      "array": "genomicsdb_array",
+      "vid_mapping_file": "genomicsdb/vidmap.json",
+      "callset_mapping_file": "genomicsdb/callset.json",
+      "reference_genome": "/cromwell_root/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
+      "max_diploid_alt_alleles_that_can_be_genotyped": 6,
+      "produce_GT_field": true
+    }
+EOF
+
+    gatk --java-options -Xms8g \
+      GnarlyGenotyper \
+      -R ~{ref_fasta} \
+      -O ~{output_vcf_filename} \
+      --output-database-name annotationDB.vcf.gz \
+      -D ~{dbsnp_vcf} \
+      --only-output-calls-starting-in-intervals \
+      --use-new-qual-calculator \
+      -V gendb://$WORKSPACE \
+      -L ~{interval} \
+      -stand-call-conf 10 \
+      --merge-input-intervals
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 2
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: 
preemptible_tries + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_filename}" + File output_vcf_index = "~{output_vcf_filename}.tbi" + File output_database = "annotationDB.vcf.gz" + File output_database_index = "annotationDB.vcf.gz.tbi" + } +} + +task HardFilterAndMakeSitesOnlyVcf { + + input { + File vcf + File vcf_index + Float excess_het_threshold + + String variant_filtered_vcf_filename + String sites_only_vcf_filename + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms3g \ + VariantFiltration \ + --filter-expression "ExcessHet > ~{excess_het_threshold}" \ + --filter-name ExcessHet \ + -O ~{variant_filtered_vcf_filename} \ + -V ~{vcf} + + ~{gatk_path} --java-options -Xms3g \ + MakeSitesOnlyVcf \ + -I ~{variant_filtered_vcf_filename} \ + -O ~{sites_only_vcf_filename} + >>> + + runtime { + memory: "3.75 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File variant_filtered_vcf = "~{variant_filtered_vcf_filename}" + File variant_filtered_vcf_index = "~{variant_filtered_vcf_filename}.tbi" + File sites_only_vcf = "~{sites_only_vcf_filename}" + File sites_only_vcf_index = "~{sites_only_vcf_filename}.tbi" + } +} + +task IndelsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File mills_resource_vcf + File axiomPoly_resource_vcf + File dbsnp_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 4 + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms24g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode INDEL \ + --max-gaussians ~{max_gaussians} \ + -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \ + -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "26 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task SNPsVariantRecalibratorCreateModel { + + input { + String recalibration_filename + String tranches_filename + Int downsampleFactor + String model_report_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File 
hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms100g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + --sample-every-Nth-variant ~{downsampleFactor} \ + --output-model ~{model_report_filename} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "104 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File model_report = "~{model_report_filename}" + } +} + +task SNPsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + File? model_report + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + Int? 
machine_mem_gb + + } + + Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, + hapmap_resource_vcf, + omni_resource_vcf, + one_thousand_genomes_resource_vcf, + dbsnp_resource_vcf], + "GiB")) + Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) + Int java_mem = machine_mem - 1 + + + String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else "" + + command <<< + set -euo pipefail + + MODEL_REPORT=~{model_report} + + ~{gatk_path} --java-options -Xms~{java_mem}g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + ~{model_report_arg} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "~{machine_mem} GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task GatherTranches { + + input { + Array[File] tranches + String output_filename + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + tranches: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + tranches_fofn=~{write_lines(tranches)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
+ + # This is here to deal with the JES bug where commands may be run twice + rm -rf tranches + mkdir tranches + RETRY_LIMIT=5 + + count=0 + until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the tranches from the cloud' && exit 1 + fi + + cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list + + ~{gatk_path} --java-options -Xms6g \ + GatherTranches \ + --input inputs.list \ + --output ~{output_filename} + >>> + + runtime { + memory: "7.5 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File tranches = "~{output_filename}" + } +} + +task ApplyRecalibration { + + input { + String recalibrated_vcf_filename + File input_vcf + File input_vcf_index + File indels_recalibration + File indels_recalibration_index + File indels_tranches + File snps_recalibration + File snps_recalibration_index + File snps_tranches + Float indel_filter_level + Float snp_filter_level + Boolean use_allele_specific_annotations + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms5g \ + ApplyVQSR \ + -O tmp.indel.recalibrated.vcf \ + -V ~{input_vcf} \ + --recal-file ~{indels_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{indels_tranches} \ + --truth-sensitivity-filter-level ~{indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + ~{gatk_path} --java-options -Xms5g \ + ApplyVQSR \ + -O ~{recalibrated_vcf_filename} \ + -V tmp.indel.recalibrated.vcf \ + --recal-file ~{snps_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{snps_tranches} \ + --truth-sensitivity-filter-level ~{snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File recalibrated_vcf = "~{recalibrated_vcf_filename}" + File recalibrated_vcf_index = "~{recalibrated_vcf_filename}.tbi" + } +} + +task GatherVcfs { + + input { + Array[File] input_vcfs + String output_vcf_name + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + input_vcfs: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + # --ignore-safety-checks makes a big performance difference so we include it in our invocation. + # This argument disables expensive checks that the file headers contain the same set of + # genotyped samples and that files are in order by position of first record. 
+ ~{gatk_path} --java-options -Xms6g \ + GatherVcfsCloud \ + --ignore-safety-checks \ + --gather-type BLOCK \ + --input ~{sep=" --input " input_vcfs} \ + --output ~{output_vcf_name} + + tabix ~{output_vcf_name} + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_name}" + File output_vcf_index = "~{output_vcf_name}.tbi" + } +} + +task SelectFingerprintSiteVariants { + + input { + File input_vcf + File haplotype_database + String base_output_name + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + input_vcf: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + function hdb_to_interval_list() { + input=$1 + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + ~{gatk_path} --java-options -Xms6g \ + SelectVariants \ + --variant ~{input_vcf} \ + --intervals hdb.interval_list \ + --output ~{base_output_name}.vcf.gz + >>> + + runtime { + memory: "7.5 GiB" + cpu: 1 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File output_vcf = "~{base_output_name}.vcf.gz" + File output_vcf_index = "~{base_output_name}.vcf.gz.tbi" + } +} + +task CollectVariantCallingMetrics { + + input { + File input_vcf + File input_vcf_index + String metrics_filename_prefix + File dbsnp_vcf + File dbsnp_vcf_index + File interval_list + File ref_dict + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms6g \ + CollectVariantCallingMetrics \ + --INPUT ~{input_vcf} \ + --DBSNP ~{dbsnp_vcf} \ + --SEQUENCE_DICTIONARY ~{ref_dict} \ + --OUTPUT ~{metrics_filename_prefix} \ + --THREAD_COUNT 8 \ + --TARGET_INTERVALS ~{interval_list} + >>> + + output { + File detail_metrics_file = "~{metrics_filename_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{metrics_filename_prefix}.variant_calling_summary_metrics" + } + + runtime { + memory: "7.5 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } +} + +task GatherVariantCallingMetrics { + + input { + Array[File] input_details + Array[File] input_summaries + String output_prefix + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + input_details: { + localization_optional: true + } + input_summaries: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + input_details_fofn=~{write_lines(input_details)} + input_summaries_fofn=~{write_lines(input_summaries)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
+ + # This is here to deal with the JES bug where commands may be run twice + rm -rf metrics + + mkdir metrics + RETRY_LIMIT=5 + + count=0 + until cat $input_details_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + count=0 + until cat $input_summaries_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + INPUT=$(cat $input_details_fofn | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("--INPUT metrics/%s ", $1)}') + + ~{gatk_path} --java-options -Xms2g \ + AccumulateVariantCallingMetrics \ + $INPUT \ + --OUTPUT ~{output_prefix} + >>> + + runtime { + memory: "3 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File detail_metrics_file = "~{output_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{output_prefix}.variant_calling_summary_metrics" + } +} + +task CrossCheckFingerprint { + + input { + Array[File] gvcf_paths + Array[File] vcf_paths + File sample_name_map + File haplotype_database + String output_base_name + Boolean scattered = false + Array[String] expected_inconclusive_samples = [] + String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" + String preemptible_tries + } + + parameter_meta { + gvcf_paths: { + localization_optional: true + } + vcf_paths: { + localization_optional: true + } + } + + Int num_gvcfs = length(gvcf_paths) + Int cpu = if num_gvcfs < 32 then num_gvcfs else 32 + # Compute memory to use based on the CPU count, following the pattern of + # 3.75GiB / cpu used by GCP's pricing: https://cloud.google.com/compute/pricing + Int memMb = round(cpu * 3.75 * 1024) + Int disk = 100 + + String output_name = output_base_name + ".fingerprintcheck" + + command <<< + set -eu + + gvcfInputsList=~{write_lines(gvcf_paths)} + vcfInputsList=~{write_lines(vcf_paths)} + + cp $gvcfInputsList gvcf_inputs.list + cp $vcfInputsList vcf_inputs.list + + java -Dpicard.useLegacyParser=false -Xms~{memMb - 512}m \ + -jar /usr/gitc/PicardPublicWithCrosscheckNIOandSampleMapping.jar \ + CrosscheckFingerprints \ + --INPUT gvcf_inputs.list \ + --SECOND_INPUT vcf_inputs.list \ + --HAPLOTYPE_MAP ~{haplotype_database} \ + --INPUT_SAMPLE_FILE_MAP ~{sample_name_map} \ + --CROSSCHECK_BY SAMPLE \ + --CROSSCHECK_MODE CHECK_SAME_SAMPLE \ + --NUM_THREADS ~{cpu} \ + --SKIP_INPUT_READABLITY_TEST \ + ~{true='--EXIT_CODE_WHEN_MISMATCH 0' false='' scattered} \ + --OUTPUT ~{output_name} + + if ~{scattered}; then + # UNEXPECTED_MATCH is not possible with CHECK_SAME_SAMPLE + matches=$(grep "EXPECTED_MATCH" ~{output_name} | wc -l) + + # check inconclusive samples + expectedInconclusiveSamples=("~{sep='" "' expected_inconclusive_samples}") + inconclusiveSamplesCount=0 + inconclusiveSamples=($(grep 'INCONCLUSIVE' ~{output_name} | cut -f 1)) + for sample in ${inconclusiveSamples[@]}; do + if printf '%s\n' ${expectedInconclusiveSamples[@]} | grep -P '^'${sample}'$'; then + inconclusiveSamplesCount=$((inconclusiveSamplesCount+1)) + fi + done + + total_matches=$((inconclusiveSamplesCount + 
matches)) + if [[ ${total_matches} -eq ~{num_gvcfs} ]]; then + >&2 echo "Found the correct number of matches (~{num_gvcfs}) for this shard" + else + >&2 echo "ERROR: Found $total_matches 'EXPECTED_MATCH' records, but expected ~{num_gvcfs}" + exit 1 + fi + fi + >>> + + runtime { + memory: memMb + " MiB" + disks: "local-disk " + disk + " HDD" + preemptible: preemptible_tries + docker: picard_docker + } + + output { + File crosscheck_metrics = output_name + } +} + +task GatherPicardMetrics { + + input { + Array[File] metrics_files + String output_file_name + Int disk_size + } + + command { + # Don't use this task to gather tens of thousands of files. + # Cromwell can't handle it. + + # This cannot gather metrics with histograms + + head -n 7 ~{metrics_files[0]} > ~{output_file_name} + + for metrics_file in ~{sep=' ' metrics_files}; do + sed -n '1,7d;p' $metrics_file | grep -v '^$' >> ~{output_file_name} + done + } + + output { + File gathered_metrics = "~{output_file_name}" + } + + runtime { + cpu: 1 + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk " + disk_size + " HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task GetFingerprintingIntervalIndices { + + input { + Array[File] unpadded_intervals + File haplotype_database + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -xeo pipefail + + function rename_intervals(){ + interval_list=$1 + name=$2 + + awk 'BEGIN{FS=IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {$5="'$name'"; print}' $interval_list + } + export -f rename_intervals + + function hdb_to_interval_list(){ + input=$1 + + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + function rename_scatter(){ + file=$1 + number=$(echo $file | sed -E 's|([0-9]+)-scattered\.interval.*|\1|') + rename_intervals $file $number > scattered.renamed.$number.interval_list + } + export -f rename_scatter + + # rename the intervals within each interval_list according to the number in the name of the list + + cp ~{sep=' ' unpadded_intervals} ./ + + cat ~{write_lines(unpadded_intervals)} | xargs -n1 basename | xargs -I{} bash -c 'rename_scatter $@' _ {} + + #find the first header + find . -name "scattered.renamed.*.interval_list" | head -n1 | xargs cat | grep '^@' > all.interval_list + + # concatenate the resulting intervals (with no header) + find . 
-name "scattered.renamed.*.interval_list" | xargs cat | grep -v '^@' >> all.interval_list + + # convert the Haplotype_database to an interval_list + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + # find the intervals that overlap the haplotype_database + ~{gatk_path} IntervalListTools \ + -ACTION OVERLAPS \ + -O all.sorted.interval_list \ + -I all.interval_list \ + -SI hdb.interval_list + + if grep -v '^@' all.sorted.interval_list; then + grep -v '^@' all.sorted.interval_list | awk '{FS="\t"; print $5}' | uniq > indices.out + else + touch indices.out + fi + >>> + + output { + Array[String] indices_to_fingerprint = read_lines("indices.out") + File all_sorted_interval_list = "all.sorted.interval_list" + File all_interval_list = "all.interval_list" + File hdb_interval_list = "hdb.interval_list" + } + + runtime { + cpu: 2 + memory: "3.75 GiB" + preemptible: preemptible_tries + disks: "local-disk 10 HDD" + docker: gatk_docker + } +} + +task PartitionSampleNameMap { + + input { + File sample_name_map + Int line_limit + } + + command { + + cut -f 2 ~{sample_name_map} > sample_paths + split -l ~{line_limit} -d sample_paths partition_ + + # Let the OS catch up with creation of files for glob command + sleep 1 + } + + output { + Array[File] partitions = glob("partition_*") + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} diff --git a/tasks/JointGenotypingTasks.wdl b/tasks/JointGenotypingTasks.wdl index f847e5d..e15f19c 100644 --- a/tasks/JointGenotypingTasks.wdl +++ b/tasks/JointGenotypingTasks.wdl @@ -4,6 +4,7 @@ version 1.0 task CheckSamplesUnique { input { File sample_name_map + Int sample_num_threshold = 50 } command { @@ -12,10 +13,10 @@ task CheckSamplesUnique { then echo "Samples in the sample_name_map are not unique" 1>&2 exit 1 - elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt 50 ]] + elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt ~{sample_num_threshold} ]] then - echo "There are less than 50 samples in the sample_name_map" 1>&2 - echo "Having less than 50 samples means there likely isn't enough data to complete joint calling" 1>&2 + echo "There are fewer than ~{sample_num_threshold} samples in the sample_name_map" 1>&2 + echo "Having fewer than ~{sample_num_threshold} samples means there likely isn't enough data to complete joint calling" 1>&2 exit 1 else echo true @@ -146,6 +147,8 @@ task GenotypeGVCFs { String dbsnp_vcf Int disk_size + # This is needed for gVCFs generated with GATK3 HaplotypeCaller + Boolean allow_old_rms_mapping_quality_annotation_data = false String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.4.0" } @@ -171,6 +174,7 @@ task GenotypeGVCFs { --use-new-qual-calculator \ -V gendb://$WORKSPACE \ -L ~{interval} \ + ~{true='--allow-old-rms-mapping-quality-annotation-data' false='' allow_old_rms_mapping_quality_annotation_data} \ --merge-input-intervals >>> From 0e2d9c2b561460508013f84f440c5bb5de610e50 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 2 Dec 2019 15:24:11 -0500 Subject: [PATCH 14/24] Update genotype2develop (#45) * added updated joint-discovery wdl, needs testing * Added place holder for terra verion of JointGenotyping wdl * Replaced joint-discovery with JointGenotyping workflow, added place holders for terra version for JointGenotyping workflow * updated the terra version of the JointGenotyping * added urls for imports for JointGenotyping workflows * Updated ReadMe * added missing runtime variables * added missing runtime variable --- 
JointGenotyping-terra.wdl | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) diff --git a/JointGenotyping-terra.wdl b/JointGenotyping-terra.wdl index 754b3ff..3a937c9 100644 --- a/JointGenotyping-terra.wdl +++ b/JointGenotyping-terra.wdl @@ -159,7 +159,10 @@ workflow JointGenotyping { ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, disk_size = small_disk, - sample_names_unique_done = CheckSamplesUnique.samples_unique + sample_names_unique_done = CheckSamplesUnique.samples_unique, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } Array[File] unpadded_intervals = SplitIntervalList.output_intervals @@ -212,6 +215,7 @@ workflow JointGenotyping { ref_fasta_index = ref_fasta_index, ref_dict = ref_dict, dbsnp_vcf = dbsnp_vcf, + preemptible_tries = preemptible_tries } } @@ -221,7 +225,10 @@ workflow JointGenotyping { input: input_vcfs = gnarly_gvcfs, output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", - disk_size = large_disk + disk_size = large_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } } @@ -467,7 +474,10 @@ workflow JointGenotyping { call Tasks.GetFingerprintingIntervalIndices { input: unpadded_intervals = unpadded_intervals, - haplotype_database = haplotype_database + haplotype_database = haplotype_database, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint @@ -480,7 +490,10 @@ workflow JointGenotyping { input: input_vcfs = vcfs_to_fingerprint, output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", - disk_size = medium_disk + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } call Tasks.SelectFingerprintSiteVariants { @@ -488,7 +501,10 @@ workflow JointGenotyping { input_vcf = GatherFingerprintingVcfs.output_vcf, base_output_name = callset_name + ".fingerprinting", haplotype_database = haplotype_database, - disk_size = medium_disk + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries } call Tasks.PartitionSampleNameMap { @@ -509,7 +525,8 @@ workflow JointGenotyping { haplotype_database = haplotype_database, output_base_name = callset_name + "." 
+ idx, scattered = true, - picard_docker = picard_docker + picard_docker = picard_docker, + preemptible_tries = preemptible_tries } } @@ -534,7 +551,8 @@ workflow JointGenotyping { sample_name_map = sample_name_map, haplotype_database = haplotype_database, output_base_name = callset_name, - picard_docker = picard_docker + picard_docker = picard_docker, + preemptible_tries = preemptible_tries } } From 481a17781e45b1420173d88c4772b10f8b6bcb72 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 2 Dec 2019 15:58:29 -0500 Subject: [PATCH 15/24] Update genotype2develop (#46) * added updated joint-discovery wdl, needs testing * Added place holder for terra verion of JointGenotyping wdl * Replaced joint-discovery with JointGenotyping workflow, added place holders for terra version for JointGenotyping workflow * updated the terra version of the JointGenotyping * added urls for imports for JointGenotyping workflows * Updated ReadMe * added missing runtime variables * added missing runtime variable From c61c12805c16e643ff9065f72961480e7e571fd2 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Tue, 3 Dec 2019 12:05:47 -0500 Subject: [PATCH 16/24] Update genotype2develop (#48) * added updated joint-discovery wdl, needs testing * Added place holder for terra verion of JointGenotyping wdl * Replaced joint-discovery with JointGenotyping workflow, added place holders for terra version for JointGenotyping workflow * updated the terra version of the JointGenotyping * added urls for imports for JointGenotyping workflows * Updated ReadMe * added missing runtime variables * added missing runtime variable * added recent updates from dsde-pipelines repo to terra version --- tasks/JointGenotypingTasks-terra.wdl | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tasks/JointGenotypingTasks-terra.wdl b/tasks/JointGenotypingTasks-terra.wdl index 78b062a..bc7edf1 100644 --- a/tasks/JointGenotypingTasks-terra.wdl +++ b/tasks/JointGenotypingTasks-terra.wdl @@ -4,6 +4,7 @@ version 1.0 task CheckSamplesUnique { input { File sample_name_map + Int sample_num_threshold = 50 } command { @@ -12,10 +13,10 @@ task CheckSamplesUnique { then echo "Samples in the sample_name_map are not unique" 1>&2 exit 1 - elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt 50 ]] + elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt ~{sample_num_threshold} ]] then - echo "There are less than 50 samples in the sample_name_map" 1>&2 - echo "Having less than 50 samples means there likely isn't enough data to complete joint calling" 1>&2 + echo "There are fewer than ~{sample_num_threshold} samples in the sample_name_map" 1>&2 + echo "Having fewer than ~{sample_num_threshold} samples means there likely isn't enough data to complete joint calling" 1>&2 exit 1 else echo true @@ -90,7 +91,7 @@ task ImportGVCFs { # Using a nightly version of GATK containing fixes for GenomicsDB # https://github.com/broadinstitute/gatk/pull/5899 - String gatk_docker + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk-nightly:2019-05-07-4.1.2.0-5-g53d015e4f-NIGHTLY-SNAPSHOT" String gatk_path String preemptible_tries } @@ -150,6 +151,8 @@ task GenotypeGVCFs { String dbsnp_vcf Int disk_size + # This is needed for gVCFs generated with GATK3 HaplotypeCaller + Boolean allow_old_rms_mapping_quality_annotation_data = false String gatk_docker String gatk_path String preemptible_tries @@ -177,6 +180,7 @@ task GenotypeGVCFs { --use-new-qual-calculator \ -V gendb://$WORKSPACE \ -L ~{interval} \ + ~{true='--allow-old-rms-mapping-quality-annotation-data' 
false='' allow_old_rms_mapping_quality_annotation_data} \ --merge-input-intervals >>> @@ -530,7 +534,7 @@ task GatherTranches { Array[File] tranches String output_filename Int disk_size - String gatk_docker + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" String gatk_path String preemptible_tries } @@ -568,7 +572,7 @@ task GatherTranches { cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - ~{gatk_path} --java-options -Xms6g \ + /usr/gitc/gatk --java-options -Xms6g \ GatherTranches \ --input inputs.list \ --output ~{output_filename} @@ -794,7 +798,7 @@ task GatherVariantCallingMetrics { Array[File] input_summaries String output_prefix Int disk_size - String gatk_docker + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" String gatk_path String preemptible_tries } @@ -846,7 +850,7 @@ task GatherVariantCallingMetrics { INPUT=$(cat $input_details_fofn | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("--INPUT metrics/%s ", $1)}') - ~{gatk_path} --java-options -Xms2g \ + /usr/gitc/gatk --java-options -Xms2g \ AccumulateVariantCallingMetrics \ $INPUT \ --OUTPUT ~{output_prefix} From cd1b265a510b427db4bbed84ea8dced24d216d78 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Fri, 6 Dec 2019 18:27:46 +0000 Subject: [PATCH 17/24] removed unnecessary optional override variables --- JointGenotyping-terra.wdl | 28 ++++++++++------------------ haplotypecaller-gvcf-gatk4.wdl | 24 +++++++++--------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/JointGenotyping-terra.wdl b/JointGenotyping-terra.wdl index 3a937c9..f28bfd7 100644 --- a/JointGenotyping-terra.wdl +++ b/JointGenotyping-terra.wdl @@ -89,24 +89,16 @@ workflow JointGenotyping { File dbsnp_resource_vcf_index = dbsnp_vcf_index # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.4.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - String? picard_docker_override - String picard_docker = select_first([picard_docker_override, "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - String? preemptible_tries_override - Int preemptible_tries = select_first([preemptible_tries_override, "3"]) + String gatk_docker = "broadinstitute/gatk:4.1.4.0" + String gatk_path = "/gatk/gatk" + String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" + + Int small_disk = 100 + Int medium_disk = 200 + Int large_disk = 300 + Int huge_disk = 400 + + Int preemptible_tries = 3 # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index abf9092..4b17aee 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -38,18 +38,13 @@ workflow HaplotypeCallerGvcf_GATK4 { File ref_fasta_index File scattered_calling_intervals_list - Boolean? make_gvcf - Boolean making_gvcf = select_first([make_gvcf,true]) + Boolean make_gvcf = true + String gatk_docker = "broadinstitute/gatk:4.1.4.0" + String gatk_path = "/gatk/gatk" + String gitc_docker = "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817" + String samtools_path = "samtools" + } - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.4.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - String? gitc_docker_override - String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - String? samtools_path_override - String samtools_path = select_first([samtools_path_override, "samtools"]) - Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) #is the input a cram file? @@ -57,7 +52,7 @@ workflow HaplotypeCallerGvcf_GATK4 { String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") String vcf_basename = sample_basename - String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" String output_filename = vcf_basename + output_suffix # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. @@ -65,7 +60,6 @@ workflow HaplotypeCallerGvcf_GATK4 { # to account for the fact that the data is quite uneven across the shards. Int potential_hc_divisor = length(scattered_calling_intervals) - 20 Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 - } if ( is_cram ) { call CramToBamTask { @@ -94,7 +88,7 @@ workflow HaplotypeCallerGvcf_GATK4 { ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, hc_scatter = hc_divisor, - make_gvcf = making_gvcf, + make_gvcf = make_gvcf, docker = gatk_docker, gatk_path = gatk_path } @@ -241,9 +235,9 @@ task MergeGVCFs { String docker Int? mem_gb Int? disk_space_gb - Boolean use_ssd = false Int? 
preemptible_attempts } + Boolean use_ssd = false Int machine_mem_gb = select_first([mem_gb, 3]) Int command_mem_gb = machine_mem_gb - 1 From 7da582425a119a10487783efe82f8a0de0381a24 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 18 Dec 2019 17:05:05 +0000 Subject: [PATCH 18/24] replaced '$' with '~' --- haplotypecaller-gvcf-gatk4.wdl | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index 4b17aee..e6c849d 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -138,10 +138,10 @@ task CramToBamTask { set -e set -o pipefail - ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | - ${samtools_path} view -b -o ${sample_name}.bam - - ${samtools_path} index -b ${sample_name}.bam - mv ${sample_name}.bam.bai ${sample_name}.bai + ~{samtools_path} view -h -T ~{ref_fasta} ~{input_cram} | + ~{samtools_path} view -b -o ~{sample_name}.bam - + ~{samtools_path} index -b ~{sample_name}.bam + mv ~{sample_name}.bam.bai ~{sample_name}.bai } runtime { docker: docker @@ -150,8 +150,8 @@ task CramToBamTask { preemptible: select_first([preemptible_attempts, 3]) } output { - File output_bam = "${sample_name}.bam" - File output_bai = "${sample_name}.bai" + File output_bam = "~{sample_name}.bam" + File output_bai = "~{sample_name}.bai" } } @@ -202,13 +202,13 @@ task HaplotypeCaller { command { set -e - ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ + ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G ~{java_opt}" \ HaplotypeCaller \ - -R ${ref_fasta} \ - -I ${input_bam} \ - -L ${interval_list} \ - -O ${output_filename} \ - -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} + -R ~{ref_fasta} \ + -I ~{input_bam} \ + -L ~{interval_list} \ + -O ~{output_filename} \ + -contamination ~{default=0 contamination} ~{true="-ERC GVCF" false="" make_gvcf} } runtime { docker: docker @@ -217,8 +217,8 @@ task HaplotypeCaller { preemptible: select_first([preemptible_attempts, 3]) } output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" + File output_vcf = "~{output_filename}" + File output_vcf_index = "~{output_filename}.tbi" } } # Merge GVCFs generated per-interval for the same sample @@ -244,10 +244,10 @@ task MergeGVCFs { command { set -e - ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ + ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G" \ MergeVcfs \ - --INPUT ${sep=' --INPUT ' input_vcfs} \ - --OUTPUT ${output_filename} + --INPUT ~{sep=' --INPUT ' input_vcfs} \ + --OUTPUT ~{output_filename} } runtime { docker: docker @@ -256,8 +256,8 @@ task MergeGVCFs { preemptible: select_first([preemptible_attempts, 3]) } output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" + File output_vcf = "~{output_filename}" + File output_vcf_index = "~{output_filename}.tbi" } } From 976988f93874d80e0f57acf969f712e8856fc487 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Wed, 8 Jan 2020 20:09:09 +0000 Subject: [PATCH 19/24] updated broad reference bucket path --- JointGenotyping.hg19.wgs.inputs.json | 41 +++++++++++++++++++ JointGenotyping.hg38.terra.wgs.inputs.json | 35 ++++++++++++++++ JointGenotyping.hg38.wgs.inputs.json | 36 ++++++++-------- ...typecaller-gvcf-gatk4.hg38.wgs.inputs.json | 6 +-- 4 files changed, 97 insertions(+), 21 deletions(-) create mode 100644 JointGenotyping.hg19.wgs.inputs.json create mode 100644 
JointGenotyping.hg38.terra.wgs.inputs.json diff --git a/JointGenotyping.hg19.wgs.inputs.json b/JointGenotyping.hg19.wgs.inputs.json new file mode 100644 index 0000000..980cc27 --- /dev/null +++ b/JointGenotyping.hg19.wgs.inputs.json @@ -0,0 +1,41 @@ +{ + "JointGenotyping.sample_name_map": "gs://gatk-test-data/joint_discovery/1kg_50_2/downsample.1kg_50.sample_map", + "JointGenotyping.callset_name": "NA12878", + "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5, + "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION", + + "JointGenotyping.unpadded_intervals_file": "gs://gatk-test-data/intervals/wgs_calling_regions.v1.list", + "JointGenotyping.ref_fasta": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta", + "JointGenotyping.ref_fasta_index": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai", + "JointGenotyping.ref_dict": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict", + "JointGenotyping.eval_interval_list": "gs://gcp-public-data--broad-references/hg19/v0/wgs_evaluation_regions.v1.interval_list", + "JointGenotyping.haplotype_database": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.haplotype_database.txt", + + "JointGenotyping.axiomPoly_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.vcf.gz", + "JointGenotyping.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.vcf.gz.tbi", + "JointGenotyping.dbsnp_vcf": "gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz", + "JointGenotyping.dbsnp_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz.tbi", + "JointGenotyping.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz", + "JointGenotyping.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz.tbi", + "JointGenotyping.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.sites.vcf", + "JointGenotyping.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.sites.vcf.idx", + "JointGenotyping.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/1000G_omni2.5.b37.vcf.gz", + "JointGenotyping.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/1000G_omni2.5.b37.vcf.gz.tbi", + "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz", + "JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz.tbi", + + "JointGenotyping.SNP_VQSR_downsampleFactor": 10, + "JointGenotyping.snps_variant_recalibration_threshold": 20000, + "JointGenotyping.snp_filter_level": 99.7, + "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], + "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], + + "JointGenotyping.indel_filter_level": 99.0, + "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], + "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", 
"99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + + "JointGenotyping.small_disk": 100, + "JointGenotyping.medium_disk": 200, + "JointGenotyping.large_disk": 1000, + "JointGenotyping.huge_disk": 2000 +} diff --git a/JointGenotyping.hg38.terra.wgs.inputs.json b/JointGenotyping.hg38.terra.wgs.inputs.json new file mode 100644 index 0000000..1902afa --- /dev/null +++ b/JointGenotyping.hg38.terra.wgs.inputs.json @@ -0,0 +1,35 @@ +{ +"JointGenotyping.large_disk":"1000", +"JointGenotyping.medium_disk":"200", +"JointGenotyping.indel_recalibration_annotation_values":["FS","ReadPosRankSum","MQRankSum","QD","SOR","DP"], +"JointGenotyping.snp_recalibration_tranche_values":"[100.0,99.95,99.9,99.8,99.6,99.5,99.4,99.3,99.0,98.0,97.0,90.0]", +"JointGenotyping.unbounded_scatter_count_scale_factor":"2.5", +"JointGenotyping.omni_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", +"JointGenotyping.eval_interval_list":"gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", +"JointGenotyping.one_thousand_genomes_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", +"JointGenotyping.one_thousand_genomes_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", +"JointGenotyping.small_disk":"100", +"JointGenotyping.snp_recalibration_annotation_values":"[QD,MQRankSum,ReadPosRankSum,FS,MQ,SOR,DP]", +"JointGenotyping.dbsnp_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", +"JointGenotyping.callset_name":"hg38_1kg_50", +"JointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", +"JointGenotyping.dbsnp_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", +"JointGenotyping.ref_fasta_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", +"JointGenotyping.ref_dict":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", +"JointGenotyping.mills_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", +"JointGenotyping.sample_name_map":"gs://gatk-test-data/joint_discovery/1kg_50_hg38/gvcf/hg38_1kg_50.sample_map", +"JointGenotyping.indel_recalibration_tranche_values":"[100.0,99.95,99.9,99.5,99.0,97.0,96.0,95.0,94.0,93.5,93.0,92.0,91.0,90.0]", +"JointGenotyping.omni_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", +"JointGenotyping.mills_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", +"JointGenotyping.axiomPoly_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", +"JointGenotyping.snp_filter_level":"99.7", +"JointGenotyping.snps_variant_recalibration_threshold":"20000", +"JointGenotyping.haplotype_database":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", +"JointGenotyping.hapmap_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", +"JointGenotyping.indel_filter_level":"99", +"JointGenotyping.axiomPoly_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", 
+"JointGenotyping.ref_fasta":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", +"JointGenotyping.hapmap_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", +"JointGenotyping.SNP_VQSR_downsampleFactor":"10", +"JointGenotyping.huge_disk":"2000" +} diff --git a/JointGenotyping.hg38.wgs.inputs.json b/JointGenotyping.hg38.wgs.inputs.json index 47b1632..868cd5c 100644 --- a/JointGenotyping.hg38.wgs.inputs.json +++ b/JointGenotyping.hg38.wgs.inputs.json @@ -4,25 +4,25 @@ "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5, "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION", - "JointGenotyping.unpadded_intervals_file": "gs://broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", - "JointGenotyping.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "JointGenotyping.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "JointGenotyping.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "JointGenotyping.eval_interval_list": "gs://broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", - "JointGenotyping.haplotype_database": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", + "JointGenotyping.unpadded_intervals_file": "gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", + "JointGenotyping.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "JointGenotyping.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "JointGenotyping.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "JointGenotyping.eval_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", + "JointGenotyping.haplotype_database": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", - "JointGenotyping.axiomPoly_resource_vcf": "gs://broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", - "JointGenotyping.axiomPoly_resource_vcf_index": "gs://broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", - "JointGenotyping.dbsnp_vcf": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", - "JointGenotyping.dbsnp_vcf_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", - "JointGenotyping.hapmap_resource_vcf": "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", - "JointGenotyping.hapmap_resource_vcf_index": "gs://broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", - "JointGenotyping.mills_resource_vcf": "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", - "JointGenotyping.mills_resource_vcf_index": "gs://broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", - "JointGenotyping.omni_resource_vcf": "gs://broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", - "JointGenotyping.omni_resource_vcf_index": "gs://broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", - "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - "JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", + "JointGenotyping.axiomPoly_resource_vcf": 
"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", + "JointGenotyping.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", + "JointGenotyping.dbsnp_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "JointGenotyping.dbsnp_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "JointGenotyping.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", + "JointGenotyping.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", + "JointGenotyping.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "JointGenotyping.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "JointGenotyping.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", + "JointGenotyping.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", + "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + "JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", "JointGenotyping.SNP_VQSR_downsampleFactor": 10, "JointGenotyping.snps_variant_recalibration_threshold": 20000, diff --git a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json index 313ce6b..aedac6d 100644 --- a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json +++ b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json @@ -2,9 +2,9 @@ "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai", - "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt" } From 30d95c15f0466c4b58341ce2d0abac1befddb2a7 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Thu, 9 Jan 2020 15:23:28 +0000 Subject: [PATCH 20/24] Add important notes regarding JointGenotype workflow to Readme --- README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.md b/README.md index 8b252c4..080b76b 100644 --- a/README.md +++ b/README.md @@ -81,6 +81,19 @@ it easier to configure the workflow.* The dynamic scatter interval creating was optimized for genomes. 
The scattered SNP VariantRecalibration may fail because of two few "bad" variants to build the negative model.
 Also, apologies that the logging for SNP recalibration is overly verbose.
+- No allele subsetting for the JointGenotyping workflow
+  - for large cohorts, even exome callsets can have more than 1000 alleles at low
+    complexity/STR sites
+  - for sites with more than 6 alternate alleles (by default), called genotypes will be returned,
+    but without the PLs since the PL arrays get enormous
+  - allele-specific filtering could be performed if AS annotations are present,
+    but the data will still be in the VCF in one giant INFO field
+- JointGenotyping output is divided into lots of shards
+  - desirable for use in [Hail](https://hail.is/), which supports parallel import
+  - It's possible to use [GatherVcfs](https://gatk.broadinstitute.org/hc/en-us/search?utf8=%E2%9C%93&query=GatherVcfs) to combine shards per chromosome.
+- GnarlyGenotyper uses a QUAL score approximation
+  - dramatically improves performance compared with GenotypeGVCFs, but QUAL output (and thus
+    the QD annotation) may be slightly discordant between the two tools
 - The provided JSON is meant to be a ready to use example JSON template of the workflow. It is the user’s responsibility to correctly set the reference and resource input variables using the [GATK Tool and Tutorial Documentations](https://software.broadinstitute.org/gatk/documentation/).
 - Relevant reference and resources bundles can be accessed in [Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle).
 - Runtime parameters are optimized for Broad's Google Cloud Platform implementation.

From dae556645f8274ade5e6d490b8df2bc4c6873e9c Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Thu, 9 Jan 2020 20:29:59 +0000
Subject: [PATCH 21/24] minor update to Readme, renamed input file name in json

---
 JointGenotyping.hg19.wgs.inputs.json | 2 +-
 README.md | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/JointGenotyping.hg19.wgs.inputs.json b/JointGenotyping.hg19.wgs.inputs.json
index 980cc27..d1fbc66 100644
--- a/JointGenotyping.hg19.wgs.inputs.json
+++ b/JointGenotyping.hg19.wgs.inputs.json
@@ -1,5 +1,5 @@
 {
-    "JointGenotyping.sample_name_map": "gs://gatk-test-data/joint_discovery/1kg_50_2/downsample.1kg_50.sample_map",
+    "JointGenotyping.sample_name_map": "gs://gatk-test-data/1kgp/downsampled_gvcf_hg37/hg37.1kg_50.sample_map",
     "JointGenotyping.callset_name": "NA12878",
     "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5,
     "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION",
diff --git a/README.md b/README.md
index 080b76b..b57f2c4 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ setting the `make_gvcf` input variable to `false`.
 ### JointGenotyping.wdl :
 This WDL implements the joint calling and VQSR filtering portion of the GATK
 Best Practices for germline SNP and Indel discovery
-in human whole-genome sequencing (WGS). The workflow accept a sample map
+in human whole-genome sequencing (WGS). The workflow requires a sample map
 file with 50 or more GVCFs and produces a multisample VCF.
 
 *NOTE:*
@@ -33,6 +33,7 @@ file with 50 or more GVCFs and produces a multisample VCF.
 original workflow to support users interested in running the workflow on Terra.
 The changes include variables for dockers and disksize, making
 it easier to configure the workflow.*
From dae556645f8274ade5e6d490b8df2bc4c6873e9c Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Thu, 9 Jan 2020 20:29:59 +0000
Subject: [PATCH 21/24] minor update to Readme, renamed input file name in json

---
 JointGenotyping.hg19.wgs.inputs.json | 2 +-
 README.md                            | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/JointGenotyping.hg19.wgs.inputs.json b/JointGenotyping.hg19.wgs.inputs.json
index 980cc27..d1fbc66 100644
--- a/JointGenotyping.hg19.wgs.inputs.json
+++ b/JointGenotyping.hg19.wgs.inputs.json
@@ -1,5 +1,5 @@
 {
-  "JointGenotyping.sample_name_map": "gs://gatk-test-data/joint_discovery/1kg_50_2/downsample.1kg_50.sample_map",
+  "JointGenotyping.sample_name_map": "gs://gatk-test-data/1kgp/downsampled_gvcf_hg37/hg37.1kg_50.sample_map",
   "JointGenotyping.callset_name": "NA12878",
   "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5,
   "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION",
diff --git a/README.md b/README.md
index 080b76b..b57f2c4 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ setting the `make_gvcf` input variable to `false`.
 ### JointGenotyping.wdl :
 This WDL implements the joint calling and VQSR filtering portion of the GATK
 Best Practices for germline SNP and Indel discovery
-in human whole-genome sequencing (WGS). The workflow accept a sample map
+in human whole-genome sequencing (WGS). The workflow requires a sample map
 file with 50 or more GVCFs and produces a multisample VCF.
 
 *NOTE:*
@@ -33,6 +33,7 @@ file with 50 or more GVCFs and produces a multisample VCF.
 original workflow to support users interested in running the workflow on Terra.
 The changes include variables for dockers and disksize, making
 it easier to configure the workflow.*
+*- Creating a sample map can be nusience on Terra, use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) to create one for you.*
 
 #### Requirements/expectations
@@ -90,7 +91,7 @@ it easier to configure the workflow.*
   but the data will still be in the VCF in one giant INFO field
 - JointGenotyping output is divided into lots of shards
   - desirable for use in [Hail](https://hail.is/), which supports parallel import
-  - Its possible to use [GatherVcfs](https://gatk.broadinstitute.org/hc/en-us/search?utf8=%E2%9C%93&query=GatherVcfs) to combine shards per chromosome.
+  - Its possible to use [GatherVcfs](https://gatk.broadinstitute.org/hc/en-us/search?utf8=%E2%9C%93&query=GatherVcfs) to combine shards.
 - GnarlyGenotyper uses a QUAL score approximation
   - dramatically improves performance compared with GenotypeGVCFs, but QUAL output (and thus
   the QD annotation) may be slightly discordant between the two tools

From 87b6198bb4960f3a8458b177321f0e43ee6cc53c Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Thu, 9 Jan 2020 20:33:09 +0000
Subject: [PATCH 22/24] minor spelling

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b57f2c4..5357ad8 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ file with 50 or more GVCFs and produces a multisample VCF.
 original workflow to support users interested in running the workflow on Terra.
 The changes include variables for dockers and disksize, making
 it easier to configure the workflow.*
-*- Creating a sample map can be nusience on Terra, use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) to create one for you.*
+*- Creating a sample map can be nuisance on Terra, use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) to create one for you.*
 
 #### Requirements/expectations

From 3416bbdfa10c99f02fc959cc0f30436c3591d758 Mon Sep 17 00:00:00 2001
From: bshifaw
Date: Mon, 13 Jan 2020 15:26:10 +0000
Subject: [PATCH 23/24] Added allele-specific annotations to HC command

---
 haplotypecaller-gvcf-gatk4.wdl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl
index e6c849d..9d31e3b 100644
--- a/haplotypecaller-gvcf-gatk4.wdl
+++ b/haplotypecaller-gvcf-gatk4.wdl
@@ -208,7 +208,8 @@ task HaplotypeCaller {
       -I ~{input_bam} \
       -L ~{interval_list} \
       -O ~{output_filename} \
-      -contamination ~{default=0 contamination} ~{true="-ERC GVCF" false="" make_gvcf}
+      -contamination ~{default=0 contamination} ~{true="-ERC GVCF" false="" make_gvcf} \
+      -G StandardAnnotation -G AS_StandardAnnotation -G StandardHCAnnotation
   }
   runtime {
     docker: docker
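For readers who want to see what the patched HaplotypeCaller command block resolves to outside WDL, here is a sketch with illustrative file names; the `-G` annotation-group flags are exactly those added by the patch, and they make HaplotypeCaller emit the standard, allele-specific (AS_*), and HC-specific annotation groups that allele-specific filtering needs downstream:

```bash
# Single-sample GVCF call with allele-specific annotations enabled,
# mirroring the WDL command after PATCH 23 (paths are illustrative).
gatk --java-options "-Xmx4G" HaplotypeCaller \
  -R Homo_sapiens_assembly38.fasta \
  -I NA12878.bam \
  -L scattered.interval_list \
  -O NA12878.g.vcf.gz \
  -contamination 0 \
  -ERC GVCF \
  -G StandardAnnotation -G AS_StandardAnnotation -G StandardHCAnnotation
```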
"https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/updateGenotype2develop/tasks/JointGenotypingTasks-terra.wdl" as Tasks +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks-terra.wdl" as Tasks # Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) workflow JointGenotyping { diff --git a/JointGenotyping.wdl b/JointGenotyping.wdl index 499f48e..97a6835 100644 --- a/JointGenotyping.wdl +++ b/JointGenotyping.wdl @@ -47,7 +47,7 @@ version 1.0 #import "./tasks/JointGenotypingTasks.wdl" as Tasks -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/updateGenotype2develop/tasks/JointGenotypingTasks.wdl" as Tasks +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks.wdl" as Tasks # Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) workflow JointGenotyping {