From 6d70cd35c31165703da77d005ed8c43ec9f46003 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 22 Jun 2020 16:29:23 -0400 Subject: [PATCH] Dev (#55) * Removed Terra JointGenotyping * minor correction to requirements --- JointGenotyping-terra.wdl | 574 ---------- JointGenotyping.hg38.terra.wgs.inputs.json | 35 - JointGenotyping.wdl | 3 +- README.md | 15 +- tasks/JointGenotypingTasks-terra.wdl | 1103 -------------------- 5 files changed, 6 insertions(+), 1724 deletions(-) delete mode 100644 JointGenotyping-terra.wdl delete mode 100644 JointGenotyping.hg38.terra.wgs.inputs.json delete mode 100644 tasks/JointGenotypingTasks-terra.wdl diff --git a/JointGenotyping-terra.wdl b/JointGenotyping-terra.wdl deleted file mode 100644 index b1d5c90..0000000 --- a/JointGenotyping-terra.wdl +++ /dev/null @@ -1,574 +0,0 @@ -version 1.0 - -## Copyright Broad Institute, 2019 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v47 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. 
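A back-of-the-envelope model of the VQSR wiring note above, as a minimal Python sketch; the stage durations are hypothetical placeholders chosen only to show why applying the INDEL recalibration first shortens wall-clock time:

    # Hypothetical stage durations in hours; only the ordering logic matters.
    snp_model_build = 6.0    # SNP model is slower (many more SNPs than indels)
    indel_model_build = 2.0
    apply_step = 1.0         # time to apply one recalibration to the VCF

    # INDEL recalibration applied first: it runs while the SNP model is
    # still building, so only the SNP apply step waits on the SNP model.
    indel_first = max(snp_model_build, indel_model_build + apply_step) + apply_step

    # SNP recalibration applied first: both apply steps serialize behind
    # the slow SNP model build.
    snp_first = snp_model_build + 2 * apply_step

    print(indel_first, snp_first)  # 7.0 vs 8.0 -> INDEL-first saves wall clock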
- -# WORKFLOW DEFINITION - -#import "./tasks/JointGenotypingTasks-terra.wdl" as Tasks - -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks-terra.wdl" as Tasks - -# Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) -workflow JointGenotyping { - - String pipeline_version = "1.2" - - input { - File unpadded_intervals_file - - String callset_name - File sample_name_map - - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File haplotype_database - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - # Runtime attributes - String gatk_docker = "broadinstitute/gatk:4.1.4.0" - String gatk_path = "/gatk/gatk" - String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" - - Int small_disk = 100 - Int medium_disk = 200 - Int large_disk = 300 - Int huge_disk = 400 - - Int preemptible_tries = 3 - - # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int? top_level_scatter_count - Boolean? gather_vcfs - Int snps_variant_recalibration_threshold = 500000 - Boolean rename_gvcf_samples = true - Float unbounded_scatter_count_scale_factor = 0.15 - Int gnarly_scatter_count = 10 - Boolean use_gnarly_genotyper = false - Boolean use_allele_specific_annotations = true - Boolean cross_check_fingerprints = true - Boolean scatter_cross_check_fingerprints = false - } - - Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations - - Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) - Int num_gvcfs = length(sample_name_map_lines) - - # Make a 2.5:1 interval number to samples in callset ratio interval list. - # We allow overriding the behavior by specifying the desired number of vcfs - # to scatter over for testing / special requests. - # Zamboni notes say "WGS runs get 30x more scattering than Exome" and - # exome scatterCountPerSample is 0.05, min scatter 10, max 1000 - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - # We allow overriding this default behavior for testing / special requests. - Boolean is_small_callset = select_first([gather_vcfs, num_gvcfs <= 1000]) - - Int unbounded_scatter_count = select_first([top_level_scatter_count, round(unbounded_scatter_count_scale_factor * num_gvcfs)]) - Int scatter_count = if unbounded_scatter_count > 2 then unbounded_scatter_count else 2 #I think weird things happen if scatterCount is 1 -- IntervalListTools is noop? 
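Two of the derivations in the comments above can be checked directly. A minimal Python sketch (standard library only) reproducing the ExcessHet cutoff and the interval scatter-count rule; scatter_count here is an illustrative helper, not part of the WDL:

    import math

    # ExcessHet: phred-scale the one-sided p-value for a z-score of -4.5.
    p = 0.5 * math.erfc(4.5 / math.sqrt(2))   # ~3.4e-06
    phred = -10 * math.log10(p)               # ~54.69, the cutoff used above
    print(p, round(phred, 2))

    # Scatter count: scale factor times callset size, floored at 2 shards
    # (per the comment above, a single shard makes IntervalListTools a noop).
    def scatter_count(num_gvcfs, scale=0.15, override=None):
        unbounded = override if override is not None else round(scale * num_gvcfs)
        return max(unbounded, 2)

    print(scatter_count(50))  # 8 shards for a 50-GVCF callset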
- - call Tasks.CheckSamplesUnique { - input: - sample_name_map = sample_name_map - } - - call Tasks.SplitIntervalList { - input: - interval_list = unpadded_intervals_file, - scatter_count = scatter_count, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - disk_size = small_disk, - sample_names_unique_done = CheckSamplesUnique.samples_unique, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - Array[File] unpadded_intervals = SplitIntervalList.output_intervals - - scatter (idx in range(length(unpadded_intervals))) { - # The batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call Tasks.ImportGVCFs { - input: - sample_name_map = sample_name_map, - interval = unpadded_intervals[idx], - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - workspace_dir_name = "genomicsdb", - disk_size = medium_disk, - batch_size = 50, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (use_gnarly_genotyper) { - - call Tasks.SplitIntervalList as GnarlyIntervalScatterDude { - input: - interval_list = unpadded_intervals[idx], - scatter_count = gnarly_scatter_count, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - disk_size = small_disk, - sample_names_unique_done = CheckSamplesUnique.samples_unique, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - Array[File] gnarly_intervals = GnarlyIntervalScatterDude.output_intervals - - scatter (gnarly_idx in range(length(gnarly_intervals))) { - call Tasks.GnarlyGenotyper { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = gnarly_intervals[gnarly_idx], - output_vcf_filename = callset_name + "." + idx + "." + gnarly_idx + ".vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - preemptible_tries = preemptible_tries - } - } - - Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf - - call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { - input: - input_vcfs = gnarly_gvcfs, - output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", - disk_size = large_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - if (!use_gnarly_genotyper) { - call Tasks.GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = callset_name + "." + idx + ".vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - File genotyped_vcf = select_first([TotallyRadicalGatherVcfs.output_vcf, GenotypeGVCFs.output_vcf]) - File genotyped_vcf_index = select_first([TotallyRadicalGatherVcfs.output_vcf_index, GenotypeGVCFs.output_vcf_index]) - - call Tasks.HardFilterAndMakeSitesOnlyVcf { - input: - vcf = genotyped_vcf, - vcf_index = genotyped_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call Tasks.GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (num_gvcfs > snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call Tasks.GatherTranches as SNPGatherTranches { - input: - tranches = SNPsVariantRecalibratorScattered.tranches, - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - if (num_gvcfs <= snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - #for really large callsets we give to friends, just apply filters to the sites-only - call Tasks.ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - # For large callsets we need to collect metrics from the shards and gather them later. - if (!is_small_callset) { - call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - } - - # For small callsets we can gather the VCF shards and then collect metrics on it. - if (is_small_callset) { - call Tasks.GatherVcfs as FinalGatherVcf { - input: - input_vcfs = ApplyRecalibration.recalibrated_vcf, - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - if (!is_small_callset) { - # For large callsets we still need to gather the sharded metrics. - call Tasks.GatherVariantCallingMetrics { - input: - input_details = select_all(CollectMetricsSharded.detail_metrics_file), - input_summaries = select_all(CollectMetricsSharded.summary_metrics_file), - output_prefix = callset_name, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # CrossCheckFingerprints takes forever on large callsets. - # We scatter over the input GVCFs to make things faster. 
- if (scatter_cross_check_fingerprints) { - call Tasks.GetFingerprintingIntervalIndices { - input: - unpadded_intervals = unpadded_intervals, - haplotype_database = haplotype_database, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint - - scatter (idx in fingerprinting_indices) { - File vcfs_to_fingerprint = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx] - } - - call Tasks.GatherVcfs as GatherFingerprintingVcfs { - input: - input_vcfs = vcfs_to_fingerprint, - output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.SelectFingerprintSiteVariants { - input: - input_vcf = GatherFingerprintingVcfs.output_vcf, - base_output_name = callset_name + ".fingerprinting", - haplotype_database = haplotype_database, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.PartitionSampleNameMap { - input: - sample_name_map = sample_name_map, - line_limit = 1000 - } - - scatter (idx in range(length(PartitionSampleNameMap.partitions))) { - - Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx]) - - call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered { - input: - gvcf_paths = files_in_partition, - vcf_paths = vcfs_to_fingerprint, - sample_name_map = sample_name_map, - haplotype_database = haplotype_database, - output_base_name = callset_name + "." + idx, - scattered = true, - picard_docker = picard_docker, - preemptible_tries = preemptible_tries - } - } - - call Tasks.GatherPicardMetrics as GatherFingerprintingMetrics { - input: - metrics_files = CrossCheckFingerprintsScattered.crosscheck_metrics, - output_file_name = callset_name + ".fingerprintcheck", - disk_size = small_disk - } - } - - if (!scatter_cross_check_fingerprints) { - - scatter (line in sample_name_map_lines) { - File gvcf_paths = line[1] - } - - call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo { - input: - gvcf_paths = gvcf_paths, - vcf_paths = ApplyRecalibration.recalibrated_vcf, - sample_name_map = sample_name_map, - haplotype_database = haplotype_database, - output_base_name = callset_name, - picard_docker = picard_docker, - preemptible_tries = preemptible_tries - } - } - - # Get the metrics from either code path - File output_detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherVariantCallingMetrics.detail_metrics_file]) - File output_summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherVariantCallingMetrics.summary_metrics_file]) - - # Get the VCFs from either code path - Array[File?] output_vcf_files = if defined(FinalGatherVcf.output_vcf) then [FinalGatherVcf.output_vcf] else ApplyRecalibration.recalibrated_vcf - Array[File?] output_vcf_index_files = if defined(FinalGatherVcf.output_vcf_index) then [FinalGatherVcf.output_vcf_index] else ApplyRecalibration.recalibrated_vcf_index - - output { - # Metrics from either the small or large callset - File detail_metrics_file = output_detail_metrics_file - File summary_metrics_file = output_summary_metrics_file - - # Outputs from the small callset path through the wdl. 
- Array[File] output_vcfs = select_all(output_vcf_files) - Array[File] output_vcf_indices = select_all(output_vcf_index_files) - - # Output the interval list generated/used by this run workflow. - Array[File] output_intervals = SplitIntervalList.output_intervals - - # Output the metrics from crosschecking fingerprints. - File crosscheck_fingerprint_check = select_first([CrossCheckFingerprintSolo.crosscheck_metrics, GatherFingerprintingMetrics.gathered_metrics]) - } -} diff --git a/JointGenotyping.hg38.terra.wgs.inputs.json b/JointGenotyping.hg38.terra.wgs.inputs.json deleted file mode 100644 index f74532d..0000000 --- a/JointGenotyping.hg38.terra.wgs.inputs.json +++ /dev/null @@ -1,35 +0,0 @@ -{ -"JointGenotyping.large_disk":"1000", -"JointGenotyping.medium_disk":"200", -"JointGenotyping.indel_recalibration_annotation_values":["FS","ReadPosRankSum","MQRankSum","QD","SOR","DP"], -"JointGenotyping.snp_recalibration_tranche_values":"[100.0,99.95,99.9,99.8,99.6,99.5,99.4,99.3,99.0,98.0,97.0,90.0]", -"JointGenotyping.unbounded_scatter_count_scale_factor":"2.5", -"JointGenotyping.omni_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", -"JointGenotyping.eval_interval_list":"gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", -"JointGenotyping.one_thousand_genomes_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", -"JointGenotyping.one_thousand_genomes_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", -"JointGenotyping.small_disk":"100", -"JointGenotyping.snp_recalibration_annotation_values":"[QD,MQRankSum,ReadPosRankSum,FS,MQ,SOR,DP]", -"JointGenotyping.dbsnp_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", -"JointGenotyping.callset_name":"hg38_1kg_50", -"JointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", -"JointGenotyping.dbsnp_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", -"JointGenotyping.ref_fasta_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", -"JointGenotyping.ref_dict":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", -"JointGenotyping.mills_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", -"JointGenotyping.sample_name_map":"gs://gatk-test-data/1kgp/downsampled_gvcf_hg38/hg38_1kg_50.sample_map", -"JointGenotyping.indel_recalibration_tranche_values":"[100.0,99.95,99.9,99.5,99.0,97.0,96.0,95.0,94.0,93.5,93.0,92.0,91.0,90.0]", -"JointGenotyping.omni_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", -"JointGenotyping.mills_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", -"JointGenotyping.axiomPoly_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", -"JointGenotyping.snp_filter_level":"99.7", -"JointGenotyping.snps_variant_recalibration_threshold":"20000", -"JointGenotyping.haplotype_database":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", 
-"JointGenotyping.hapmap_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", -"JointGenotyping.indel_filter_level":"99", -"JointGenotyping.axiomPoly_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", -"JointGenotyping.ref_fasta":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", -"JointGenotyping.hapmap_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", -"JointGenotyping.SNP_VQSR_downsampleFactor":"10", -"JointGenotyping.huge_disk":"2000" -} diff --git a/JointGenotyping.wdl b/JointGenotyping.wdl index 97a6835..414591c 100644 --- a/JointGenotyping.wdl +++ b/JointGenotyping.wdl @@ -45,9 +45,8 @@ version 1.0 # WORKFLOW DEFINITION -#import "./tasks/JointGenotypingTasks.wdl" as Tasks +import "./tasks/JointGenotypingTasks.wdl" as Tasks -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks.wdl" as Tasks # Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) workflow JointGenotyping { diff --git a/README.md b/README.md index af5dd34..54fd76d 100644 --- a/README.md +++ b/README.md @@ -29,16 +29,12 @@ in human whole-genome sequencing (WGS). The workflow requires a sample map file with 50 or more GVCFs and produces a multisample VCF. *NOTE:* -*- JointGenotyping-terra.wdl is a slightly modified version of the -original workflow to support users interested in running the -workflow on Terra. The changes include variables for dockers and disksize, making -it easier to configure the workflow.* -*- Creating a sample map can be nuisance on Terra, use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) to create one for you.* +*- To create a sample map use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) workflow.* #### Requirements/expectations -- One or more GVCFs produced by HaplotypeCaller in GVCF mode -- Bare minimum 50 samples. Gene panels are not supported. +- A sample map listing 50 or more GVCFs produced by HaplotypeCaller in GVCF mode. +- Gene panels are not supported. ### Outputs - A VCF file and its index, filtered using variant quality score recalibration @@ -47,12 +43,11 @@ it easier to configure the workflow.* in the FILTER field. ### Software version requirements : -- GATK 4.1.4.0 +- GATK 4 - Samtools 1.3.1 - Python 2.7 - Cromwell version support - - Successfully tested on v37 - - Does not work on versions < v23 due to output syntax + - Successfully tested on v50 ### IMPORTANT NOTE : - VQSR wiring. 
The SNP and INDEL models are built in parallel, but then the corresponding diff --git a/tasks/JointGenotypingTasks-terra.wdl b/tasks/JointGenotypingTasks-terra.wdl deleted file mode 100644 index bc7edf1..0000000 --- a/tasks/JointGenotypingTasks-terra.wdl +++ /dev/null @@ -1,1103 +0,0 @@ -version 1.0 - - -task CheckSamplesUnique { - input { - File sample_name_map - Int sample_num_threshold = 50 - } - - command { - set -euo pipefail - if [[ $(cut -f 1 ~{sample_name_map} | wc -l) -ne $(cut -f 1 ~{sample_name_map} | sort | uniq | wc -l) ]] - then - echo "Samples in the sample_name_map are not unique" 1>&2 - exit 1 - elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt ~{sample_num_threshold} ]] - then - echo "There are fewer than ~{sample_num_threshold} samples in the sample_name_map" 1>&2 - echo "Having fewer than ~{sample_num_threshold} samples means there likely isn't enough data to complete joint calling" 1>&2 - exit 1 - else - echo true - fi - } - - output { - Boolean samples_unique = read_boolean(stdout()) - } - - runtime { - memory: "1 GiB" - preemptible: 1 - disks: "local-disk 10 HDD" - docker: "us.gcr.io/broad-gotc-prod/python:2.7" - } -} - -task SplitIntervalList { - - input { - File interval_list - Int scatter_count - File ref_fasta - File ref_fasta_index - File ref_dict - Boolean sample_names_unique_done - Int disk_size - String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - interval_list: { - localization_optional: true - } - } - - command <<< - ~{gatk_path} --java-options -Xms3g SplitIntervals \ - -L ~{interval_list} -O scatterDir -scatter ~{scatter_count} -R ~{ref_fasta} \ - -mode ~{scatter_mode} - >>> - - runtime { - memory: "3.75 GiB" - preemptible: preemptible_tries - disks: "local-disk " + disk_size + " HDD" - docker: gatk_docker - } - - output { - Array[File] output_intervals = glob("scatterDir/*") - } -} - -task ImportGVCFs { - - input { - File sample_name_map - File interval - File ref_fasta - File ref_fasta_index - File ref_dict - - String workspace_dir_name - - Int disk_size - Int batch_size - - # Using a nightly version of GATK containing fixes for GenomicsDB - # https://github.com/broadinstitute/gatk/pull/5899 - String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk-nightly:2019-05-07-4.1.2.0-5-g53d015e4f-NIGHTLY-SNAPSHOT" - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - rm -rf ~{workspace_dir_name} - - # We've seen some GenomicsDB performance regressions related to intervals, so we're going to pretend we only have a single interval - # using the --merge-input-intervals arg - # There's no data in between since we didn't run HaplotypeCaller over those loci so we're not wasting any compute - - # The memory setting here is very important and must be several GiB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. 
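To make the headroom comment above concrete: the task below asks for a 26 GiB machine but gives the JVM only an 8 GiB heap, leaving the remainder for GenomicsDB's native allocations. A sketch of that sizing rule (the overhead figure is an assumption for illustration, not a measured constant):

    def jvm_heap_gib(machine_gib, native_overhead_gib):
        # Heap = machine memory minus headroom reserved for native libraries;
        # never let the heap shrink to nothing.
        heap = machine_gib - native_overhead_gib
        if heap < 1:
            raise ValueError("machine too small for requested native headroom")
        return heap

    print(jvm_heap_gib(26, 18))  # 8 -> matches the -Xms8g used below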
- ~{gatk_path} --java-options -Xms8g \ - GenomicsDBImport \ - --genomicsdb-workspace-path ~{workspace_dir_name} \ - --batch-size ~{batch_size} \ - -L ~{interval} \ - --sample-name-map ~{sample_name_map} \ - --reader-threads 5 \ - --merge-input-intervals \ - --consolidate - - tar -cf ~{workspace_dir_name}.tar ~{workspace_dir_name} - >>> - - runtime { - memory: "26 GiB" - cpu: 4 - disks: "local-disk " + disk_size + " HDD" - docker: gatk_docker - preemptible: preemptible_tries - } - - output { - File output_genomicsdb = "~{workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - - input { - File workspace_tar - File interval - - String output_vcf_filename - - File ref_fasta - File ref_fasta_index - File ref_dict - - String dbsnp_vcf - - Int disk_size - # This is needed for gVCFs generated with GATK3 HaplotypeCaller - Boolean allow_old_rms_mapping_quality_annotation_data = false - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - interval: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - tar -xf ~{workspace_tar} - WORKSPACE=$(basename ~{workspace_tar} .tar) - - ~{gatk_path} --java-options -Xms8g \ - GenotypeGVCFs \ - -R ~{ref_fasta} \ - -O ~{output_vcf_filename} \ - -D ~{dbsnp_vcf} \ - -G StandardAnnotation -G AS_StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ~{interval} \ - ~{true='--allow-old-rms-mapping-quality-annotation-data' false='' allow_old_rms_mapping_quality_annotation_data} \ - --merge-input-intervals - >>> - - runtime { - memory: "26 GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{output_vcf_filename}" - File output_vcf_index = "~{output_vcf_filename}.tbi" - } -} - -task GnarlyGenotyper { - - input { - File workspace_tar - File interval - String output_vcf_filename - File ref_fasta - File ref_fasta_index - File ref_dict - String dbsnp_vcf - - String gatk_docker = "us.gcr.io/broad-gotc-prod/gnarly_genotyper:fixNegativeRefCount" - String preemptible_tries - } - - parameter_meta { - interval: { - localization_optional: true - } - } - - Int disk_size = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3) - - command <<< - set -e - - tar -xf ~{workspace_tar} - WORKSPACE=$( basename ~{workspace_tar} .tar) - - # use a query.json to set some params that aren't exposed -- ewwwww - cat <<EOF > $WORKSPACE/query.json - { - "scan_full": true, - "workspace": "genomicsdb", - "array": "genomicsdb_array", - "vid_mapping_file": "genomicsdb/vidmap.json", - "callset_mapping_file": "genomicsdb/callset.json", - "reference_genome": "/cromwell_root/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "max_diploid_alt_alleles_that_can_be_genotyped": 6, - "produce_GT_field": true - } - EOF - - gatk --java-options -Xms8g \ - GnarlyGenotyper \ - -R ~{ref_fasta} \ - -O ~{output_vcf_filename} \ - --output-database-name annotationDB.vcf.gz \ - -D ~{dbsnp_vcf} \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ~{interval} \ - -stand-call-conf 10 \ - --merge-input-intervals - >>> - - runtime { - memory: "26 GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{output_vcf_filename}" - File output_vcf_index = "~{output_vcf_filename}.tbi" - File output_database =
"annotationDB.vcf.gz" - File output_database_index = "annotationDB.vcf.gz.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - - input { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms3g \ - VariantFiltration \ - --filter-expression "ExcessHet > ~{excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ~{variant_filtered_vcf_filename} \ - -V ~{vcf} - - ~{gatk_path} --java-options -Xms3g \ - MakeSitesOnlyVcf \ - -I ~{variant_filtered_vcf_filename} \ - -O ~{sites_only_vcf_filename} - >>> - - runtime { - memory: "3.75 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File variant_filtered_vcf = "~{variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "~{variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "~{sites_only_vcf_filename}" - File sites_only_vcf_index = "~{sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - - input { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - Boolean use_allele_specific_annotations - Int max_gaussians = 4 - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms24g \ - VariantRecalibrator \ - -V ~{sites_only_variant_filtered_vcf} \ - -O ~{recalibration_filename} \ - --tranches-file ~{tranches_filename} \ - --trust-all-polymorphic \ - -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ - -an ~{sep=' -an ' recalibration_annotation_values} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - -mode INDEL \ - --max-gaussians ~{max_gaussians} \ - -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \ - -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \ - -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf} - >>> - - runtime { - memory: "26 GiB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File recalibration = "~{recalibration_filename}" - File recalibration_index = "~{recalibration_filename}.idx" - File tranches = "~{tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - - input { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - Boolean use_allele_specific_annotations - 
Int max_gaussians = 6 - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms100g \ - VariantRecalibrator \ - -V ~{sites_only_variant_filtered_vcf} \ - -O ~{recalibration_filename} \ - --tranches-file ~{tranches_filename} \ - --trust-all-polymorphic \ - -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ - -an ~{sep=' -an ' recalibration_annotation_values} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - -mode SNP \ - --sample-every-Nth-variant ~{downsampleFactor} \ - --output-model ~{model_report_filename} \ - --max-gaussians ~{max_gaussians} \ - -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ - -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ - -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ - -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} - >>> - - runtime { - memory: "104 GiB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File model_report = "~{model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - - input { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - Boolean use_allele_specific_annotations - Int max_gaussians = 6 - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - Int? 
machine_mem_gb - - } - - Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, - hapmap_resource_vcf, - omni_resource_vcf, - one_thousand_genomes_resource_vcf, - dbsnp_resource_vcf], - "GiB")) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int java_mem = machine_mem - 1 - - - String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else "" - - command <<< - set -euo pipefail - - MODEL_REPORT=~{model_report} - - ~{gatk_path} --java-options -Xms~{java_mem}g \ - VariantRecalibrator \ - -V ~{sites_only_variant_filtered_vcf} \ - -O ~{recalibration_filename} \ - --tranches-file ~{tranches_filename} \ - --trust-all-polymorphic \ - -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ - -an ~{sep=' -an ' recalibration_annotation_values} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - -mode SNP \ - ~{model_report_arg} \ - --max-gaussians ~{max_gaussians} \ - -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ - -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ - -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ - -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} - >>> - - runtime { - memory: "~{machine_mem} GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File recalibration = "~{recalibration_filename}" - File recalibration_index = "~{recalibration_filename}.idx" - File tranches = "~{tranches_filename}" - } -} - -task GatherTranches { - - input { - Array[File] tranches - String output_filename - Int disk_size - String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" - String gatk_path - String preemptible_tries - } - - parameter_meta { - tranches: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - tranches_fofn=~{write_lines(tranches)} - - # Jose says: - # Cromwell will fall over if we have it try to localize tens of thousands of files, - # so we manually localize files using gsutil. - # Using gsutil also lets us parallelize the localization, which (as far as we can tell) - # PAPI doesn't do. 
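The bash below implements that manual localization with a bounded retry loop around gsutil. The same pattern in Python, as an illustrative sketch (localize_with_retries is a hypothetical helper, not part of the task):

    import subprocess

    def localize_with_retries(fofn, dest, retry_limit=5):
        # Resumable batch copy: gsutil -c continues past per-file failures,
        # -I reads the URI list from stdin, -m parallelizes the transfer.
        for attempt in range(retry_limit):
            with open(fofn) as uris:
                result = subprocess.run(
                    ["gsutil", "-m", "cp", "-L", "cp.log", "-c", "-I", dest],
                    stdin=uris)
            if result.returncode == 0:
                return
        raise RuntimeError("Could not copy all the tranches from the cloud")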
- - # This is here to deal with the JES bug where commands may be run twice - rm -rf tranches - mkdir tranches - RETRY_LIMIT=5 - - count=0 - until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the tranches from the cloud' && exit 1 - fi - - cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - - /usr/gitc/gatk --java-options -Xms6g \ - GatherTranches \ - --input inputs.list \ - --output ~{output_filename} - >>> - - runtime { - memory: "7.5 GiB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File tranches = "~{output_filename}" - } -} - -task ApplyRecalibration { - - input { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - Float indel_filter_level - Float snp_filter_level - Boolean use_allele_specific_annotations - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms5g \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ~{input_vcf} \ - --recal-file ~{indels_recalibration} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - --tranches-file ~{indels_tranches} \ - --truth-sensitivity-filter-level ~{indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ~{gatk_path} --java-options -Xms5g \ - ApplyVQSR \ - -O ~{recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ~{snps_recalibration} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - --tranches-file ~{snps_tranches} \ - --truth-sensitivity-filter-level ~{snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - >>> - - runtime { - memory: "7 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File recalibrated_vcf = "~{recalibrated_vcf_filename}" - File recalibrated_vcf_index = "~{recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - - input { - Array[File] input_vcfs - String output_vcf_name - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - input_vcfs: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation. - # This argument disables expensive checks that the file headers contain the same set of - # genotyped samples and that files are in order by position of first record. 
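For intuition, the safety checks that --ignore-safety-checks disables amount to verifying that the shards share a sample set and arrive sorted by the position of their first record. A rough sketch of that invariant (hypothetical helper, not the GATK implementation):

    def shards_are_gatherable(shards):
        # shards: list of (sample_names, first_record_position) per input VCF.
        sample_sets = {frozenset(names) for names, _ in shards}
        positions = [pos for _, pos in shards]
        return len(sample_sets) == 1 and positions == sorted(positions)

    print(shards_are_gatherable([({"s1", "s2"}, 100), ({"s1", "s2"}, 5000)]))  # True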
- ~{gatk_path} --java-options -Xms6g \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input ~{sep=" --input " input_vcfs} \ - --output ~{output_vcf_name} - - tabix ~{output_vcf_name} - >>> - - runtime { - memory: "7 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{output_vcf_name}" - File output_vcf_index = "~{output_vcf_name}.tbi" - } -} - -task SelectFingerprintSiteVariants { - - input { - File input_vcf - File haplotype_database - String base_output_name - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - input_vcf: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - function hdb_to_interval_list() { - input=$1 - awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 - } - - hdb_to_interval_list ~{haplotype_database} > hdb.interval_list - - ~{gatk_path} --java-options -Xms6g \ - SelectVariants \ - --variant ~{input_vcf} \ - --intervals hdb.interval_list \ - --output ~{base_output_name}.vcf.gz - >>> - - runtime { - memory: "7.5 GiB" - cpu: 1 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{base_output_name}.vcf.gz" - File output_vcf_index = "~{base_output_name}.vcf.gz.tbi" - } -} - -task CollectVariantCallingMetrics { - - input { - File input_vcf - File input_vcf_index - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms6g \ - CollectVariantCallingMetrics \ - --INPUT ~{input_vcf} \ - --DBSNP ~{dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ~{ref_dict} \ - --OUTPUT ~{metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ~{interval_list} - >>> - - output { - File detail_metrics_file = "~{metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "~{metrics_filename_prefix}.variant_calling_summary_metrics" - } - - runtime { - memory: "7.5 GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } -} - -task GatherVariantCallingMetrics { - - input { - Array[File] input_details - Array[File] input_summaries - String output_prefix - Int disk_size - String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" - String gatk_path - String preemptible_tries - } - - parameter_meta { - input_details: { - localization_optional: true - } - input_summaries: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - input_details_fofn=~{write_lines(input_details)} - input_summaries_fofn=~{write_lines(input_summaries)} - - # Jose says: - # Cromwell will fall over if we have it try to localize tens of thousands of files, - # so we manually localize files using gsutil. - # Using gsutil also lets us parallelize the localization, which (as far as we can tell) - # PAPI doesn't do. 
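After the copy loop below, the --INPUT arguments are rebuilt by taking each detail-metrics basename and stripping its .variant_calling_detail_metrics suffix, since AccumulateVariantCallingMetrics expects bare prefixes. The same derivation in Python, as an illustrative sketch of what the rev/cut/sed/awk pipeline computes:

    import os

    def accumulate_inputs(detail_paths):
        suffix = ".variant_calling_detail_metrics"
        prefixes = [os.path.basename(p).removesuffix(suffix) for p in detail_paths]
        return " ".join(f"--INPUT metrics/{p}" for p in prefixes)

    print(accumulate_inputs(["gs://bucket/callset.0.variant_calling_detail_metrics"]))
    # --INPUT metrics/callset.0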
- - # This is here to deal with the JES bug where commands may be run twice - rm -rf metrics - - mkdir metrics - RETRY_LIMIT=5 - - count=0 - until cat $input_details_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - count=0 - until cat $input_summaries_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - INPUT=$(cat $input_details_fofn | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("--INPUT metrics/%s ", $1)}') - - /usr/gitc/gatk --java-options -Xms2g \ - AccumulateVariantCallingMetrics \ - $INPUT \ - --OUTPUT ~{output_prefix} - >>> - - runtime { - memory: "3 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File detail_metrics_file = "~{output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "~{output_prefix}.variant_calling_summary_metrics" - } -} - -task CrossCheckFingerprint { - - input { - Array[File] gvcf_paths - Array[File] vcf_paths - File sample_name_map - File haplotype_database - String output_base_name - Boolean scattered = false - Array[String] expected_inconclusive_samples = [] - String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" - String preemptible_tries - } - - parameter_meta { - gvcf_paths: { - localization_optional: true - } - vcf_paths: { - localization_optional: true - } - } - - Int num_gvcfs = length(gvcf_paths) - Int cpu = if num_gvcfs < 32 then num_gvcfs else 32 - # Compute memory to use based on the CPU count, following the pattern of - # 3.75GiB / cpu used by GCP's pricing: https://cloud.google.com/compute/pricing - Int memMb = round(cpu * 3.75 * 1024) - Int disk = 100 - - String output_name = output_base_name + ".fingerprintcheck" - - command <<< - set -eu - - gvcfInputsList=~{write_lines(gvcf_paths)} - vcfInputsList=~{write_lines(vcf_paths)} - - cp $gvcfInputsList gvcf_inputs.list - cp $vcfInputsList vcf_inputs.list - - java -Dpicard.useLegacyParser=false -Xms~{memMb - 512}m \ - -jar /usr/gitc/PicardPublicWithCrosscheckNIOandSampleMapping.jar \ - CrosscheckFingerprints \ - --INPUT gvcf_inputs.list \ - --SECOND_INPUT vcf_inputs.list \ - --HAPLOTYPE_MAP ~{haplotype_database} \ - --INPUT_SAMPLE_FILE_MAP ~{sample_name_map} \ - --CROSSCHECK_BY SAMPLE \ - --CROSSCHECK_MODE CHECK_SAME_SAMPLE \ - --NUM_THREADS ~{cpu} \ - --SKIP_INPUT_READABLITY_TEST \ - ~{true='--EXIT_CODE_WHEN_MISMATCH 0' false='' scattered} \ - --OUTPUT ~{output_name} - - if ~{scattered}; then - # UNEXPECTED_MATCH is not possible with CHECK_SAME_SAMPLE - matches=$(grep "EXPECTED_MATCH" ~{output_name} | wc -l) - - # check inconclusive samples - expectedInconclusiveSamples=("~{sep='" "' expected_inconclusive_samples}") - inconclusiveSamplesCount=0 - inconclusiveSamples=($(grep 'INCONCLUSIVE' ~{output_name} | cut -f 1)) - for sample in ${inconclusiveSamples[@]}; do - if printf '%s\n' ${expectedInconclusiveSamples[@]} | grep -P '^'${sample}'$'; then - inconclusiveSamplesCount=$((inconclusiveSamplesCount+1)) - fi - done - - total_matches=$((inconclusiveSamplesCount + 
matches)) - if [[ ${total_matches} -eq ~{num_gvcfs} ]]; then - >&2 echo "Found the correct number of matches (~{num_gvcfs}) for this shard" - else - >&2 echo "ERROR: Found $total_matches 'EXPECTED_MATCH' records, but expected ~{num_gvcfs}" - exit 1 - fi - fi - >>> - - runtime { - memory: memMb + " MiB" - disks: "local-disk " + disk + " HDD" - preemptible: preemptible_tries - docker: picard_docker - } - - output { - File crosscheck_metrics = output_name - } -} - -task GatherPicardMetrics { - - input { - Array[File] metrics_files - String output_file_name - Int disk_size - } - - command { - # Don't use this task to gather tens of thousands of files. - # Cromwell can't handle it. - - # This cannot gather metrics with histograms - - head -n 7 ~{metrics_files[0]} > ~{output_file_name} - - for metrics_file in ~{sep=' ' metrics_files}; do - sed -n '1,7d;p' $metrics_file | grep -v '^$' >> ~{output_file_name} - done - } - - output { - File gathered_metrics = "~{output_file_name}" - } - - runtime { - cpu: 1 - memory: "3.75 GiB" - preemptible: 1 - disks: "local-disk " + disk_size + " HDD" - docker: "us.gcr.io/broad-gotc-prod/python:2.7" - } -} - -task GetFingerprintingIntervalIndices { - - input { - Array[File] unpadded_intervals - File haplotype_database - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -xeo pipefail - - function rename_intervals(){ - interval_list=$1 - name=$2 - - awk 'BEGIN{FS=IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {$5="'$name'"; print}' $interval_list - } - export -f rename_intervals - - function hdb_to_interval_list(){ - input=$1 - - awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 - } - - function rename_scatter(){ - file=$1 - number=$(echo $file | sed -E 's|([0-9]+)-scattered\.interval.*|\1|') - rename_intervals $file $number > scattered.renamed.$number.interval_list - } - export -f rename_scatter - - # rename the intervals within each interval_list according to the number in the name of the list - - cp ~{sep=' ' unpadded_intervals} ./ - - cat ~{write_lines(unpadded_intervals)} | xargs -n1 basename | xargs -I{} bash -c 'rename_scatter $@' _ {} - - #find the first header - find . -name "scattered.renamed.*.interval_list" | head -n1 | xargs cat | grep '^@' > all.interval_list - - # concatenate the resulting intervals (with no header) - find . 
-name "scattered.renamed.*.interval_list" | xargs cat | grep -v '^@' >> all.interval_list - - # convert the Haplotype_database to an interval_list - hdb_to_interval_list ~{haplotype_database} > hdb.interval_list - - # find the intervals that overlap the haplotype_database - ~{gatk_path} IntervalListTools \ - -ACTION OVERLAPS \ - -O all.sorted.interval_list \ - -I all.interval_list \ - -SI hdb.interval_list - - if grep -v '^@' all.sorted.interval_list; then - grep -v '^@' all.sorted.interval_list | awk '{FS="\t"; print $5}' | uniq > indices.out - else - touch indices.out - fi - >>> - - output { - Array[String] indices_to_fingerprint = read_lines("indices.out") - File all_sorted_interval_list = "all.sorted.interval_list" - File all_interval_list = "all.interval_list" - File hdb_interval_list = "hdb.interval_list" - } - - runtime { - cpu: 2 - memory: "3.75 GiB" - preemptible: preemptible_tries - disks: "local-disk 10 HDD" - docker: gatk_docker - } -} - -task PartitionSampleNameMap { - - input { - File sample_name_map - Int line_limit - } - - command { - - cut -f 2 ~{sample_name_map} > sample_paths - split -l ~{line_limit} -d sample_paths partition_ - - # Let the OS catch up with creation of files for glob command - sleep 1 - } - - output { - Array[File] partitions = glob("partition_*") - } - - runtime { - memory: "1 GiB" - preemptible: 1 - disks: "local-disk 10 HDD" - docker: "us.gcr.io/broad-gotc-prod/python:2.7" - } -}