From 6d70cd35c31165703da77d005ed8c43ec9f46003 Mon Sep 17 00:00:00 2001 From: bshifaw Date: Mon, 22 Jun 2020 16:29:23 -0400 Subject: [PATCH] Dev (#55) * Removed Terra JointGenotyping * minor correction to requirements --- JointGenotyping-terra.wdl | 574 ---------- JointGenotyping.hg38.terra.wgs.inputs.json | 35 - JointGenotyping.wdl | 3 +- README.md | 15 +- tasks/JointGenotypingTasks-terra.wdl | 1103 -------------------- 5 files changed, 6 insertions(+), 1724 deletions(-) delete mode 100644 JointGenotyping-terra.wdl delete mode 100644 JointGenotyping.hg38.terra.wgs.inputs.json delete mode 100644 tasks/JointGenotypingTasks-terra.wdl diff --git a/JointGenotyping-terra.wdl b/JointGenotyping-terra.wdl deleted file mode 100644 index b1d5c90..0000000 --- a/JointGenotyping-terra.wdl +++ /dev/null @@ -1,574 +0,0 @@ -version 1.0 - -## Copyright Broad Institute, 2019 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v47 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. 
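A back-of-the-envelope model of the VQSR wiring note above, as a minimal Python sketch; the stage durations are hypothetical placeholders chosen only to show why applying the INDEL recalibration first shortens wall-clock time:

    # Hypothetical stage durations in hours; only the ordering logic matters.
    snp_model_build = 6.0    # SNP model is slower (many more SNPs than indels)
    indel_model_build = 2.0
    apply_step = 1.0         # time to apply one recalibration to the VCF

    # INDEL recalibration applied first: it runs while the SNP model is
    # still building, so only the SNP apply step waits on the SNP model.
    indel_first = max(snp_model_build, indel_model_build + apply_step) + apply_step

    # SNP recalibration applied first: both apply steps serialize behind
    # the slow SNP model build.
    snp_first = snp_model_build + 2 * apply_step

    print(indel_first, snp_first)  # 7.0 vs 8.0 -> INDEL-first saves wall clock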
- -# WORKFLOW DEFINITION - -#import "./tasks/JointGenotypingTasks-terra.wdl" as Tasks - -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks-terra.wdl" as Tasks - -# Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) -workflow JointGenotyping { - - String pipeline_version = "1.2" - - input { - File unpadded_intervals_file - - String callset_name - File sample_name_map - - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File haplotype_database - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - # Runtime attributes - String gatk_docker = "broadinstitute/gatk:4.1.4.0" - String gatk_path = "/gatk/gatk" - String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" - - Int small_disk = 100 - Int medium_disk = 200 - Int large_disk = 300 - Int huge_disk = 400 - - Int preemptible_tries = 3 - - # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int? top_level_scatter_count - Boolean? gather_vcfs - Int snps_variant_recalibration_threshold = 500000 - Boolean rename_gvcf_samples = true - Float unbounded_scatter_count_scale_factor = 0.15 - Int gnarly_scatter_count = 10 - Boolean use_gnarly_genotyper = false - Boolean use_allele_specific_annotations = true - Boolean cross_check_fingerprints = true - Boolean scatter_cross_check_fingerprints = false - } - - Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations - - Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) - Int num_gvcfs = length(sample_name_map_lines) - - # Make a 2.5:1 interval number to samples in callset ratio interval list. - # We allow overriding the behavior by specifying the desired number of vcfs - # to scatter over for testing / special requests. - # Zamboni notes say "WGS runs get 30x more scattering than Exome" and - # exome scatterCountPerSample is 0.05, min scatter 10, max 1000 - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - # We allow overriding this default behavior for testing / special requests. - Boolean is_small_callset = select_first([gather_vcfs, num_gvcfs <= 1000]) - - Int unbounded_scatter_count = select_first([top_level_scatter_count, round(unbounded_scatter_count_scale_factor * num_gvcfs)]) - Int scatter_count = if unbounded_scatter_count > 2 then unbounded_scatter_count else 2 #I think weird things happen if scatterCount is 1 -- IntervalListTools is noop? 
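Two of the derivations in the comments above can be checked directly. A minimal Python sketch (standard library only) reproducing the ExcessHet cutoff and the interval scatter-count rule; scatter_count here is an illustrative helper, not part of the WDL:

    import math

    # ExcessHet: phred-scale the one-sided p-value for a z-score of -4.5.
    p = 0.5 * math.erfc(4.5 / math.sqrt(2))   # ~3.4e-06
    phred = -10 * math.log10(p)               # ~54.69, the cutoff used above
    print(p, round(phred, 2))

    # Scatter count: scale factor times callset size, floored at 2 shards
    # (per the comment above, a single shard makes IntervalListTools a noop).
    def scatter_count(num_gvcfs, scale=0.15, override=None):
        unbounded = override if override is not None else round(scale * num_gvcfs)
        return max(unbounded, 2)

    print(scatter_count(50))  # 8 shards for a 50-GVCF callset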
- - call Tasks.CheckSamplesUnique { - input: - sample_name_map = sample_name_map - } - - call Tasks.SplitIntervalList { - input: - interval_list = unpadded_intervals_file, - scatter_count = scatter_count, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - disk_size = small_disk, - sample_names_unique_done = CheckSamplesUnique.samples_unique, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - Array[File] unpadded_intervals = SplitIntervalList.output_intervals - - scatter (idx in range(length(unpadded_intervals))) { - # The batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call Tasks.ImportGVCFs { - input: - sample_name_map = sample_name_map, - interval = unpadded_intervals[idx], - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - workspace_dir_name = "genomicsdb", - disk_size = medium_disk, - batch_size = 50, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (use_gnarly_genotyper) { - - call Tasks.SplitIntervalList as GnarlyIntervalScatterDude { - input: - interval_list = unpadded_intervals[idx], - scatter_count = gnarly_scatter_count, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - disk_size = small_disk, - sample_names_unique_done = CheckSamplesUnique.samples_unique, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - Array[File] gnarly_intervals = GnarlyIntervalScatterDude.output_intervals - - scatter (gnarly_idx in range(length(gnarly_intervals))) { - call Tasks.GnarlyGenotyper { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = gnarly_intervals[gnarly_idx], - output_vcf_filename = callset_name + "." + idx + "." + gnarly_idx + ".vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - preemptible_tries = preemptible_tries - } - } - - Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf - - call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { - input: - input_vcfs = gnarly_gvcfs, - output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", - disk_size = large_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - if (!use_gnarly_genotyper) { - call Tasks.GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = callset_name + "." + idx + ".vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - File genotyped_vcf = select_first([TotallyRadicalGatherVcfs.output_vcf, GenotypeGVCFs.output_vcf]) - File genotyped_vcf_index = select_first([TotallyRadicalGatherVcfs.output_vcf_index, GenotypeGVCFs.output_vcf_index]) - - call Tasks.HardFilterAndMakeSitesOnlyVcf { - input: - vcf = genotyped_vcf, - vcf_index = genotyped_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call Tasks.GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (num_gvcfs > snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call Tasks.GatherTranches as SNPGatherTranches { - input: - tranches = SNPsVariantRecalibratorScattered.tranches, - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - if (num_gvcfs <= snps_variant_recalibration_threshold) { - call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = small_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - #for really large callsets we give to friends, just apply filters to the sites-only - call Tasks.ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - use_allele_specific_annotations = allele_specific_annotations, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - # For large callsets we need to collect metrics from the shards and gather them later. - if (!is_small_callset) { - call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - } - - # For small callsets we can gather the VCF shards and then collect metrics on it. - if (is_small_callset) { - call Tasks.GatherVcfs as FinalGatherVcf { - input: - input_vcfs = ApplyRecalibration.recalibrated_vcf, - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - if (!is_small_callset) { - # For large callsets we still need to gather the sharded metrics. - call Tasks.GatherVariantCallingMetrics { - input: - input_details = select_all(CollectMetricsSharded.detail_metrics_file), - input_summaries = select_all(CollectMetricsSharded.summary_metrics_file), - output_prefix = callset_name, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # CrossCheckFingerprints takes forever on large callsets. - # We scatter over the input GVCFs to make things faster. 
- if (scatter_cross_check_fingerprints) { - call Tasks.GetFingerprintingIntervalIndices { - input: - unpadded_intervals = unpadded_intervals, - haplotype_database = haplotype_database, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint - - scatter (idx in fingerprinting_indices) { - File vcfs_to_fingerprint = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx] - } - - call Tasks.GatherVcfs as GatherFingerprintingVcfs { - input: - input_vcfs = vcfs_to_fingerprint, - output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.SelectFingerprintSiteVariants { - input: - input_vcf = GatherFingerprintingVcfs.output_vcf, - base_output_name = callset_name + ".fingerprinting", - haplotype_database = haplotype_database, - disk_size = medium_disk, - gatk_docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call Tasks.PartitionSampleNameMap { - input: - sample_name_map = sample_name_map, - line_limit = 1000 - } - - scatter (idx in range(length(PartitionSampleNameMap.partitions))) { - - Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx]) - - call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered { - input: - gvcf_paths = files_in_partition, - vcf_paths = vcfs_to_fingerprint, - sample_name_map = sample_name_map, - haplotype_database = haplotype_database, - output_base_name = callset_name + "." + idx, - scattered = true, - picard_docker = picard_docker, - preemptible_tries = preemptible_tries - } - } - - call Tasks.GatherPicardMetrics as GatherFingerprintingMetrics { - input: - metrics_files = CrossCheckFingerprintsScattered.crosscheck_metrics, - output_file_name = callset_name + ".fingerprintcheck", - disk_size = small_disk - } - } - - if (!scatter_cross_check_fingerprints) { - - scatter (line in sample_name_map_lines) { - File gvcf_paths = line[1] - } - - call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo { - input: - gvcf_paths = gvcf_paths, - vcf_paths = ApplyRecalibration.recalibrated_vcf, - sample_name_map = sample_name_map, - haplotype_database = haplotype_database, - output_base_name = callset_name, - picard_docker = picard_docker, - preemptible_tries = preemptible_tries - } - } - - # Get the metrics from either code path - File output_detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherVariantCallingMetrics.detail_metrics_file]) - File output_summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherVariantCallingMetrics.summary_metrics_file]) - - # Get the VCFs from either code path - Array[File?] output_vcf_files = if defined(FinalGatherVcf.output_vcf) then [FinalGatherVcf.output_vcf] else ApplyRecalibration.recalibrated_vcf - Array[File?] output_vcf_index_files = if defined(FinalGatherVcf.output_vcf_index) then [FinalGatherVcf.output_vcf_index] else ApplyRecalibration.recalibrated_vcf_index - - output { - # Metrics from either the small or large callset - File detail_metrics_file = output_detail_metrics_file - File summary_metrics_file = output_summary_metrics_file - - # Outputs from the small callset path through the wdl. 
- Array[File] output_vcfs = select_all(output_vcf_files) - Array[File] output_vcf_indices = select_all(output_vcf_index_files) - - # Output the interval list generated/used by this run workflow. - Array[File] output_intervals = SplitIntervalList.output_intervals - - # Output the metrics from crosschecking fingerprints. - File crosscheck_fingerprint_check = select_first([CrossCheckFingerprintSolo.crosscheck_metrics, GatherFingerprintingMetrics.gathered_metrics]) - } -} diff --git a/JointGenotyping.hg38.terra.wgs.inputs.json b/JointGenotyping.hg38.terra.wgs.inputs.json deleted file mode 100644 index f74532d..0000000 --- a/JointGenotyping.hg38.terra.wgs.inputs.json +++ /dev/null @@ -1,35 +0,0 @@ -{ -"JointGenotyping.large_disk":"1000", -"JointGenotyping.medium_disk":"200", -"JointGenotyping.indel_recalibration_annotation_values":["FS","ReadPosRankSum","MQRankSum","QD","SOR","DP"], -"JointGenotyping.snp_recalibration_tranche_values":"[100.0,99.95,99.9,99.8,99.6,99.5,99.4,99.3,99.0,98.0,97.0,90.0]", -"JointGenotyping.unbounded_scatter_count_scale_factor":"2.5", -"JointGenotyping.omni_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", -"JointGenotyping.eval_interval_list":"gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", -"JointGenotyping.one_thousand_genomes_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", -"JointGenotyping.one_thousand_genomes_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", -"JointGenotyping.small_disk":"100", -"JointGenotyping.snp_recalibration_annotation_values":"[QD,MQRankSum,ReadPosRankSum,FS,MQ,SOR,DP]", -"JointGenotyping.dbsnp_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", -"JointGenotyping.callset_name":"hg38_1kg_50", -"JointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", -"JointGenotyping.dbsnp_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", -"JointGenotyping.ref_fasta_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", -"JointGenotyping.ref_dict":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", -"JointGenotyping.mills_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", -"JointGenotyping.sample_name_map":"gs://gatk-test-data/1kgp/downsampled_gvcf_hg38/hg38_1kg_50.sample_map", -"JointGenotyping.indel_recalibration_tranche_values":"[100.0,99.95,99.9,99.5,99.0,97.0,96.0,95.0,94.0,93.5,93.0,92.0,91.0,90.0]", -"JointGenotyping.omni_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", -"JointGenotyping.mills_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", -"JointGenotyping.axiomPoly_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", -"JointGenotyping.snp_filter_level":"99.7", -"JointGenotyping.snps_variant_recalibration_threshold":"20000", -"JointGenotyping.haplotype_database":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", 
-"JointGenotyping.hapmap_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", -"JointGenotyping.indel_filter_level":"99", -"JointGenotyping.axiomPoly_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", -"JointGenotyping.ref_fasta":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", -"JointGenotyping.hapmap_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", -"JointGenotyping.SNP_VQSR_downsampleFactor":"10", -"JointGenotyping.huge_disk":"2000" -} diff --git a/JointGenotyping.wdl b/JointGenotyping.wdl index 97a6835..414591c 100644 --- a/JointGenotyping.wdl +++ b/JointGenotyping.wdl @@ -45,9 +45,8 @@ version 1.0 # WORKFLOW DEFINITION -#import "./tasks/JointGenotypingTasks.wdl" as Tasks +import "./tasks/JointGenotypingTasks.wdl" as Tasks -import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks.wdl" as Tasks # Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) workflow JointGenotyping { diff --git a/README.md b/README.md index af5dd34..54fd76d 100644 --- a/README.md +++ b/README.md @@ -29,16 +29,12 @@ in human whole-genome sequencing (WGS). The workflow requires a sample map file with 50 or more GVCFs and produces a multisample VCF. *NOTE:* -*- JointGenotyping-terra.wdl is a slightly modified version of the -original workflow to support users interested in running the -workflow on Terra. The changes include variables for dockers and disksize, making -it easier to configure the workflow.* -*- Creating a sample map can be nuisance on Terra, use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) to create one for you.* +*- To create a sample map use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) workflow.* #### Requirements/expectations -- One or more GVCFs produced by HaplotypeCaller in GVCF mode -- Bare minimum 50 samples. Gene panels are not supported. +- A sample map listing 50 or more GVCFs produced by HaplotypeCaller in GVCF mode. +- Gene panels are not supported. ### Outputs - A VCF file and its index, filtered using variant quality score recalibration @@ -47,12 +43,11 @@ it easier to configure the workflow.* in the FILTER field. ### Software version requirements : -- GATK 4.1.4.0 +- GATK 4 - Samtools 1.3.1 - Python 2.7 - Cromwell version support - - Successfully tested on v37 - - Does not work on versions < v23 due to output syntax + - Successfully tested on v50 ### IMPORTANT NOTE : - VQSR wiring. 
The SNP and INDEL models are built in parallel, but then the corresponding diff --git a/tasks/JointGenotypingTasks-terra.wdl b/tasks/JointGenotypingTasks-terra.wdl deleted file mode 100644 index bc7edf1..0000000 --- a/tasks/JointGenotypingTasks-terra.wdl +++ /dev/null @@ -1,1103 +0,0 @@ -version 1.0 - - -task CheckSamplesUnique { - input { - File sample_name_map - Int sample_num_threshold = 50 - } - - command { - set -euo pipefail - if [[ $(cut -f 1 ~{sample_name_map} | wc -l) -ne $(cut -f 1 ~{sample_name_map} | sort | uniq | wc -l) ]] - then - echo "Samples in the sample_name_map are not unique" 1>&2 - exit 1 - elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt ~{sample_num_threshold} ]] - then - echo "There are fewer than ~{sample_num_threshold} samples in the sample_name_map" 1>&2 - echo "Having fewer than ~{sample_num_threshold} samples means there likely isn't enough data to complete joint calling" 1>&2 - exit 1 - else - echo true - fi - } - - output { - Boolean samples_unique = read_boolean(stdout()) - } - - runtime { - memory: "1 GiB" - preemptible: 1 - disks: "local-disk 10 HDD" - docker: "us.gcr.io/broad-gotc-prod/python:2.7" - } -} - -task SplitIntervalList { - - input { - File interval_list - Int scatter_count - File ref_fasta - File ref_fasta_index - File ref_dict - Boolean sample_names_unique_done - Int disk_size - String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - interval_list: { - localization_optional: true - } - } - - command <<< - ~{gatk_path} --java-options -Xms3g SplitIntervals \ - -L ~{interval_list} -O scatterDir -scatter ~{scatter_count} -R ~{ref_fasta} \ - -mode ~{scatter_mode} - >>> - - runtime { - memory: "3.75 GiB" - preemptible: preemptible_tries - disks: "local-disk " + disk_size + " HDD" - docker: gatk_docker - } - - output { - Array[File] output_intervals = glob("scatterDir/*") - } -} - -task ImportGVCFs { - - input { - File sample_name_map - File interval - File ref_fasta - File ref_fasta_index - File ref_dict - - String workspace_dir_name - - Int disk_size - Int batch_size - - # Using a nightly version of GATK containing fixes for GenomicsDB - # https://github.com/broadinstitute/gatk/pull/5899 - String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk-nightly:2019-05-07-4.1.2.0-5-g53d015e4f-NIGHTLY-SNAPSHOT" - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - rm -rf ~{workspace_dir_name} - - # We've seen some GenomicsDB performance regressions related to intervals, so we're going to pretend we only have a single interval - # using the --merge-input-intervals arg - # There's no data in between since we didn't run HaplotypeCaller over those loci so we're not wasting any compute - - # The memory setting here is very important and must be several GiB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. 
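To make the headroom comment above concrete: the task below asks for a 26 GiB machine but gives the JVM only an 8 GiB heap, leaving the remainder for GenomicsDB's native allocations. A sketch of that sizing rule (the overhead figure is an assumption for illustration, not a measured constant):

    def jvm_heap_gib(machine_gib, native_overhead_gib):
        # Heap = machine memory minus headroom reserved for native libraries;
        # never let the heap shrink to nothing.
        heap = machine_gib - native_overhead_gib
        if heap < 1:
            raise ValueError("machine too small for requested native headroom")
        return heap

    print(jvm_heap_gib(26, 18))  # 8 -> matches the -Xms8g used below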
- ~{gatk_path} --java-options -Xms8g \ - GenomicsDBImport \ - --genomicsdb-workspace-path ~{workspace_dir_name} \ - --batch-size ~{batch_size} \ - -L ~{interval} \ - --sample-name-map ~{sample_name_map} \ - --reader-threads 5 \ - --merge-input-intervals \ - --consolidate - - tar -cf ~{workspace_dir_name}.tar ~{workspace_dir_name} - >>> - - runtime { - memory: "26 GiB" - cpu: 4 - disks: "local-disk " + disk_size + " HDD" - docker: gatk_docker - preemptible: preemptible_tries - } - - output { - File output_genomicsdb = "~{workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - - input { - File workspace_tar - File interval - - String output_vcf_filename - - File ref_fasta - File ref_fasta_index - File ref_dict - - String dbsnp_vcf - - Int disk_size - # This is needed for gVCFs generated with GATK3 HaplotypeCaller - Boolean allow_old_rms_mapping_quality_annotation_data = false - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - interval: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - tar -xf ~{workspace_tar} - WORKSPACE=$(basename ~{workspace_tar} .tar) - - ~{gatk_path} --java-options -Xms8g \ - GenotypeGVCFs \ - -R ~{ref_fasta} \ - -O ~{output_vcf_filename} \ - -D ~{dbsnp_vcf} \ - -G StandardAnnotation -G AS_StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ~{interval} \ - ~{true='--allow-old-rms-mapping-quality-annotation-data' false='' allow_old_rms_mapping_quality_annotation_data} \ - --merge-input-intervals - >>> - - runtime { - memory: "26 GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{output_vcf_filename}" - File output_vcf_index = "~{output_vcf_filename}.tbi" - } -} - -task GnarlyGenotyper { - - input { - File workspace_tar - File interval - String output_vcf_filename - File ref_fasta - File ref_fasta_index - File ref_dict - String dbsnp_vcf - - String gatk_docker = "us.gcr.io/broad-gotc-prod/gnarly_genotyper:fixNegativeRefCount" - String preemptible_tries - } - - parameter_meta { - interval: { - localization_optional: true - } - } - - Int disk_size = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3) - - command <<< - set -e - - tar -xf ~{workspace_tar} - WORKSPACE=$( basename ~{workspace_tar} .tar) - - # use a query.json to set some params that aren't exposed -- ewwwww - cat <<EOF > $WORKSPACE/query.json - { - "scan_full": true, - "workspace": "genomicsdb", - "array": "genomicsdb_array", - "vid_mapping_file": "genomicsdb/vidmap.json", - "callset_mapping_file": "genomicsdb/callset.json", - "reference_genome": "/cromwell_root/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "max_diploid_alt_alleles_that_can_be_genotyped": 6, - "produce_GT_field": true - } - EOF - - gatk --java-options -Xms8g \ - GnarlyGenotyper \ - -R ~{ref_fasta} \ - -O ~{output_vcf_filename} \ - --output-database-name annotationDB.vcf.gz \ - -D ~{dbsnp_vcf} \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ~{interval} \ - -stand-call-conf 10 \ - --merge-input-intervals - >>> - - runtime { - memory: "26 GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{output_vcf_filename}" - File output_vcf_index = "~{output_vcf_filename}.tbi" - File output_database =
"annotationDB.vcf.gz" - File output_database_index = "annotationDB.vcf.gz.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - - input { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms3g \ - VariantFiltration \ - --filter-expression "ExcessHet > ~{excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ~{variant_filtered_vcf_filename} \ - -V ~{vcf} - - ~{gatk_path} --java-options -Xms3g \ - MakeSitesOnlyVcf \ - -I ~{variant_filtered_vcf_filename} \ - -O ~{sites_only_vcf_filename} - >>> - - runtime { - memory: "3.75 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File variant_filtered_vcf = "~{variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "~{variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "~{sites_only_vcf_filename}" - File sites_only_vcf_index = "~{sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - - input { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - Boolean use_allele_specific_annotations - Int max_gaussians = 4 - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms24g \ - VariantRecalibrator \ - -V ~{sites_only_variant_filtered_vcf} \ - -O ~{recalibration_filename} \ - --tranches-file ~{tranches_filename} \ - --trust-all-polymorphic \ - -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ - -an ~{sep=' -an ' recalibration_annotation_values} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - -mode INDEL \ - --max-gaussians ~{max_gaussians} \ - -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \ - -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \ - -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf} - >>> - - runtime { - memory: "26 GiB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File recalibration = "~{recalibration_filename}" - File recalibration_index = "~{recalibration_filename}.idx" - File tranches = "~{tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - - input { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - Boolean use_allele_specific_annotations - 
Int max_gaussians = 6 - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms100g \ - VariantRecalibrator \ - -V ~{sites_only_variant_filtered_vcf} \ - -O ~{recalibration_filename} \ - --tranches-file ~{tranches_filename} \ - --trust-all-polymorphic \ - -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ - -an ~{sep=' -an ' recalibration_annotation_values} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - -mode SNP \ - --sample-every-Nth-variant ~{downsampleFactor} \ - --output-model ~{model_report_filename} \ - --max-gaussians ~{max_gaussians} \ - -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ - -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ - -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ - -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} - >>> - - runtime { - memory: "104 GiB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File model_report = "~{model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - - input { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - Boolean use_allele_specific_annotations - Int max_gaussians = 6 - - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - Int? 
machine_mem_gb - - } - - Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, - hapmap_resource_vcf, - omni_resource_vcf, - one_thousand_genomes_resource_vcf, - dbsnp_resource_vcf], - "GiB")) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int java_mem = machine_mem - 1 - - - String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else "" - - command <<< - set -euo pipefail - - MODEL_REPORT=~{model_report} - - ~{gatk_path} --java-options -Xms~{java_mem}g \ - VariantRecalibrator \ - -V ~{sites_only_variant_filtered_vcf} \ - -O ~{recalibration_filename} \ - --tranches-file ~{tranches_filename} \ - --trust-all-polymorphic \ - -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ - -an ~{sep=' -an ' recalibration_annotation_values} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - -mode SNP \ - ~{model_report_arg} \ - --max-gaussians ~{max_gaussians} \ - -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ - -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ - -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ - -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} - >>> - - runtime { - memory: "~{machine_mem} GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File recalibration = "~{recalibration_filename}" - File recalibration_index = "~{recalibration_filename}.idx" - File tranches = "~{tranches_filename}" - } -} - -task GatherTranches { - - input { - Array[File] tranches - String output_filename - Int disk_size - String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" - String gatk_path - String preemptible_tries - } - - parameter_meta { - tranches: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - tranches_fofn=~{write_lines(tranches)} - - # Jose says: - # Cromwell will fall over if we have it try to localize tens of thousands of files, - # so we manually localize files using gsutil. - # Using gsutil also lets us parallelize the localization, which (as far as we can tell) - # PAPI doesn't do. 
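The bash below implements that manual localization with a bounded retry loop around gsutil. The same pattern in Python, as an illustrative sketch (localize_with_retries is a hypothetical helper, not part of the task):

    import subprocess

    def localize_with_retries(fofn, dest, retry_limit=5):
        # Resumable batch copy: gsutil -c continues past per-file failures,
        # -I reads the URI list from stdin, -m parallelizes the transfer.
        for attempt in range(retry_limit):
            with open(fofn) as uris:
                result = subprocess.run(
                    ["gsutil", "-m", "cp", "-L", "cp.log", "-c", "-I", dest],
                    stdin=uris)
            if result.returncode == 0:
                return
        raise RuntimeError("Could not copy all the tranches from the cloud")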
- - # This is here to deal with the JES bug where commands may be run twice - rm -rf tranches - mkdir tranches - RETRY_LIMIT=5 - - count=0 - until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the tranches from the cloud' && exit 1 - fi - - cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - - /usr/gitc/gatk --java-options -Xms6g \ - GatherTranches \ - --input inputs.list \ - --output ~{output_filename} - >>> - - runtime { - memory: "7.5 GiB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File tranches = "~{output_filename}" - } -} - -task ApplyRecalibration { - - input { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - Float indel_filter_level - Float snp_filter_level - Boolean use_allele_specific_annotations - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms5g \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ~{input_vcf} \ - --recal-file ~{indels_recalibration} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - --tranches-file ~{indels_tranches} \ - --truth-sensitivity-filter-level ~{indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ~{gatk_path} --java-options -Xms5g \ - ApplyVQSR \ - -O ~{recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ~{snps_recalibration} \ - ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ - --tranches-file ~{snps_tranches} \ - --truth-sensitivity-filter-level ~{snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - >>> - - runtime { - memory: "7 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File recalibrated_vcf = "~{recalibrated_vcf_filename}" - File recalibrated_vcf_index = "~{recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - - input { - Array[File] input_vcfs - String output_vcf_name - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - input_vcfs: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation. - # This argument disables expensive checks that the file headers contain the same set of - # genotyped samples and that files are in order by position of first record. 
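For intuition, the safety checks that --ignore-safety-checks disables amount to verifying that the shards share a sample set and arrive sorted by the position of their first record. A rough sketch of that invariant (hypothetical helper, not the GATK implementation):

    def shards_are_gatherable(shards):
        # shards: list of (sample_names, first_record_position) per input VCF.
        sample_sets = {frozenset(names) for names, _ in shards}
        positions = [pos for _, pos in shards]
        return len(sample_sets) == 1 and positions == sorted(positions)

    print(shards_are_gatherable([({"s1", "s2"}, 100), ({"s1", "s2"}, 5000)]))  # True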
- ~{gatk_path} --java-options -Xms6g \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input ~{sep=" --input " input_vcfs} \ - --output ~{output_vcf_name} - - tabix ~{output_vcf_name} - >>> - - runtime { - memory: "7 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{output_vcf_name}" - File output_vcf_index = "~{output_vcf_name}.tbi" - } -} - -task SelectFingerprintSiteVariants { - - input { - File input_vcf - File haplotype_database - String base_output_name - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - parameter_meta { - input_vcf: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - function hdb_to_interval_list() { - input=$1 - awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 - } - - hdb_to_interval_list ~{haplotype_database} > hdb.interval_list - - ~{gatk_path} --java-options -Xms6g \ - SelectVariants \ - --variant ~{input_vcf} \ - --intervals hdb.interval_list \ - --output ~{base_output_name}.vcf.gz - >>> - - runtime { - memory: "7.5 GiB" - cpu: 1 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File output_vcf = "~{base_output_name}.vcf.gz" - File output_vcf_index = "~{base_output_name}.vcf.gz.tbi" - } -} - -task CollectVariantCallingMetrics { - - input { - File input_vcf - File input_vcf_index - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - Int disk_size - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -euo pipefail - - ~{gatk_path} --java-options -Xms6g \ - CollectVariantCallingMetrics \ - --INPUT ~{input_vcf} \ - --DBSNP ~{dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ~{ref_dict} \ - --OUTPUT ~{metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ~{interval_list} - >>> - - output { - File detail_metrics_file = "~{metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "~{metrics_filename_prefix}.variant_calling_summary_metrics" - } - - runtime { - memory: "7.5 GiB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } -} - -task GatherVariantCallingMetrics { - - input { - Array[File] input_details - Array[File] input_summaries - String output_prefix - Int disk_size - String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" - String gatk_path - String preemptible_tries - } - - parameter_meta { - input_details: { - localization_optional: true - } - input_summaries: { - localization_optional: true - } - } - - command <<< - set -euo pipefail - - input_details_fofn=~{write_lines(input_details)} - input_summaries_fofn=~{write_lines(input_summaries)} - - # Jose says: - # Cromwell will fall over if we have it try to localize tens of thousands of files, - # so we manually localize files using gsutil. - # Using gsutil also lets us parallelize the localization, which (as far as we can tell) - # PAPI doesn't do. 
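After the copy loop below, the --INPUT arguments are rebuilt by taking each detail-metrics basename and stripping its .variant_calling_detail_metrics suffix, since AccumulateVariantCallingMetrics expects bare prefixes. The same derivation in Python, as an illustrative sketch of what the rev/cut/sed/awk pipeline computes:

    import os

    def accumulate_inputs(detail_paths):
        suffix = ".variant_calling_detail_metrics"
        prefixes = [os.path.basename(p).removesuffix(suffix) for p in detail_paths]
        return " ".join(f"--INPUT metrics/{p}" for p in prefixes)

    print(accumulate_inputs(["gs://bucket/callset.0.variant_calling_detail_metrics"]))
    # --INPUT metrics/callset.0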
- - # This is here to deal with the JES bug where commands may be run twice - rm -rf metrics - - mkdir metrics - RETRY_LIMIT=5 - - count=0 - until cat $input_details_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - count=0 - until cat $input_summaries_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - INPUT=$(cat $input_details_fofn | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("--INPUT metrics/%s ", $1)}') - - /usr/gitc/gatk --java-options -Xms2g \ - AccumulateVariantCallingMetrics \ - $INPUT \ - --OUTPUT ~{output_prefix} - >>> - - runtime { - memory: "3 GiB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - docker: gatk_docker - } - - output { - File detail_metrics_file = "~{output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "~{output_prefix}.variant_calling_summary_metrics" - } -} - -task CrossCheckFingerprint { - - input { - Array[File] gvcf_paths - Array[File] vcf_paths - File sample_name_map - File haplotype_database - String output_base_name - Boolean scattered = false - Array[String] expected_inconclusive_samples = [] - String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" - String preemptible_tries - } - - parameter_meta { - gvcf_paths: { - localization_optional: true - } - vcf_paths: { - localization_optional: true - } - } - - Int num_gvcfs = length(gvcf_paths) - Int cpu = if num_gvcfs < 32 then num_gvcfs else 32 - # Compute memory to use based on the CPU count, following the pattern of - # 3.75GiB / cpu used by GCP's pricing: https://cloud.google.com/compute/pricing - Int memMb = round(cpu * 3.75 * 1024) - Int disk = 100 - - String output_name = output_base_name + ".fingerprintcheck" - - command <<< - set -eu - - gvcfInputsList=~{write_lines(gvcf_paths)} - vcfInputsList=~{write_lines(vcf_paths)} - - cp $gvcfInputsList gvcf_inputs.list - cp $vcfInputsList vcf_inputs.list - - java -Dpicard.useLegacyParser=false -Xms~{memMb - 512}m \ - -jar /usr/gitc/PicardPublicWithCrosscheckNIOandSampleMapping.jar \ - CrosscheckFingerprints \ - --INPUT gvcf_inputs.list \ - --SECOND_INPUT vcf_inputs.list \ - --HAPLOTYPE_MAP ~{haplotype_database} \ - --INPUT_SAMPLE_FILE_MAP ~{sample_name_map} \ - --CROSSCHECK_BY SAMPLE \ - --CROSSCHECK_MODE CHECK_SAME_SAMPLE \ - --NUM_THREADS ~{cpu} \ - --SKIP_INPUT_READABLITY_TEST \ - ~{true='--EXIT_CODE_WHEN_MISMATCH 0' false='' scattered} \ - --OUTPUT ~{output_name} - - if ~{scattered}; then - # UNEXPECTED_MATCH is not possible with CHECK_SAME_SAMPLE - matches=$(grep "EXPECTED_MATCH" ~{output_name} | wc -l) - - # check inconclusive samples - expectedInconclusiveSamples=("~{sep='" "' expected_inconclusive_samples}") - inconclusiveSamplesCount=0 - inconclusiveSamples=($(grep 'INCONCLUSIVE' ~{output_name} | cut -f 1)) - for sample in ${inconclusiveSamples[@]}; do - if printf '%s\n' ${expectedInconclusiveSamples[@]} | grep -P '^'${sample}'$'; then - inconclusiveSamplesCount=$((inconclusiveSamplesCount+1)) - fi - done - - total_matches=$((inconclusiveSamplesCount + 
matches)) - if [[ ${total_matches} -eq ~{num_gvcfs} ]]; then - >&2 echo "Found the correct number of matches (~{num_gvcfs}) for this shard" - else - >&2 echo "ERROR: Found $total_matches 'EXPECTED_MATCH' records, but expected ~{num_gvcfs}" - exit 1 - fi - fi - >>> - - runtime { - memory: memMb + " MiB" - disks: "local-disk " + disk + " HDD" - preemptible: preemptible_tries - docker: picard_docker - } - - output { - File crosscheck_metrics = output_name - } -} - -task GatherPicardMetrics { - - input { - Array[File] metrics_files - String output_file_name - Int disk_size - } - - command { - # Don't use this task to gather tens of thousands of files. - # Cromwell can't handle it. - - # This cannot gather metrics with histograms - - head -n 7 ~{metrics_files[0]} > ~{output_file_name} - - for metrics_file in ~{sep=' ' metrics_files}; do - sed -n '1,7d;p' $metrics_file | grep -v '^$' >> ~{output_file_name} - done - } - - output { - File gathered_metrics = "~{output_file_name}" - } - - runtime { - cpu: 1 - memory: "3.75 GiB" - preemptible: 1 - disks: "local-disk " + disk_size + " HDD" - docker: "us.gcr.io/broad-gotc-prod/python:2.7" - } -} - -task GetFingerprintingIntervalIndices { - - input { - Array[File] unpadded_intervals - File haplotype_database - String gatk_docker - String gatk_path - String preemptible_tries - } - - command <<< - set -xeo pipefail - - function rename_intervals(){ - interval_list=$1 - name=$2 - - awk 'BEGIN{FS=IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {$5="'$name'"; print}' $interval_list - } - export -f rename_intervals - - function hdb_to_interval_list(){ - input=$1 - - awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 - } - - function rename_scatter(){ - file=$1 - number=$(echo $file | sed -E 's|([0-9]+)-scattered\.interval.*|\1|') - rename_intervals $file $number > scattered.renamed.$number.interval_list - } - export -f rename_scatter - - # rename the intervals within each interval_list according to the number in the name of the list - - cp ~{sep=' ' unpadded_intervals} ./ - - cat ~{write_lines(unpadded_intervals)} | xargs -n1 basename | xargs -I{} bash -c 'rename_scatter $@' _ {} - - #find the first header - find . -name "scattered.renamed.*.interval_list" | head -n1 | xargs cat | grep '^@' > all.interval_list - - # concatenate the resulting intervals (with no header) - find . 
-name "scattered.renamed.*.interval_list" | xargs cat | grep -v '^@' >> all.interval_list - - # convert the Haplotype_database to an interval_list - hdb_to_interval_list ~{haplotype_database} > hdb.interval_list - - # find the intervals that overlap the haplotype_database - ~{gatk_path} IntervalListTools \ - -ACTION OVERLAPS \ - -O all.sorted.interval_list \ - -I all.interval_list \ - -SI hdb.interval_list - - if grep -v '^@' all.sorted.interval_list; then - grep -v '^@' all.sorted.interval_list | awk '{FS="\t"; print $5}' | uniq > indices.out - else - touch indices.out - fi - >>> - - output { - Array[String] indices_to_fingerprint = read_lines("indices.out") - File all_sorted_interval_list = "all.sorted.interval_list" - File all_interval_list = "all.interval_list" - File hdb_interval_list = "hdb.interval_list" - } - - runtime { - cpu: 2 - memory: "3.75 GiB" - preemptible: preemptible_tries - disks: "local-disk 10 HDD" - docker: gatk_docker - } -} - -task PartitionSampleNameMap { - - input { - File sample_name_map - Int line_limit - } - - command { - - cut -f 2 ~{sample_name_map} > sample_paths - split -l ~{line_limit} -d sample_paths partition_ - - # Let the OS catch up with creation of files for glob command - sleep 1 - } - - output { - Array[File] partitions = glob("partition_*") - } - - runtime { - memory: "1 GiB" - preemptible: 1 - disks: "local-disk 10 HDD" - docker: "us.gcr.io/broad-gotc-prod/python:2.7" - } -}