diff --git a/JointGenotyping-terra.wdl b/JointGenotyping-terra.wdl new file mode 100644 index 0000000..b1d5c90 --- /dev/null +++ b/JointGenotyping-terra.wdl @@ -0,0 +1,574 @@ +version 1.0 + +## Copyright Broad Institute, 2019 +## +## This WDL implements the joint discovery and VQSR filtering portion of the GATK +## Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) and exome sequencing data. +## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. +## +## Outputs : +## - A VCF file and its index, filtered using variant quality score recalibration +## (VQSR) with genotypes for all samples present in the input VCF. All sites that +## are present in the input VCF are retained; filtered sites are annotated as such +## in the FILTER field. +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v47 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. 
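+##
+## To make the wiring concrete, the serial application described above boils
+## down to two ApplyVQSR passes, INDEL mode first. A minimal sketch with
+## placeholder file names and example filter levels (an illustration only,
+## not the exact command line of the ApplyRecalibration task):
+##
+##   gatk ApplyVQSR -V input.vcf.gz --mode INDEL \
+##     --recal-file indels.recal --tranches-file indels.tranches \
+##     --truth-sensitivity-filter-level 99.0 -O indel.recalibrated.vcf.gz
+##   gatk ApplyVQSR -V indel.recalibrated.vcf.gz --mode SNP \
+##     --recal-file snps.recal --tranches-file snps.tranches \
+##     --truth-sensitivity-filter-level 99.7 -O snp.indel.recalibrated.vcf.gz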
+ +# WORKFLOW DEFINITION + +#import "./tasks/JointGenotypingTasks-terra.wdl" as Tasks + +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks-terra.wdl" as Tasks + +# Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) +workflow JointGenotyping { + + String pipeline_version = "1.2" + + input { + File unpadded_intervals_file + + String callset_name + File sample_name_map + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File haplotype_database + + File eval_interval_list + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + # Runtime attributes + String gatk_docker = "broadinstitute/gatk:4.1.4.0" + String gatk_path = "/gatk/gatk" + String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" + + Int small_disk = 100 + Int medium_disk = 200 + Int large_disk = 300 + Int huge_disk = 400 + + Int preemptible_tries = 3 + + # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + Float snp_filter_level + Float indel_filter_level + Int SNP_VQSR_downsampleFactor + + Int? top_level_scatter_count + Boolean? gather_vcfs + Int snps_variant_recalibration_threshold = 500000 + Boolean rename_gvcf_samples = true + Float unbounded_scatter_count_scale_factor = 0.15 + Int gnarly_scatter_count = 10 + Boolean use_gnarly_genotyper = false + Boolean use_allele_specific_annotations = true + Boolean cross_check_fingerprints = true + Boolean scatter_cross_check_fingerprints = false + } + + Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations + + Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) + Int num_gvcfs = length(sample_name_map_lines) + + # Make a 2.5:1 interval number to samples in callset ratio interval list. + # We allow overriding the behavior by specifying the desired number of vcfs + # to scatter over for testing / special requests. + # Zamboni notes say "WGS runs get 30x more scattering than Exome" and + # exome scatterCountPerSample is 0.05, min scatter 10, max 1000 + + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + # We allow overriding this default behavior for testing / special requests. + Boolean is_small_callset = select_first([gather_vcfs, num_gvcfs <= 1000]) + + Int unbounded_scatter_count = select_first([top_level_scatter_count, round(unbounded_scatter_count_scale_factor * num_gvcfs)]) + Int scatter_count = if unbounded_scatter_count > 2 then unbounded_scatter_count else 2 #I think weird things happen if scatterCount is 1 -- IntervalListTools is noop? 
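+  # Worked example of the scatter math above (illustrative numbers, not from
+  # the source): with the default unbounded_scatter_count_scale_factor of 0.15,
+  # a 100-GVCF callset yields round(0.15 * 100) = 15 interval shards, while a
+  # 5-GVCF callset yields round(0.75) = 1 and is raised to the floor of 2.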
+ + call Tasks.CheckSamplesUnique { + input: + sample_name_map = sample_name_map + } + + call Tasks.SplitIntervalList { + input: + interval_list = unpadded_intervals_file, + scatter_count = scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + Array[File] unpadded_intervals = SplitIntervalList.output_intervals + + scatter (idx in range(length(unpadded_intervals))) { + # The batch_size value was carefully chosen here as it + # is the optimal value for the amount of memory allocated + # within the task; please do not change it without consulting + # the Hellbender (GATK engine) team! + call Tasks.ImportGVCFs { + input: + sample_name_map = sample_name_map, + interval = unpadded_intervals[idx], + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + workspace_dir_name = "genomicsdb", + disk_size = medium_disk, + batch_size = 50, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + if (use_gnarly_genotyper) { + + call Tasks.SplitIntervalList as GnarlyIntervalScatterDude { + input: + interval_list = unpadded_intervals[idx], + scatter_count = gnarly_scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + Array[File] gnarly_intervals = GnarlyIntervalScatterDude.output_intervals + + scatter (gnarly_idx in range(length(gnarly_intervals))) { + call Tasks.GnarlyGenotyper { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = gnarly_intervals[gnarly_idx], + output_vcf_filename = callset_name + "." + idx + "." + gnarly_idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + preemptible_tries = preemptible_tries + } + } + + Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf + + call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { + input: + input_vcfs = gnarly_gvcfs, + output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", + disk_size = large_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + if (!use_gnarly_genotyper) { + call Tasks.GenotypeGVCFs { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = unpadded_intervals[idx], + output_vcf_filename = callset_name + "." + idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + File genotyped_vcf = select_first([TotallyRadicalGatherVcfs.output_vcf, GenotypeGVCFs.output_vcf]) + File genotyped_vcf_index = select_first([TotallyRadicalGatherVcfs.output_vcf_index, GenotypeGVCFs.output_vcf_index]) + + call Tasks.HardFilterAndMakeSitesOnlyVcf { + input: + vcf = genotyped_vcf, + vcf_index = genotyped_vcf_index, + excess_het_threshold = excess_het_threshold, + variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", + sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + call Tasks.GatherVcfs as SitesOnlyGatherVcf { + input: + input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, + output_vcf_name = callset_name + ".sites_only.vcf.gz", + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call Tasks.IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".indels.recal", + tranches_filename = callset_name + ".indels.tranches", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + if (num_gvcfs > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + downsampleFactor = SNP_VQSR_downsampleFactor, + model_report_filename = callset_name + ".snps.model.report", + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], + sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], + recalibration_filename = callset_name + ".snps." + idx + ".recal", + tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = callset_name + ".snps.gathered.tranches", + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + if (num_gvcfs <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { + #for really large callsets we give to friends, just apply filters to the sites-only + call Tasks.ApplyRecalibration { + input: + recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", + input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], + input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], + indels_recalibration = IndelsVariantRecalibrator.recalibration, + indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, + indels_tranches = IndelsVariantRecalibrator.tranches, + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + indel_filter_level = indel_filter_level, + snp_filter_level = snp_filter_level, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + # For large callsets we need to collect metrics from the shards and gather them later. + if (!is_small_callset) { + call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { + input: + input_vcf = ApplyRecalibration.recalibrated_vcf, + input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, + metrics_filename_prefix = callset_name + "." + idx, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + } + + # For small callsets we can gather the VCF shards and then collect metrics on it. + if (is_small_callset) { + call Tasks.GatherVcfs as FinalGatherVcf { + input: + input_vcfs = ApplyRecalibration.recalibrated_vcf, + output_vcf_name = callset_name + ".vcf.gz", + disk_size = huge_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call Tasks.CollectVariantCallingMetrics as CollectMetricsOnFullVcf { + input: + input_vcf = FinalGatherVcf.output_vcf, + input_vcf_index = FinalGatherVcf.output_vcf_index, + metrics_filename_prefix = callset_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = large_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + if (!is_small_callset) { + # For large callsets we still need to gather the sharded metrics. + call Tasks.GatherVariantCallingMetrics { + input: + input_details = select_all(CollectMetricsSharded.detail_metrics_file), + input_summaries = select_all(CollectMetricsSharded.summary_metrics_file), + output_prefix = callset_name, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + } + + # CrossCheckFingerprints takes forever on large callsets. + # We scatter over the input GVCFs to make things faster. 
+ if (scatter_cross_check_fingerprints) { + call Tasks.GetFingerprintingIntervalIndices { + input: + unpadded_intervals = unpadded_intervals, + haplotype_database = haplotype_database, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint + + scatter (idx in fingerprinting_indices) { + File vcfs_to_fingerprint = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx] + } + + call Tasks.GatherVcfs as GatherFingerprintingVcfs { + input: + input_vcfs = vcfs_to_fingerprint, + output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz", + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call Tasks.SelectFingerprintSiteVariants { + input: + input_vcf = GatherFingerprintingVcfs.output_vcf, + base_output_name = callset_name + ".fingerprinting", + haplotype_database = haplotype_database, + disk_size = medium_disk, + gatk_docker = gatk_docker, + gatk_path = gatk_path, + preemptible_tries = preemptible_tries + } + + call Tasks.PartitionSampleNameMap { + input: + sample_name_map = sample_name_map, + line_limit = 1000 + } + + scatter (idx in range(length(PartitionSampleNameMap.partitions))) { + + Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx]) + + call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered { + input: + gvcf_paths = files_in_partition, + vcf_paths = vcfs_to_fingerprint, + sample_name_map = sample_name_map, + haplotype_database = haplotype_database, + output_base_name = callset_name + "." + idx, + scattered = true, + picard_docker = picard_docker, + preemptible_tries = preemptible_tries + } + } + + call Tasks.GatherPicardMetrics as GatherFingerprintingMetrics { + input: + metrics_files = CrossCheckFingerprintsScattered.crosscheck_metrics, + output_file_name = callset_name + ".fingerprintcheck", + disk_size = small_disk + } + } + + if (!scatter_cross_check_fingerprints) { + + scatter (line in sample_name_map_lines) { + File gvcf_paths = line[1] + } + + call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo { + input: + gvcf_paths = gvcf_paths, + vcf_paths = ApplyRecalibration.recalibrated_vcf, + sample_name_map = sample_name_map, + haplotype_database = haplotype_database, + output_base_name = callset_name, + picard_docker = picard_docker, + preemptible_tries = preemptible_tries + } + } + + # Get the metrics from either code path + File output_detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherVariantCallingMetrics.detail_metrics_file]) + File output_summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherVariantCallingMetrics.summary_metrics_file]) + + # Get the VCFs from either code path + Array[File?] output_vcf_files = if defined(FinalGatherVcf.output_vcf) then [FinalGatherVcf.output_vcf] else ApplyRecalibration.recalibrated_vcf + Array[File?] output_vcf_index_files = if defined(FinalGatherVcf.output_vcf_index) then [FinalGatherVcf.output_vcf_index] else ApplyRecalibration.recalibrated_vcf_index + + output { + # Metrics from either the small or large callset + File detail_metrics_file = output_detail_metrics_file + File summary_metrics_file = output_summary_metrics_file + + # Outputs from the small callset path through the wdl. 
+    Array[File] output_vcfs = select_all(output_vcf_files)
+    Array[File] output_vcf_indices = select_all(output_vcf_index_files)
+
+    # Output the interval list generated/used by this run of the workflow.
+    Array[File] output_intervals = SplitIntervalList.output_intervals
+
+    # Output the metrics from crosschecking fingerprints.
+    File crosscheck_fingerprint_check = select_first([CrossCheckFingerprintSolo.crosscheck_metrics, GatherFingerprintingMetrics.gathered_metrics])
+  }
+}
diff --git a/JointGenotyping.hg19.wgs.inputs.json b/JointGenotyping.hg19.wgs.inputs.json
new file mode 100644
index 0000000..d1fbc66
--- /dev/null
+++ b/JointGenotyping.hg19.wgs.inputs.json
@@ -0,0 +1,41 @@
+{
+  "JointGenotyping.sample_name_map": "gs://gatk-test-data/1kgp/downsampled_gvcf_hg37/hg37.1kg_50.sample_map",
+  "JointGenotyping.callset_name": "NA12878",
+  "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5,
+  "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION",
+
+  "JointGenotyping.unpadded_intervals_file": "gs://gatk-test-data/intervals/wgs_calling_regions.v1.list",
+  "JointGenotyping.ref_fasta": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta",
+  "JointGenotyping.ref_fasta_index": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.fasta.fai",
+  "JointGenotyping.ref_dict": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.dict",
+  "JointGenotyping.eval_interval_list": "gs://gcp-public-data--broad-references/hg19/v0/wgs_evaluation_regions.v1.interval_list",
+  "JointGenotyping.haplotype_database": "gs://gcp-public-data--broad-references/hg19/v0/Homo_sapiens_assembly19.haplotype_database.txt",
+
+  "JointGenotyping.axiomPoly_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.vcf.gz",
+  "JointGenotyping.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.vcf.gz.tbi",
+  "JointGenotyping.dbsnp_vcf": "gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz",
+  "JointGenotyping.dbsnp_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/dbsnp_138.b37.vcf.gz.tbi",
+  "JointGenotyping.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz",
+  "JointGenotyping.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/hapmap_3.3.b37.vcf.gz.tbi",
+  "JointGenotyping.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.sites.vcf",
+  "JointGenotyping.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/Mills_and_1000G_gold_standard.indels.b37.sites.vcf.idx",
+  "JointGenotyping.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/1000G_omni2.5.b37.vcf.gz",
+  "JointGenotyping.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/1000G_omni2.5.b37.vcf.gz.tbi",
+  "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz",
+  "JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg19/v0/1000G_phase1.snps.high_confidence.b37.vcf.gz.tbi",
+
+  "JointGenotyping.SNP_VQSR_downsampleFactor": 10,
+  "JointGenotyping.snps_variant_recalibration_threshold": 20000,
+  "JointGenotyping.snp_filter_level": 99.7,
+  "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum",
"FS", "MQ", "SOR", "DP"], + "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], + + "JointGenotyping.indel_filter_level": 99.0, + "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], + "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + + "JointGenotyping.small_disk": 100, + "JointGenotyping.medium_disk": 200, + "JointGenotyping.large_disk": 1000, + "JointGenotyping.huge_disk": 2000 +} diff --git a/JointGenotyping.hg38.terra.wgs.inputs.json b/JointGenotyping.hg38.terra.wgs.inputs.json new file mode 100644 index 0000000..1902afa --- /dev/null +++ b/JointGenotyping.hg38.terra.wgs.inputs.json @@ -0,0 +1,35 @@ +{ +"JointGenotyping.large_disk":"1000", +"JointGenotyping.medium_disk":"200", +"JointGenotyping.indel_recalibration_annotation_values":["FS","ReadPosRankSum","MQRankSum","QD","SOR","DP"], +"JointGenotyping.snp_recalibration_tranche_values":"[100.0,99.95,99.9,99.8,99.6,99.5,99.4,99.3,99.0,98.0,97.0,90.0]", +"JointGenotyping.unbounded_scatter_count_scale_factor":"2.5", +"JointGenotyping.omni_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", +"JointGenotyping.eval_interval_list":"gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", +"JointGenotyping.one_thousand_genomes_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", +"JointGenotyping.one_thousand_genomes_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", +"JointGenotyping.small_disk":"100", +"JointGenotyping.snp_recalibration_annotation_values":"[QD,MQRankSum,ReadPosRankSum,FS,MQ,SOR,DP]", +"JointGenotyping.dbsnp_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", +"JointGenotyping.callset_name":"hg38_1kg_50", +"JointGenotyping.unpadded_intervals_file":"gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", +"JointGenotyping.dbsnp_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", +"JointGenotyping.ref_fasta_index":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", +"JointGenotyping.ref_dict":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", +"JointGenotyping.mills_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", +"JointGenotyping.sample_name_map":"gs://gatk-test-data/joint_discovery/1kg_50_hg38/gvcf/hg38_1kg_50.sample_map", +"JointGenotyping.indel_recalibration_tranche_values":"[100.0,99.95,99.9,99.5,99.0,97.0,96.0,95.0,94.0,93.5,93.0,92.0,91.0,90.0]", +"JointGenotyping.omni_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", +"JointGenotyping.mills_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", +"JointGenotyping.axiomPoly_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", +"JointGenotyping.snp_filter_level":"99.7", +"JointGenotyping.snps_variant_recalibration_threshold":"20000", 
+"JointGenotyping.haplotype_database":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", +"JointGenotyping.hapmap_resource_vcf":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", +"JointGenotyping.indel_filter_level":"99", +"JointGenotyping.axiomPoly_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", +"JointGenotyping.ref_fasta":"gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", +"JointGenotyping.hapmap_resource_vcf_index":"gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", +"JointGenotyping.SNP_VQSR_downsampleFactor":"10", +"JointGenotyping.huge_disk":"2000" +} diff --git a/JointGenotyping.hg38.wgs.inputs.json b/JointGenotyping.hg38.wgs.inputs.json new file mode 100644 index 0000000..868cd5c --- /dev/null +++ b/JointGenotyping.hg38.wgs.inputs.json @@ -0,0 +1,41 @@ +{ + "JointGenotyping.sample_name_map": "gs://gatk-test-data/joint_discovery/1kg_50_hg38/gvcf/hg38_1kg_50.sample_map", + "JointGenotyping.callset_name": "hg38_1kg_50", + "JointGenotyping.unbounded_scatter_count_scale_factor": 2.5, + "JointGenotyping.SplitIntervalList.scatter_mode": "INTERVAL_SUBDIVISION", + + "JointGenotyping.unpadded_intervals_file": "gs://gcp-public-data--broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", + "JointGenotyping.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "JointGenotyping.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "JointGenotyping.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "JointGenotyping.eval_interval_list": "gs://gcp-public-data--broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", + "JointGenotyping.haplotype_database": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt", + + "JointGenotyping.axiomPoly_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", + "JointGenotyping.axiomPoly_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", + "JointGenotyping.dbsnp_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", + "JointGenotyping.dbsnp_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", + "JointGenotyping.hapmap_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", + "JointGenotyping.hapmap_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", + "JointGenotyping.mills_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", + "JointGenotyping.mills_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", + "JointGenotyping.omni_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", + "JointGenotyping.omni_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", + "JointGenotyping.one_thousand_genomes_resource_vcf": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", + 
"JointGenotyping.one_thousand_genomes_resource_vcf_index": "gs://gcp-public-data--broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", + + "JointGenotyping.SNP_VQSR_downsampleFactor": 10, + "JointGenotyping.snps_variant_recalibration_threshold": 20000, + "JointGenotyping.snp_filter_level": 99.7, + "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], + "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], + + "JointGenotyping.indel_filter_level": 99.0, + "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], + "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], + + "JointGenotyping.small_disk": 100, + "JointGenotyping.medium_disk": 200, + "JointGenotyping.large_disk": 1000, + "JointGenotyping.huge_disk": 2000 +} diff --git a/JointGenotyping.wdl b/JointGenotyping.wdl new file mode 100644 index 0000000..97a6835 --- /dev/null +++ b/JointGenotyping.wdl @@ -0,0 +1,502 @@ +version 1.0 + +## Copyright Broad Institute, 2019 +## +## This WDL implements the joint discovery and VQSR filtering portion of the GATK +## Best Practices (June 2016) for germline SNP and Indel discovery in human +## whole-genome sequencing (WGS) and exome sequencing data. +## +## Requirements/expectations : +## - One or more GVCFs produced by HaplotypeCaller in GVCF mode +## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. +## +## Outputs : +## - A VCF file and its index, filtered using variant quality score recalibration +## (VQSR) with genotypes for all samples present in the input VCF. All sites that +## are present in the input VCF are retained; filtered sites are annotated as such +## in the FILTER field. +## +## Note about VQSR wiring : +## The SNP and INDEL models are built in parallel, but then the corresponding +## recalibrations are applied in series. Because the INDEL model is generally ready +## first (because there are fewer indels than SNPs) we set INDEL recalibration to +## be applied first to the input VCF, while the SNP model is still being built. By +## the time the SNP model is available, the indel-recalibrated file is available to +## serve as input to apply the SNP recalibration. If we did it the other way around, +## we would have to wait until the SNP recal file was available despite the INDEL +## recal file being there already, then apply SNP recalibration, then apply INDEL +## recalibration. This would lead to a longer wall clock time for complete workflow +## execution. Wiring the INDEL recalibration to be applied first solves the problem. +## +## Cromwell version support +## - Successfully tested on v47 +## - Does not work on versions < v23 due to output syntax +## +## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. +## For program versions, see docker containers. +## +## LICENSING : +## This script is released under the WDL source code license (BSD-3) (see LICENSE in +## https://github.com/broadinstitute/wdl). Note however that the programs it calls may +## be subject to different licenses. Users are responsible for checking that they are +## authorized to run all programs before running this script. 
Please see the docker +## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed +## licensing information pertaining to the included programs. + +# WORKFLOW DEFINITION + +#import "./tasks/JointGenotypingTasks.wdl" as Tasks + +import "https://raw.githubusercontent.com/gatk-workflows/gatk4-germline-snps-indels/2.0.0/tasks/JointGenotypingTasks.wdl" as Tasks + +# Joint Genotyping for hg38 Whole Genomes and Exomes (has not been tested on hg19) +workflow JointGenotyping { + + String pipeline_version = "1.2" + + input { + File unpadded_intervals_file + + String callset_name + File sample_name_map + + File ref_fasta + File ref_fasta_index + File ref_dict + + File dbsnp_vcf + File dbsnp_vcf_index + + Int small_disk + Int medium_disk + Int large_disk + Int huge_disk + + Array[String] snp_recalibration_tranche_values + Array[String] snp_recalibration_annotation_values + Array[String] indel_recalibration_tranche_values + Array[String] indel_recalibration_annotation_values + + File haplotype_database + + File eval_interval_list + File hapmap_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf + File one_thousand_genomes_resource_vcf_index + File mills_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf = dbsnp_vcf + File dbsnp_resource_vcf_index = dbsnp_vcf_index + + # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + Float snp_filter_level + Float indel_filter_level + Int SNP_VQSR_downsampleFactor + + Int? top_level_scatter_count + Boolean? gather_vcfs + Int snps_variant_recalibration_threshold = 500000 + Boolean rename_gvcf_samples = true + Float unbounded_scatter_count_scale_factor = 0.15 + Int gnarly_scatter_count = 10 + Boolean use_gnarly_genotyper = false + Boolean use_allele_specific_annotations = true + Boolean cross_check_fingerprints = true + Boolean scatter_cross_check_fingerprints = false + } + + Boolean allele_specific_annotations = !use_gnarly_genotyper && use_allele_specific_annotations + + Array[Array[String]] sample_name_map_lines = read_tsv(sample_name_map) + Int num_gvcfs = length(sample_name_map_lines) + + # Make a 2.5:1 interval number to samples in callset ratio interval list. + # We allow overriding the behavior by specifying the desired number of vcfs + # to scatter over for testing / special requests. + # Zamboni notes say "WGS runs get 30x more scattering than Exome" and + # exome scatterCountPerSample is 0.05, min scatter 10, max 1000 + + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + # We allow overriding this default behavior for testing / special requests. + Boolean is_small_callset = select_first([gather_vcfs, num_gvcfs <= 1000]) + + Int unbounded_scatter_count = select_first([top_level_scatter_count, round(unbounded_scatter_count_scale_factor * num_gvcfs)]) + Int scatter_count = if unbounded_scatter_count > 2 then unbounded_scatter_count else 2 #I think weird things happen if scatterCount is 1 -- IntervalListTools is noop? 
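+  # Worked example of the callset-size switch above (illustrative, not from the
+  # source): with gather_vcfs left unset, 200 GVCFs give is_small_callset = true,
+  # so the recalibrated shards are gathered into one VCF before metrics are
+  # collected; 5000 GVCFs give false, so the outputs stay sharded and only the
+  # per-shard metrics are gathered at the end.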
+ + call Tasks.CheckSamplesUnique { + input: + sample_name_map = sample_name_map + } + + call Tasks.SplitIntervalList { + input: + interval_list = unpadded_intervals_file, + scatter_count = scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique + } + + Array[File] unpadded_intervals = SplitIntervalList.output_intervals + + scatter (idx in range(length(unpadded_intervals))) { + # The batch_size value was carefully chosen here as it + # is the optimal value for the amount of memory allocated + # within the task; please do not change it without consulting + # the Hellbender (GATK engine) team! + call Tasks.ImportGVCFs { + input: + sample_name_map = sample_name_map, + interval = unpadded_intervals[idx], + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + workspace_dir_name = "genomicsdb", + disk_size = medium_disk, + batch_size = 50 + } + + if (use_gnarly_genotyper) { + + call Tasks.SplitIntervalList as GnarlyIntervalScatterDude { + input: + interval_list = unpadded_intervals[idx], + scatter_count = gnarly_scatter_count, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + disk_size = small_disk, + sample_names_unique_done = CheckSamplesUnique.samples_unique + } + + Array[File] gnarly_intervals = GnarlyIntervalScatterDude.output_intervals + + scatter (gnarly_idx in range(length(gnarly_intervals))) { + call Tasks.GnarlyGenotyper { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = gnarly_intervals[gnarly_idx], + output_vcf_filename = callset_name + "." + idx + "." + gnarly_idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + } + } + + Array[File] gnarly_gvcfs = GnarlyGenotyper.output_vcf + + call Tasks.GatherVcfs as TotallyRadicalGatherVcfs { + input: + input_vcfs = gnarly_gvcfs, + output_vcf_name = callset_name + "." + idx + ".gnarly.vcf.gz", + disk_size = large_disk + } + } + + if (!use_gnarly_genotyper) { + call Tasks.GenotypeGVCFs { + input: + workspace_tar = ImportGVCFs.output_genomicsdb, + interval = unpadded_intervals[idx], + output_vcf_filename = callset_name + "." + idx + ".vcf.gz", + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + disk_size = medium_disk + } + } + + File genotyped_vcf = select_first([TotallyRadicalGatherVcfs.output_vcf, GenotypeGVCFs.output_vcf]) + File genotyped_vcf_index = select_first([TotallyRadicalGatherVcfs.output_vcf_index, GenotypeGVCFs.output_vcf_index]) + + call Tasks.HardFilterAndMakeSitesOnlyVcf { + input: + vcf = genotyped_vcf, + vcf_index = genotyped_vcf_index, + excess_het_threshold = excess_het_threshold, + variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", + sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", + disk_size = medium_disk + } + } + + call Tasks.GatherVcfs as SitesOnlyGatherVcf { + input: + input_vcfs = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, + output_vcf_name = callset_name + ".sites_only.vcf.gz", + disk_size = medium_disk + } + + call Tasks.IndelsVariantRecalibrator { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".indels.recal", + tranches_filename = callset_name + ".indels.tranches", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, + mills_resource_vcf = mills_resource_vcf, + mills_resource_vcf_index = mills_resource_vcf_index, + axiomPoly_resource_vcf = axiomPoly_resource_vcf, + axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk + } + + if (num_gvcfs > snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibratorCreateModel { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + downsampleFactor = SNP_VQSR_downsampleFactor, + model_report_filename = callset_name + ".snps.model.report", + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { + input: + sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], + sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], + recalibration_filename = callset_name + ".snps." + idx + ".recal", + tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + model_report = SNPsVariantRecalibratorCreateModel.model_report, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk + } + } + + call Tasks.GatherTranches as SNPGatherTranches { + input: + tranches = SNPsVariantRecalibratorScattered.tranches, + output_filename = callset_name + ".snps.gathered.tranches", + disk_size = small_disk + } + } + + if (num_gvcfs <= snps_variant_recalibration_threshold) { + call Tasks.SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { + input: + sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, + sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, + recalibration_filename = callset_name + ".snps.recal", + tranches_filename = callset_name + ".snps.tranches", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + hapmap_resource_vcf = hapmap_resource_vcf, + hapmap_resource_vcf_index = hapmap_resource_vcf_index, + omni_resource_vcf = omni_resource_vcf, + omni_resource_vcf_index = omni_resource_vcf_index, + one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, + one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, + dbsnp_resource_vcf = dbsnp_resource_vcf, + dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = small_disk + } + } + + scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { + #for really large callsets we give to friends, just apply filters to the sites-only + call Tasks.ApplyRecalibration { + input: + recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", + input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], + input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], + indels_recalibration = IndelsVariantRecalibrator.recalibration, + indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, + indels_tranches = IndelsVariantRecalibrator.tranches, + snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), + snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), + snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), + indel_filter_level = indel_filter_level, + snp_filter_level = snp_filter_level, + use_allele_specific_annotations = allele_specific_annotations, + disk_size = medium_disk + } + + # For large callsets we need to collect metrics from the shards and gather them later. + if (!is_small_callset) { + call Tasks.CollectVariantCallingMetrics as CollectMetricsSharded { + input: + input_vcf = ApplyRecalibration.recalibrated_vcf, + input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, + metrics_filename_prefix = callset_name + "." + idx, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = medium_disk + } + } + } + + # For small callsets we can gather the VCF shards and then collect metrics on it. + if (is_small_callset) { + call Tasks.GatherVcfs as FinalGatherVcf { + input: + input_vcfs = ApplyRecalibration.recalibrated_vcf, + output_vcf_name = callset_name + ".vcf.gz", + disk_size = huge_disk + } + + call Tasks.CollectVariantCallingMetrics as CollectMetricsOnFullVcf { + input: + input_vcf = FinalGatherVcf.output_vcf, + input_vcf_index = FinalGatherVcf.output_vcf_index, + metrics_filename_prefix = callset_name, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index, + interval_list = eval_interval_list, + ref_dict = ref_dict, + disk_size = large_disk + } + } + + if (!is_small_callset) { + # For large callsets we still need to gather the sharded metrics. + call Tasks.GatherVariantCallingMetrics { + input: + input_details = select_all(CollectMetricsSharded.detail_metrics_file), + input_summaries = select_all(CollectMetricsSharded.summary_metrics_file), + output_prefix = callset_name, + disk_size = medium_disk + } + } + + # CrossCheckFingerprints takes forever on large callsets. + # We scatter over the input GVCFs to make things faster. 
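+  # Illustrative sketch of the scattered path below (assumed numbers): with
+  # line_limit = 1000, a 2500-line sample map would be split into partitions of
+  # 1000/1000/500 samples, CrossCheckFingerprint runs once per partition, and
+  # GatherPicardMetrics merges the per-partition reports at the end.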
+  if (scatter_cross_check_fingerprints) {
+    call Tasks.GetFingerprintingIntervalIndices {
+      input:
+        unpadded_intervals = unpadded_intervals,
+        haplotype_database = haplotype_database
+    }
+
+    Array[Int] fingerprinting_indices = GetFingerprintingIntervalIndices.indices_to_fingerprint
+
+    scatter (idx in fingerprinting_indices) {
+      File vcfs_to_fingerprint = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx]
+    }
+
+    call Tasks.GatherVcfs as GatherFingerprintingVcfs {
+      input:
+        input_vcfs = vcfs_to_fingerprint,
+        output_vcf_name = callset_name + ".gathered.fingerprinting.vcf.gz",
+        disk_size = medium_disk
+    }
+
+    call Tasks.SelectFingerprintSiteVariants {
+      input:
+        input_vcf = GatherFingerprintingVcfs.output_vcf,
+        base_output_name = callset_name + ".fingerprinting",
+        haplotype_database = haplotype_database,
+        disk_size = medium_disk
+    }
+
+    call Tasks.PartitionSampleNameMap {
+      input:
+        sample_name_map = sample_name_map,
+        line_limit = 1000
+    }
+
+    scatter (idx in range(length(PartitionSampleNameMap.partitions))) {
+
+      Array[File] files_in_partition = read_lines(PartitionSampleNameMap.partitions[idx])
+
+      call Tasks.CrossCheckFingerprint as CrossCheckFingerprintsScattered {
+        input:
+          gvcf_paths = files_in_partition,
+          vcf_paths = vcfs_to_fingerprint,
+          sample_name_map = sample_name_map,
+          haplotype_database = haplotype_database,
+          output_base_name = callset_name + "." + idx,
+          scattered = true
+      }
+    }
+
+    call Tasks.GatherPicardMetrics as GatherFingerprintingMetrics {
+      input:
+        metrics_files = CrossCheckFingerprintsScattered.crosscheck_metrics,
+        output_file_name = callset_name + ".fingerprintcheck",
+        disk_size = small_disk
+    }
+  }
+
+  if (!scatter_cross_check_fingerprints) {
+
+    scatter (line in sample_name_map_lines) {
+      File gvcf_paths = line[1]
+    }
+
+    call Tasks.CrossCheckFingerprint as CrossCheckFingerprintSolo {
+      input:
+        gvcf_paths = gvcf_paths,
+        vcf_paths = ApplyRecalibration.recalibrated_vcf,
+        sample_name_map = sample_name_map,
+        haplotype_database = haplotype_database,
+        output_base_name = callset_name
+    }
+  }
+
+  # Get the metrics from either code path
+  File output_detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherVariantCallingMetrics.detail_metrics_file])
+  File output_summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherVariantCallingMetrics.summary_metrics_file])
+
+  # Get the VCFs from either code path
+  Array[File?] output_vcf_files = if defined(FinalGatherVcf.output_vcf) then [FinalGatherVcf.output_vcf] else ApplyRecalibration.recalibrated_vcf
+  Array[File?] output_vcf_index_files = if defined(FinalGatherVcf.output_vcf_index) then [FinalGatherVcf.output_vcf_index] else ApplyRecalibration.recalibrated_vcf_index
+
+  output {
+    # Metrics from either the small or large callset
+    File detail_metrics_file = output_detail_metrics_file
+    File summary_metrics_file = output_summary_metrics_file
+
+    # Outputs from the small callset path through the wdl.
+    Array[File] output_vcfs = select_all(output_vcf_files)
+    Array[File] output_vcf_indices = select_all(output_vcf_index_files)
+
+    # Output the interval list generated/used by this run of the workflow.
+    Array[File] output_intervals = SplitIntervalList.output_intervals
+
+    # Output the metrics from crosschecking fingerprints.
+    File crosscheck_fingerprint_check = select_first([CrossCheckFingerprintSolo.crosscheck_metrics, GatherFingerprintingMetrics.gathered_metrics])
+  }
+}
diff --git a/README.md b/README.md
index 9880957..5357ad8 100644
--- a/README.md
+++ b/README.md
@@ -4,11 +4,16 @@
 Workflows for [germline short variant discovery](https://software.broadinstitute.org/gatk/best-practices/workflow?id=11145) with GATK4.
 
 ### haplotypecaller-gvcf-gatk :
-The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
-from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
-When executed the workflow scatters the HaplotypeCaller tool over a sample
-using an intervals list file. The output file produced will be a
-single gvcf file which can be used by the joint-discovery workflow.
+The haplotypecaller-gvcf-gatk4 workflow runs the GATK4 HaplotypeCaller tool
+in GVCF mode on a single sample according to GATK Best Practices. When
+executed, the workflow scatters the HaplotypeCaller tool over the input BAM
+using an interval list file. The output produced by the workflow is a single GVCF
+file, which can then be provided to the JointGenotyping workflow along with several
+other GVCF files to call variants simultaneously, producing a multisample VCF.
+The haplotypecaller-gvcf-gatk4 workflow's default GVCF mode is useful for calling
+variants on several samples efficiently. However, when calling variants for only one
+or a few samples, the workflow can call variants directly and output a VCF file;
+to do so, set the `make_gvcf` input variable to `false`.
 
 #### Requirements/expectations
 - One analysis-ready BAM file for a single sample (as identified in RG:SM)
@@ -17,22 +22,23 @@ single gvcf file which can be used by the joint-discovery workflow.
 #### Outputs
 - One GVCF file and its index
 
-### joint-discovery-gatk :
+### JointGenotyping.wdl :
 This WDL implements the joint calling and VQSR filtering portion of the GATK
 Best Practices for germline SNP and Indel discovery
-in human whole-genome sequencing (WGS).
+in human whole-genome sequencing (WGS). The workflow requires a sample map
+file with 50 or more GVCFs and produces a multisample VCF.
 
 *NOTE:*
-*- joint-discovery-gatk4-local.wdl is a slightly modified version of the
-original to support users interested in running the workflow locally.*
-*- joint-discovery-gatk4-fc.wdl is a slightly modified version of the
-original to support users interested in running the workflow firecloud with and
-using an array of gvcfs as input.*
+*- JointGenotyping-terra.wdl is a slightly modified version of the
+original workflow to support users interested in running the
+workflow on Terra. The changes include variables for dockers and disk size, making
+it easier to configure the workflow.*
+*- Creating a sample map can be a nuisance on Terra; use the [generate-sample-map](https://portal.firecloud.org/?return=terra#methods/gatk/generate-sample-map/1) method to create one for you.*
 
 #### Requirements/expectations
 - One or more GVCFs produced by HaplotypeCaller in GVCF mode
-- Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported.
+- Bare minimum 50 samples. Gene panels are not supported.
 - When determining disk size in the JSON, use the guideline below (a worked example follows the Outputs section)
   - small_disk = (num_gvcfs / 10) + 10
   - medium_disk = (num_gvcfs * 15) + 10
@@ -45,7 +51,7 @@ using an array of gvcfs as input.*
   in the FILTER field.
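+
+For example, applying the disk-size guideline above to a hypothetical callset of
+100 GVCFs (illustrative numbers, sizes in GB):
+- small_disk = (100 / 10) + 10 = 20
+- medium_disk = (100 * 15) + 10 = 1510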
 ### Software version requirements :
-- GATK 4.1
+- GATK 4.1.4.0
 - Samtools 1.3.1
 - Python 2.7
 - Cromwell version support
@@ -76,6 +82,19 @@ using an array of gvcfs as input.*
 The dynamic scatter interval creation was optimized for genomes.
 The scattered SNP VariantRecalibrator may fail because of too few "bad" variants to build the negative model.
 Also, apologies that the logging for SNP recalibration is overly verbose.
+- No allele subsetting for the JointGenotyping workflow
+  - for large cohorts, even exome callsets can have more than 1000 alleles at low
+    complexity/STR sites
+  - for sites with more than 6 alternate alleles (by default), called genotypes will be returned,
+    but without the PLs, since the PL arrays get enormous
+  - allele-specific filtering could be performed if AS annotations are present,
+    but the data will still be in the VCF in one giant INFO field
+- JointGenotyping output is divided into many shards
+  - desirable for use in [Hail](https://hail.is/), which supports parallel import
+  - It's possible to use [GatherVcfs](https://gatk.broadinstitute.org/hc/en-us/search?utf8=%E2%9C%93&query=GatherVcfs) to combine shards.
+- GnarlyGenotyper uses a QUAL score approximation
+  - dramatically improves performance compared with GenotypeGVCFs, but QUAL output (and thus
+    the QD annotation) may be slightly discordant between the two tools
 - The provided JSON is meant to be a ready-to-use example JSON template of the workflow. It is the user’s responsibility to correctly set the reference and resource input variables using the [GATK Tool and Tutorial Documentations](https://software.broadinstitute.org/gatk/documentation/).
 - Relevant reference and resource bundles can be accessed in [Resource Bundle](https://software.broadinstitute.org/gatk/download/bundle).
 - Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
diff --git a/haplotypecaller-gvcf-gatk4-nio.wdl b/haplotypecaller-gvcf-gatk4-nio.wdl
deleted file mode 100644
index afcbbfb..0000000
--- a/haplotypecaller-gvcf-gatk4-nio.wdl
+++ /dev/null
@@ -1,254 +0,0 @@
-## Copyright Broad Institute, 2019
-##
-## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool
-## from GATK4 in GVCF mode on a single sample according to GATK Best Practices.
-## When executed the workflow scatters the HaplotypeCaller tool over a sample
-## using an intervals list file. The output file produced will be a
-## single gvcf file which can be used by the joint-discovery workflow.
-##
-## Requirements/expectations :
-## - One analysis-ready BAM file for a single sample (as identified in RG:SM)
-## - Set of variant calling intervals lists for the scatter, provided in a file
-##
-## Outputs :
-## - One GVCF file and its index
-##
-## Cromwell version support
-## - Successfully tested on v37
-## - Does not work on versions < v23 due to output syntax
-##
-## Runtime parameters are optimized for Broad's Google Cloud Platform implementation.
-##
-## LICENSING :
-## This script is released under the WDL source code license (BSD-3) (see LICENSE in
-## https://github.com/broadinstitute/wdl). Note however that the programs it calls may
-## be subject to different licenses. Users are responsible for checking that they are
-## authorized to run all programs before running this script. Please see the dockers
-## for detailed licensing information pertaining to the included programs.
- -# WORKFLOW DEFINITION -workflow HaplotypeCallerGvcf_GATK4 { - File input_bam - File input_bam_index - File ref_dict - File ref_fasta - File ref_fasta_index - File scattered_calling_intervals_list - - Boolean? make_gvcf - Boolean making_gvcf = select_first([make_gvcf,true]) - - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - String? gitc_docker_override - String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - String? samtools_path_override - String samtools_path = select_first([samtools_path_override, "samtools"]) - - Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) - - #is the input a cram file? - Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" - - String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") - String vcf_basename = sample_basename - String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" - String output_filename = vcf_basename + output_suffix - - # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. - # If we take the number we are scattering by and reduce by 20 we will have enough disk space - # to account for the fact that the data is quite uneven across the shards. - Int potential_hc_divisor = length(scattered_calling_intervals) - 20 - Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 - - - if ( is_cram ) { - call CramToBamTask { - input: - input_cram = input_bam, - sample_name = sample_basename, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - docker = gitc_docker, - samtools_path = samtools_path - } - } - - # Call variants in parallel over grouped calling intervals - scatter (interval_file in scattered_calling_intervals) { - - # Generate GVCF by interval - call HaplotypeCaller { - input: - input_bam = select_first([CramToBamTask.output_bam, input_bam]), - input_bam_index = select_first([CramToBamTask.output_bai, input_bam_index]), - interval_list = interval_file, - output_filename = output_filename, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - hc_scatter = hc_divisor, - make_gvcf = making_gvcf, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - # Merge per-interval GVCFs - call MergeGVCFs { - input: - input_vcfs = HaplotypeCaller.output_vcf, - input_vcfs_indexes = HaplotypeCaller.output_vcf_index, - output_filename = output_filename, - docker = gatk_docker, - gatk_path = gatk_path - } - - # Outputs that will be retained when execution is complete - output { - File output_vcf = MergeGVCFs.output_vcf - File output_vcf_index = MergeGVCFs.output_vcf_index - } -} - -# TASK DEFINITIONS - -task CramToBamTask { - # Command parameters - File ref_fasta - File ref_fasta_index - File ref_dict - File input_cram - String sample_name - - # Runtime parameters - String docker - Int? machine_mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? 
preemptible_attempts - String samtools_path - - Float output_bam_size = size(input_cram, "GB") / 0.60 - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 - - command { - set -e - set -o pipefail - - ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | - ${samtools_path} view -b -o ${sample_name}.bam - - ${samtools_path} index -b ${sample_name}.bam - mv ${sample_name}.bam.bai ${sample_name}.bai - } - runtime { - docker: docker - memory: select_first([machine_mem_gb, 15]) + " GB" - disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - } - output { - File output_bam = "${sample_name}.bam" - File output_bai = "${sample_name}.bai" - } -} - -# HaplotypeCaller per-sample in GVCF mode -task HaplotypeCaller { - String input_bam - String input_bam_index - File interval_list - String output_filename - File ref_dict - File ref_fasta - File ref_fasta_index - Float? contamination - Boolean make_gvcf - Int hc_scatter - - String gatk_path - String? java_options - String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts - - Int machine_mem_gb = select_first([mem_gb, 7]) - Int command_mem_gb = machine_mem_gb - 1 - - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 - - command <<< - set -e - - ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ - HaplotypeCaller \ - -R ${ref_fasta} \ - -I ${input_bam} \ - -L ${interval_list} \ - -O ${output_filename} \ - -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} - >>> - - runtime { - docker: docker - memory: machine_mem_gb + " GB" - disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - } - - output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" - } -} -# Merge GVCFs generated per-interval for the same sample -task MergeGVCFs { - Array[File] input_vcfs - Array[File] input_vcfs_indexes - String output_filename - - String gatk_path - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? 
preemptible_attempts - - Int machine_mem_gb = select_first([mem_gb, 3]) - Int command_mem_gb = machine_mem_gb - 1 - - command <<< - set -e - - ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ - MergeVcfs \ - --INPUT ${sep=' --INPUT ' input_vcfs} \ - --OUTPUT ${output_filename} - >>> - - runtime { - docker: docker - memory: machine_mem_gb + " GB" - disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" - preemptible: select_first([preemptible_attempts, 3]) - } - - - output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" - } -} - diff --git a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json index 0b5df9a..aedac6d 100644 --- a/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json +++ b/haplotypecaller-gvcf-gatk4.hg38.wgs.inputs.json @@ -1,45 +1,10 @@ { - "##_COMMENT1": "INPUT BAM", - "#HaplotypeCallerGvcf_GATK4.input_bam": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bam", - "#HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://gatk-test-data/wgs_bam/NA12878_24RG_hg38/NA12878_24RG_small.hg38.bai", "HaplotypeCallerGvcf_GATK4.input_bam": "gs://broad-public-datasets/NA12878/NA12878.cram", "HaplotypeCallerGvcf_GATK4.input_bam_index": "gs://broad-public-datasets/NA12878/NA12878.cram.crai", - "##_COMMENT2": "REFERENCE FILES", - "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", + "HaplotypeCallerGvcf_GATK4.ref_dict": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", + "HaplotypeCallerGvcf_GATK4.ref_fasta": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "HaplotypeCallerGvcf_GATK4.ref_fasta_index": "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "##_COMMENT3": "INTERVALS", - "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt", - - "##_COMMENT4": "MISCELLANEOUS PARAMETERS", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.make_gvcf": "True", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.contamination": "Float? (optional)", - - "##_COMMENT5": "DOCKERS", - "#HaplotypeCallerGvcf_GATK4.gatk_docker_override": "String? (optional)", - "#HaplotypeCallerGvcf_GATK4.gitc_docker_override": "String? (optional)", - - "##_COMMENT6": "PATHS", - "#HaplotypeCallerGvcf_GATK4.gatk_path_override": "String? (optional)", - "#HaplotypeCallerGvcf_GATK4.samtools_path_override": "String? (optional)", - - "##_COMMENT7": "JAVA OPTIONS", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.java_options": "String? (optional)", - - "##_COMMENT8": "MEMORY ALLOCATION", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.mem_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.mem_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.CramToBamTask.machine_mem_gb": "Int? (optional)", - - "##_COMMENT9": "DISK SIZE ALLOCATION", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.disk_space_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.disk_space_gb": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.CramToBamTask.disk_space_gb": "Int? 
(optional)", - - "##_COMMENT10": "PREEMPTION", - "#HaplotypeCallerGvcf_GATK4.HaplotypeCaller.preemptible_attempts": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.MergeGVCFs.preemptible_attempts": "Int? (optional)", - "#HaplotypeCallerGvcf_GATK4.CramToBamTask.preemptible_attempts": "Int? (optional)" + "HaplotypeCallerGvcf_GATK4.scattered_calling_intervals_list": "gs://gatk-test-data/intervals/hg38_wgs_scattered_calling_intervals.txt" } diff --git a/haplotypecaller-gvcf-gatk4.wdl b/haplotypecaller-gvcf-gatk4.wdl index 0270187..9d31e3b 100644 --- a/haplotypecaller-gvcf-gatk4.wdl +++ b/haplotypecaller-gvcf-gatk4.wdl @@ -1,3 +1,5 @@ +version 1.0 + ## Copyright Broad Institute, 2019 ## ## The haplotypecaller-gvcf-gatk4 workflow runs the HaplotypeCaller tool @@ -28,45 +30,47 @@ # WORKFLOW DEFINITION workflow HaplotypeCallerGvcf_GATK4 { - File input_bam - File input_bam_index - File ref_dict - File ref_fasta - File ref_fasta_index - File scattered_calling_intervals_list + input { + File input_bam + File input_bam_index + File ref_dict + File ref_fasta + File ref_fasta_index + File scattered_calling_intervals_list - Boolean? make_gvcf - Boolean making_gvcf = select_first([make_gvcf,true]) - - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - String? gitc_docker_override - String gitc_docker = select_first([gitc_docker_override, "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817"]) - String? samtools_path_override - String samtools_path = select_first([samtools_path_override, "samtools"]) - - Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) - - #is the input a cram file? - Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" - - String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") - String vcf_basename = sample_basename - String output_suffix = if making_gvcf then ".g.vcf.gz" else ".vcf.gz" - String output_filename = vcf_basename + output_suffix + Boolean make_gvcf = true + String gatk_docker = "broadinstitute/gatk:4.1.4.0" + String gatk_path = "/gatk/gatk" + String gitc_docker = "broadinstitute/genomes-in-the-cloud:2.3.1-1500064817" + String samtools_path = "samtools" + } + + Array[File] scattered_calling_intervals = read_lines(scattered_calling_intervals_list) + + #is the input a cram file? + Boolean is_cram = sub(basename(input_bam), ".*\\.", "") == "cram" + + String sample_basename = if is_cram then basename(input_bam, ".cram") else basename(input_bam, ".bam") + String vcf_basename = sample_basename + String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_filename = vcf_basename + output_suffix + + # We need disk to localize the sharded input and output due to the scatter for HaplotypeCaller. + # If we take the number we are scattering by and reduce by 20 we will have enough disk space + # to account for the fact that the data is quite uneven across the shards. 
+ Int potential_hc_divisor = length(scattered_calling_intervals) - 20 + Int hc_divisor = if potential_hc_divisor > 1 then potential_hc_divisor else 1 if ( is_cram ) { call CramToBamTask { - input: - input_cram = input_bam, - sample_name = sample_basename, - ref_dict = ref_dict, - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - docker = gitc_docker, - samtools_path = samtools_path + input: + input_cram = input_bam, + sample_name = sample_basename, + ref_dict = ref_dict, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_index, + docker = gitc_docker, + samtools_path = samtools_path } } @@ -83,7 +87,8 @@ workflow HaplotypeCallerGvcf_GATK4 { ref_dict = ref_dict, ref_fasta = ref_fasta, ref_fasta_index = ref_fasta_index, - make_gvcf = making_gvcf, + hc_scatter = hc_divisor, + make_gvcf = make_gvcf, docker = gatk_docker, gatk_path = gatk_path } @@ -109,33 +114,34 @@ workflow HaplotypeCallerGvcf_GATK4 { # TASK DEFINITIONS task CramToBamTask { - # Command parameters - File ref_fasta - File ref_fasta_index - File ref_dict - File input_cram - String sample_name - - # Runtime parameters - String docker - Int? machine_mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts - String samtools_path - - Float output_bam_size = size(input_cram, "GB") / 0.60 - Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 - + input { + # Command parameters + File ref_fasta + File ref_fasta_index + File ref_dict + File input_cram + String sample_name + + # Runtime parameters + String docker + Int? machine_mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? preemptible_attempts + String samtools_path + } + Float output_bam_size = size(input_cram, "GB") / 0.60 + Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") + Int disk_size = ceil(size(input_cram, "GB") + output_bam_size + ref_size) + 20 + command { set -e set -o pipefail - ${samtools_path} view -h -T ${ref_fasta} ${input_cram} | - ${samtools_path} view -b -o ${sample_name}.bam - - ${samtools_path} index -b ${sample_name}.bam - mv ${sample_name}.bam.bai ${sample_name}.bai + ~{samtools_path} view -h -T ~{ref_fasta} ~{input_cram} | + ~{samtools_path} view -b -o ~{sample_name}.bam - + ~{samtools_path} index -b ~{sample_name}.bam + mv ~{sample_name}.bam.bai ~{sample_name}.bai } runtime { docker: docker @@ -144,102 +150,115 @@ task CramToBamTask { preemptible: select_first([preemptible_attempts, 3]) } output { - File output_bam = "${sample_name}.bam" - File output_bai = "${sample_name}.bai" + File output_bam = "~{sample_name}.bam" + File output_bai = "~{sample_name}.bai" } } # HaplotypeCaller per-sample in GVCF mode task HaplotypeCaller { - File input_bam - File input_bam_index - File interval_list - String output_filename - File ref_dict - File ref_fasta - File ref_fasta_index - Float? contamination - Boolean make_gvcf - - String gatk_path - String? java_options - String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts + input { + # Command parameters + File input_bam + File input_bam_index + File interval_list + String output_filename + File ref_dict + File ref_fasta + File ref_fasta_index + Float? contamination + Boolean make_gvcf + Int hc_scatter + + String gatk_path + String? 
java_options + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Boolean use_ssd = false + Int? preemptible_attempts + } + + String java_opt = select_first([java_options, "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10"]) Int machine_mem_gb = select_first([mem_gb, 7]) Int command_mem_gb = machine_mem_gb - 1 Float ref_size = size(ref_fasta, "GB") + size(ref_fasta_index, "GB") + size(ref_dict, "GB") - Int disk_size = ceil(size(input_bam, "GB") + ref_size) + 20 - - command <<< - set -e + Int disk_size = ceil(((size(input_bam, "GB") + 30) / hc_scatter) + ref_size) + 20 + + parameter_meta { + input_bam: { + description: "a bam file", + localization_optional: true + } + input_bam_index: { + description: "an index file for the bam input", + localization_optional: true + } + } + command { + set -e - ${gatk_path} --java-options "-Xmx${command_mem_gb}G ${java_opt}" \ + ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G ~{java_opt}" \ HaplotypeCaller \ - -R ${ref_fasta} \ - -I ${input_bam} \ - -L ${interval_list} \ - -O ${output_filename} \ - -contamination ${default=0 contamination} ${true="-ERC GVCF" false="" make_gvcf} - >>> - + -R ~{ref_fasta} \ + -I ~{input_bam} \ + -L ~{interval_list} \ + -O ~{output_filename} \ + -contamination ~{default=0 contamination} ~{true="-ERC GVCF" false="" make_gvcf} \ + -G StandardAnnotation -G AS_StandardAnnotation -G StandardHCAnnotation + } runtime { docker: docker memory: machine_mem_gb + " GB" disks: "local-disk " + select_first([disk_space_gb, disk_size]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) } - output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" + File output_vcf = "~{output_filename}" + File output_vcf_index = "~{output_filename}.tbi" } } # Merge GVCFs generated per-interval for the same sample task MergeGVCFs { - Array[File] input_vcfs - Array[File] input_vcfs_indexes - String output_filename - - String gatk_path - - # Runtime parameters - String docker - Int? mem_gb - Int? disk_space_gb - Boolean use_ssd = false - Int? preemptible_attempts - - Int machine_mem_gb = select_first([mem_gb, 3]) - Int command_mem_gb = machine_mem_gb - 1 - - command <<< + input { + # Command parameters + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String output_filename + + String gatk_path + + # Runtime parameters + String docker + Int? mem_gb + Int? disk_space_gb + Int? 
preemptible_attempts + } + Boolean use_ssd = false + Int machine_mem_gb = select_first([mem_gb, 3]) + Int command_mem_gb = machine_mem_gb - 1 + + command { set -e - ${gatk_path} --java-options "-Xmx${command_mem_gb}G" \ + ~{gatk_path} --java-options "-Xmx~{command_mem_gb}G" \ MergeVcfs \ - --INPUT ${sep=' --INPUT ' input_vcfs} \ - --OUTPUT ${output_filename} - >>> - + --INPUT ~{sep=' --INPUT ' input_vcfs} \ + --OUTPUT ~{output_filename} + } runtime { docker: docker memory: machine_mem_gb + " GB" disks: "local-disk " + select_first([disk_space_gb, 100]) + if use_ssd then " SSD" else " HDD" preemptible: select_first([preemptible_attempts, 3]) } - - output { - File output_vcf = "${output_filename}" - File output_vcf_index = "${output_filename}.tbi" + File output_vcf = "~{output_filename}" + File output_vcf_index = "~{output_filename}.tbi" } } diff --git a/joint-discovery-gatk4-fc.wdl b/joint-discovery-gatk4-fc.wdl deleted file mode 100644 index 8ac6542..0000000 --- a/joint-discovery-gatk4-fc.wdl +++ /dev/null @@ -1,1026 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## - Note that the sample_names is what the sample will be called in the output, but not necessarily what the sample name is called in its GVCF. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v31 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. 
Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -workflow JointGenotyping { - # Input Sample - String callset_name - - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - - # Reference and Resources - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - File unpadded_intervals_file - # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - String? preemptible_tries_override - Int preemptible_tries = select_first([preemptible_tries_override, "3"]) - - # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) - Int num_gvcfs = length(input_gvcfs) - - # Make a 2.5:1 interval number to samples in callset ratio interval list - Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) - Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 - - call DynamicallyCombineIntervals { - input: - intervals = unpadded_intervals_file, - merge_count = merge_count, - preemptible_tries = preemptible_tries - } - - Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) - - scatter (idx in range(length(unpadded_intervals))) { - # the batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! 
- call ImportGVCFs { - input: - sample_names = sample_names, - input_gvcfs = input_gvcfs, - input_gvcfs_indices = input_gvcfs_indices, - interval = unpadded_intervals[idx], - workspace_dir_name = "genomicsdb", - disk_size = medium_disk, - batch_size = 50, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = "output.vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call HardFilterAndMakeSitesOnlyVcf { - input: - vcf = GenotypeGVCFs.output_vcf, - vcf_index = GenotypeGVCFs.output_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." + idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs_fofn = write_lines(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf), - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (num_gvcfs > 10000) { - call SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = 
preemptible_tries - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." + idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - call GatherTranches as SNPGatherTranches { - input: - input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches), - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - - if (num_gvcfs <= 10000){ - call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - Boolean is_small_callset = num_gvcfs <= 1000 - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - call ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - # for large callsets we need to collect metrics from the shards and gather them later - if (!is_small_callset) { - call CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - } - - # for small callsets we can gather the VCF shards and then collect metrics on it - if (is_small_callset) { - call GatherVcfs as FinalGatherVcf { - input: - input_vcfs_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # for large callsets we still need to gather the sharded metrics - if (!is_small_callset) { - call GatherMetrics { - input: - input_details_fofn = write_lines(select_all(CollectMetricsSharded.detail_metrics_file)), - input_summaries_fofn = write_lines(select_all(CollectMetricsSharded.summary_metrics_file)), - output_prefix = callset_name, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - output { - # outputs from the small callset path through the wdl - File? output_vcf = FinalGatherVcf.output_vcf - File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - - # select metrics from the small callset path and the large callset path - File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) - File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) - - # output the interval list generated/used by this run workflow - File output_intervals = DynamicallyCombineIntervals.output_intervals - } -} - -task GetNumberOfSamples { - File sample_name_map - String docker - Int preemptible_tries - command <<< - wc -l ${sample_name_map} | awk '{print $1}' - >>> - runtime { - docker: docker - memory: "1 GB" - preemptible: preemptible_tries - } - output { - Int sample_count = read_int(stdout()) - } -} - -task ImportGVCFs { - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - String interval - - String workspace_dir_name - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - Int batch_size - - command <<< - set -e - set -o pipefail - - python << CODE - gvcfs = ['${sep="','" input_gvcfs}'] - sample_names = ['${sep="','" sample_names}'] - - if len(gvcfs)!= len(sample_names): - exit(1) - - with open("inputs.list", "w") as fi: - for i in range(len(gvcfs)): - fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") - - CODE - - rm -rf ${workspace_dir_name} - - # The memory setting here is very important and must be several GB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. 
- ${gatk_path} --java-options "-Xmx4g -Xms4g" \ - GenomicsDBImport \ - --genomicsdb-workspace-path ${workspace_dir_name} \ - --batch-size ${batch_size} \ - -L ${interval} \ - --sample-name-map inputs.list \ - --reader-threads 5 \ - -ip 500 - - tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} - - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_genomicsdb = "${workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - File workspace_tar - String interval - - String output_vcf_filename - - String gatk_path - - File ref_fasta - File ref_fasta_index - File ref_dict - - String dbsnp_vcf - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - tar -xf ${workspace_tar} - WORKSPACE=$( basename ${workspace_tar} .tar) - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - GenotypeGVCFs \ - -R ${ref_fasta} \ - -O ${output_vcf_filename} \ - -D ${dbsnp_vcf} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ${interval} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_filename}" - File output_vcf_index = "${output_vcf_filename}.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantFiltration \ - --filter-expression "ExcessHet > ${excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ${variant_filtered_vcf_filename} \ - -V ${vcf} - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - MakeSitesOnlyVcf \ - --INPUT ${variant_filtered_vcf_filename} \ - --OUTPUT ${sites_only_vcf_filename} - - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File variant_filtered_vcf = "${variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "${sites_only_vcf_filename}" - File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx24g -Xms24g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode INDEL \ - --max-gaussians 4 \ - --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ - 
--resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "26 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx100g -Xms100g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - --sample-every-Nth-variant ${downsampleFactor} \ - --output-model ${model_report_filename} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "104 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File model_report = "${model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int? 
machine_mem_gb - Int auto_mem = ceil(2*size(sites_only_variant_filtered_vcf, "GB" )) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: machine_mem + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task GatherTranches { - File input_fofn - String output_filename - - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf tranches - - mkdir tranches - RETRY_LIMIT=5 - - count=0 - until cat ${input_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the tranches from the cloud' && exit 1 - fi - - cat ${input_fofn} | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherTranches \ - --input inputs.list \ - --output ${output_filename} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File tranches = "${output_filename}" - } -} - -task ApplyRecalibration { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - - Float indel_filter_level - Float snp_filter_level - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ${input_vcf} \ - --recal-file ${indels_recalibration} \ - --tranches-file ${indels_tranches} \ - --truth-sensitivity-filter-level ${indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O ${recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ${snps_recalibration} \ - --tranches-file ${snps_tranches} \ - --truth-sensitivity-filter-level ${snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - } - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 
preemptible_tries - } - output { - File recalibrated_vcf = "${recalibrated_vcf_filename}" - File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - File input_vcfs_fofn - String output_vcf_name - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - # Now using NIO to localize the vcfs but the input file must have a ".list" extension - mv ${input_vcfs_fofn} inputs.list - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation. - # This argument disables expensive checks that the file headers contain the same set of - # genotyped samples and that files are in order by position of first record. - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input inputs.list \ - --output ${output_vcf_name} - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - IndexFeatureFile \ - --feature-file ${output_vcf_name} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task CollectVariantCallingMetrics { - File input_vcf - File input_vcf_index - - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - CollectVariantCallingMetrics \ - --INPUT ${input_vcf} \ - --DBSNP ${dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ${ref_dict} \ - --OUTPUT ${metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ${interval_list} - } - output { - File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" - } - runtime { - docker: docker - memory: "7 GB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } -} - -task GatherMetrics { - File input_details_fofn - File input_summaries_fofn - - String output_prefix - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf metrics - - mkdir metrics - RETRY_LIMIT=5 - - count=0 - until cat ${input_details_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - count=0 - until cat ${input_summaries_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - INPUT=`cat ${input_details_fofn} | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("-I=metrics/%s ", $1)}'` - - ${gatk_path} --java-options "-Xmx2g -Xms2g" \ - AccumulateVariantCallingMetrics \ - $INPUT \ - -O ${output_prefix} - >>> - runtime { - docker: docker - memory: "3 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File detail_metrics_file = 
"${output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" - } -} - -task DynamicallyCombineIntervals { - File intervals - Int merge_count - Int preemptible_tries - - command { - python << CODE - def parse_interval(interval): - colon_split = interval.split(":") - chromosome = colon_split[0] - dash_split = colon_split[1].split("-") - start = int(dash_split[0]) - end = int(dash_split[1]) - return chromosome, start, end - - def add_interval(chr, start, end): - lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) - return chr, start, end - - count = 0 - chain_count = ${merge_count} - l_chr, l_start, l_end = "", 0, 0 - lines_to_write = [] - with open("${intervals}") as f: - with open("out.intervals", "w") as f1: - for line in f.readlines(): - # initialization - if count == 0: - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - # reached number to combine, so spit out and start over - if count == chain_count: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - - c_chr, c_start, c_end = parse_interval(line) - # if adjacent keep the chain going - if c_chr == w_chr and c_start == w_end + 1: - w_end = c_end - count += 1 - continue - # not adjacent, end here and start a new chain - else: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - if l_char != w_chr or l_start != w_start or l_end != w_end: - add_interval(w_chr, w_start, w_end) - f1.writelines("\n".join(lines_to_write)) - CODE - } - - runtime { - memory: "3 GB" - preemptible: preemptible_tries - docker: "python:2.7" - } - - output { - File output_intervals = "out.intervals" - } -} - diff --git a/joint-discovery-gatk4-local.hg38.wgs.inputs.json b/joint-discovery-gatk4-local.hg38.wgs.inputs.json deleted file mode 100644 index 2742e5d..0000000 --- a/joint-discovery-gatk4-local.hg38.wgs.inputs.json +++ /dev/null @@ -1,52 +0,0 @@ -{ - "##_COMMENT1": "INPUT GVCFs & COHORT -- DATASET-SPECIFC, MUST BE ADAPTED", - "JointGenotyping.sample_names": ["NA12878"], - "JointGenotyping.input_gvcfs": ["/home/bshifaw/data/joint_discovery/NA12878.g.vcf.gz"], - "JointGenotyping.input_gvcfs_indices": ["/home/bshifaw/data/joint_discovery/NA12878.g.vcf.gz.tbi"], - "JointGenotyping.callset_name": "NA12878", - - "##_COMMENT2": "REFERENCE FILES", - "JointGenotyping.ref_fasta": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", - "JointGenotyping.ref_fasta_index": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta.fai", - "JointGenotyping.ref_dict": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.dict", - - "##_COMMENT3": "INTERVALS", - "JointGenotyping.eval_interval_list": "/home/bshifaw/broad-references/hg38/v0/wgs_evaluation_regions.hg38.interval_list", - "JointGenotyping.unpadded_intervals_file": "/home/bshifaw/broad-references/hg38/v0/hg38.even.handcurated.20k.intervals", - - "##_COMMENT4": "RESOURCE FILES", - "JointGenotyping.dbsnp_vcf": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf", - "JointGenotyping.dbsnp_vcf_index": "/home/bshifaw/broad-references/hg38/v0/Homo_sapiens_assembly38.dbsnp138.vcf.idx", - "JointGenotyping.one_thousand_genomes_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz", - "JointGenotyping.one_thousand_genomes_resource_vcf_index": 
"/home/bshifaw/broad-references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi", - "JointGenotyping.omni_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz", - "JointGenotyping.omni_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi", - "JointGenotyping.mills_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz", - "JointGenotyping.mills_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi", - "JointGenotyping.axiomPoly_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz", - "JointGenotyping.axiomPoly_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi", - "JointGenotyping.hapmap_resource_vcf": "/home/bshifaw/broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz", - "JointGenotyping.hapmap_resource_vcf_index": "/home/bshifaw/broad-references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi", - - "##_COMMENT5": "VQSR PARAMETERS", - "JointGenotyping.SNP_VQSR_downsampleFactor": 10, - "JointGenotyping.snp_filter_level": 99.7, - "JointGenotyping.indel_filter_level": 99.7, - "JointGenotyping.indel_recalibration_annotation_values": ["FS", "ReadPosRankSum", "MQRankSum", "QD", "SOR", "DP"], - "JointGenotyping.indel_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.5", "99.0", "97.0", "96.0", "95.0", "94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], - "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], - - "##_COMMENT4": "DOCKERS", - "#JointGenotyping.gatk_docker_override": "String? (optional)", - - "##_COMMENT5": "PATHS", - "#JointGenotyping.gatk_path_override": "String? (optional)", - - "##_COMMENT8": "DISK SIZE ALLOCATION", - "#JointGenotyping.small_disk_override": "Int? (optional)", - "#JointGenotyping.medium_disk_override": "Int? (optional)", - "#JointGenotyping.large_disk_override": "Int? (optional)", - "#JointGenotyping.huge_disk_override": "Int? (optional)" - -} diff --git a/joint-discovery-gatk4-local.wdl b/joint-discovery-gatk4-local.wdl deleted file mode 100644 index c41d7a6..0000000 --- a/joint-discovery-gatk4-local.wdl +++ /dev/null @@ -1,947 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. 
Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v31 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -workflow JointGenotyping { - # Input Sample - String callset_name - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - - # Reference and Resources - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - File unpadded_intervals_file - - # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) - Int num_gvcfs = length(input_gvcfs) - - # Make a 2.5:1 interval number to samples in callset ratio interval list - Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) - Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 - - call DynamicallyCombineIntervals { - input: - intervals = unpadded_intervals_file, - merge_count = merge_count - } - - Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) - - scatter (idx in range(length(unpadded_intervals))) { - # the batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call ImportGVCFs { - input: - sample_names = sample_names, - interval = unpadded_intervals[idx], - workspace_dir_name = "genomicsdb", - input_gvcfs = input_gvcfs, - input_gvcfs_indices = input_gvcfs_indices, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - batch_size = 50 - } - - call GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = "output.vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - call HardFilterAndMakeSitesOnlyVcf { - input: - vcf = GenotypeGVCFs.output_vcf, - vcf_index = GenotypeGVCFs.output_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - call GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf, - input_vcf_indexes_fofn = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index, - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - call IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - if (num_gvcfs > 10000) { - call SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - call GatherTranches as SNPGatherTranches { - input: - input_fofn = SNPsVariantRecalibratorScattered.tranches, - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - if (num_gvcfs <= 10000){ - call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - Boolean is_small_callset = num_gvcfs <= 1000 - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - call ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - # for large callsets we need to collect metrics from the shards and gather them later - if (!is_small_callset) { - call CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - } - - # for small callsets we can gather the VCF shards and then collect metrics on it - if (is_small_callset) { - call GatherVcfs as FinalGatherVcf { - input: - input_vcfs_fofn = ApplyRecalibration.recalibrated_vcf, - input_vcf_indexes_fofn = ApplyRecalibration.recalibrated_vcf_index, - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - - call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - # for large callsets we still need to gather the sharded metrics - if (!is_small_callset) { - call GatherMetrics { - input: - input_details_fofn = select_all(CollectMetricsSharded.detail_metrics_file), - input_summaries_fofn = select_all(CollectMetricsSharded.summary_metrics_file), - output_prefix = callset_name, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path - } - } - - output { - # outputs from the small callset path through the wdl - File? output_vcf = FinalGatherVcf.output_vcf - File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - - # select metrics from the small callset path and the large callset path - File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) - File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) - - # output the interval list generated/used by this run workflow - File output_intervals = DynamicallyCombineIntervals.output_intervals - } -} - -task GetNumberOfSamples { - File sample_name_map - String docker - command <<< - wc -l ${sample_name_map} | awk '{print $1}' - >>> - runtime { - docker: docker - memory: "1 GB" - preemptible: 5 - } - output { - Int sample_count = read_int(stdout()) - } -} - -task ImportGVCFs { - Array[String] sample_names - Array[File] input_gvcfs - Array[File] input_gvcfs_indices - String interval - - String workspace_dir_name - - String gatk_path - String docker - Int disk_size - Int batch_size - - command <<< - set -e - set -o pipefail - - python << CODE - gvcfs = ['${sep="','" input_gvcfs}'] - sample_names = ['${sep="','" sample_names}'] - - if len(gvcfs)!= len(sample_names): - exit(1) - - with open("inputs.list", "w") as fi: - for i in range(len(gvcfs)): - fi.write(sample_names[i] + "\t" + gvcfs[i] + "\n") - - CODE - - # The memory setting here is very important and must be several GB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. - ${gatk_path} --java-options "-Xmx4g -Xms4g" \ - GenomicsDBImport \ - --genomicsdb-workspace-path ${workspace_dir_name} \ - --batch-size ${batch_size} \ - -L ${interval} \ - --sample-name-map inputs.list \ - --reader-threads 5 \ - -ip 500 - - tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} - - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File output_genomicsdb = "${workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - File workspace_tar - String interval - - String output_vcf_filename - - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - String gatk_path - String docker - Int disk_size - - command <<< - set -e - - tar -xf ${workspace_tar} - WORKSPACE=$( basename ${workspace_tar} .tar) - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - GenotypeGVCFs \ - -R ${ref_fasta} \ - -O ${output_vcf_filename} \ - -D ${dbsnp_vcf} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ${interval} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File output_vcf = "${output_vcf_filename}" - File output_vcf_index = "${output_vcf_filename}.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - String gatk_path - - String docker - Int disk_size - - command { - set -e - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantFiltration \ - --filter-expression "ExcessHet > ${excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ${variant_filtered_vcf_filename} \ - -V ${vcf} - - 
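# Where the 54.69 default above comes from: it is the phred-scaled form of
# the p-value quoted in the workflow comment, -10 * log10(3.4e-06) = 54.69,
# so this VariantFiltration call tags (but keeps) any site whose ExcessHet
# is more extreme than a z-score of -4.5; such sites get "ExcessHet" in the
# FILTER field rather than being removed.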
${gatk_path} --java-options "-Xmx3g -Xms3g" \ - MakeSitesOnlyVcf \ - --INPUT ${variant_filtered_vcf_filename} \ - --OUTPUT ${sites_only_vcf_filename} - - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File variant_filtered_vcf = "${variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "${sites_only_vcf_filename}" - File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx24g -Xms24g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode INDEL \ - --max-gaussians 4 \ - --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ - --resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "26 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx100g -Xms100g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - --sample-every-Nth-variant ${downsampleFactor} \ - --output-model ${model_report_filename} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - 
docker: docker - memory: "104 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File model_report = "${model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - 
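This one task serves both the scattered and the classic SNP recalibration paths, toggled by whether model_report is supplied: in draft-2 WDL as Cromwell evaluates it, a placeholder that concatenates a string with an undefined optional collapses to the empty string, so the extra flags simply disappear in classic mode. A rough Python analogy of that idiom (illustrative only, not the workflow's code):

    # Analogy for ${"--input-model " + model_report + " --output-tranches-for-scatter "}:
    # the whole placeholder renders as "" when model_report is undefined.
    def optional_model_flags(model_report=None):
        if model_report is None:
            return ""  # classic (non-scattered) mode: no extra flags
        return "--input-model %s --output-tranches-for-scatter " % model_report

    assert optional_model_flags() == ""
    assert optional_model_flags("cohort.snps.model.report").startswith("--input-model")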
-task GatherTranches { - Array[File] input_fofn - String output_filename - - String gatk_path - - String docker - Int disk_size - - command <<< - set -e - set -o pipefail - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherTranches \ - --input ${sep=" --input " input_fofn} \ - --output ${output_filename} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File tranches = "${output_filename}" - } -} - -task ApplyRecalibration { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - - Float indel_filter_level - Float snp_filter_level - - String gatk_path - String docker - Int disk_size - - command { - set -e - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ${input_vcf} \ - --recal-file ${indels_recalibration} \ - --tranches-file ${indels_tranches} \ - --truth-sensitivity-filter-level ${indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O ${recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ${snps_recalibration} \ - --tranches-file ${snps_tranches} \ - --truth-sensitivity-filter-level ${snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - } - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File recalibrated_vcf = "${recalibrated_vcf_filename}" - File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - Array[File] input_vcfs_fofn - Array[File] input_vcf_indexes_fofn - String output_vcf_name - - String gatk_path - String docker - Int disk_size - - command <<< - set -e - set -o pipefail - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input ${sep=" --input " input_vcfs_fofn} \ - --output ${output_vcf_name} - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - IndexFeatureFile \ - --feature-file ${output_vcf_name} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task CollectVariantCallingMetrics { - File input_vcf - File input_vcf_index - - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - - String gatk_path - String docker - Int disk_size - - command { - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - CollectVariantCallingMetrics \ - --INPUT ${input_vcf} \ - --DBSNP ${dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ${ref_dict} \ - --OUTPUT ${metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ${interval_list} - } - output { - File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" - } - runtime { - docker: docker - memory: "7 GB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } -} - -task GatherMetrics { - Array[File] input_details_fofn - Array[File] input_summaries_fofn - - String output_prefix - - String gatk_path - String docker - Int disk_size - - command <<< - set -e - set -o pipefail - - ${gatk_path} --java-options "-Xmx2g -Xms2g" \ - AccumulateVariantCallingMetrics \ - --INPUT ${sep=" --INPUT " input_details_fofn} \ - --OUTPUT ${output_prefix} - >>> - runtime { - docker: docker - memory: "3 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 5 - } - output { - File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" - } -} - -task DynamicallyCombineIntervals { - File intervals - Int merge_count - - command { - python << CODE - def parse_interval(interval): - colon_split = interval.split(":") - chromosome = colon_split[0] - dash_split = colon_split[1].split("-") - start = int(dash_split[0]) - end = int(dash_split[1]) - return chromosome, start, end - - def add_interval(chr, start, end): - lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) - return chr, start, end - - count = 0 - chain_count = ${merge_count} - l_chr, l_start, l_end = "", 0, 0 - lines_to_write = [] - with open("${intervals}") as f: - with open("out.intervals", "w") as f1: - for line in f.readlines(): - # initialization - if count == 0: - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - # reached number to combine, so spit out and start over - if count == chain_count: - l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - - c_chr, c_start, c_end 
= parse_interval(line) - # if adjacent keep the chain going - if c_chr == w_chr and c_start == w_end + 1: - w_end = c_end - count += 1 - continue - # not adjacent, end here and start a new chain - else: - l_chr, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - if l_chr != w_chr or l_start != w_start or l_end != w_end: - add_interval(w_chr, w_start, w_end) - f1.writelines("\n".join(lines_to_write)) - CODE - } - - runtime { - memory: "3 GB" - preemptible: 5 - docker: "python:2.7" - } - - output { - File output_intervals = "out.intervals" - } -}
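The merge_count consumed by the task above is derived in the workflow body from a 2.5:1 interval-to-sample target ratio. A worked example of that arithmetic, with made-up numbers (20,000 source intervals and 1,000 GVCFs are assumptions for illustration):

    # Worked example of the merge-count heuristic in the workflow body.
    import math

    num_of_original_intervals = 20000  # e.g. a 20k hand-curated intervals file
    num_gvcfs = 1000
    possible_merge_count = int(math.floor(num_of_original_intervals / num_gvcfs / 2.5))
    merge_count = possible_merge_count if possible_merge_count > 1 else 1
    print(merge_count)  # 8: at most 8 source intervals per combined interval,
                        # i.e. no fewer than 2500 combined intervals for 1000
                        # samples, which is the 2.5:1 target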
"94.0", "93.5", "93.0", "92.0", "91.0", "90.0"], - "JointGenotyping.snp_recalibration_tranche_values": ["100.0", "99.95", "99.9", "99.8", "99.6", "99.5", "99.4", "99.3", "99.0", "98.0", "97.0", "90.0" ], - "JointGenotyping.snp_recalibration_annotation_values": ["QD", "MQRankSum", "ReadPosRankSum", "FS", "MQ", "SOR", "DP"], - - "##_COMMENT4": "DOCKERS", - "#JointGenotyping.gatk_docker_override": "String? (optional)", - - "##_COMMENT5": "PATHS", - "#JointGenotyping.gatk_path_override": "String? (optional)", - - "##_COMMENT6": "JAVA OPTIONS", - "JointGenotyping.SNPsVariantRecalibratorScattered.java_opt": "-Xmx3g -Xms3g", - "JointGenotyping.CollectMetricsOnFullVcf.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.IndelsVariantRecalibrator.java_opt": "-Xmx24g -Xms24g", - "JointGenotyping.HardFilterAndMakeSitesOnlyVcf.java_opt": "-Xmx3g -Xms3g", - "JointGenotyping.SNPGatherTranches.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.CollectMetricsSharded.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.SitesOnlyGatherVcf.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.ApplyRecalibration.java_opt": "-Xmx5g -Xms5g", - "JointGenotyping.FinalGatherVcf.java_opt": "-Xmx6g -Xms6g", - "JointGenotyping.ImportGVCFs.java_opt": "-Xmx4g -Xms4g", - "JointGenotyping.SNPsVariantRecalibratorCreateModel.java_opt": "-Xmx100g -Xms100g", - "JointGenotyping.GatherMetrics.java_opt": "-Xmx2g -Xms2g", - "JointGenotyping.GenotypeGVCFs.java_opt": "-Xmx5g -Xms5g", - - "##_COMMENT7": "MEMORY ALLOCATION", - "JointGenotyping.CollectMetricsSharded.mem_size": "7 GB", - "JointGenotyping.ImportGVCFs.mem_size": "7 GB", - "JointGenotyping.IndelsVariantRecalibrator.mem_size": "26 GB", - "JointGenotyping.ApplyRecalibration.mem_size": "7 GB", - "JointGenotyping.CollectMetricsOnFullVcf.mem_size": "7 GB", - "JointGenotyping.GenotypeGVCFs.mem_size": "7 GB", - "JointGenotyping.FinalGatherVcf.mem_size": "7 GB", - "JointGenotyping.SitesOnlyGatherVcf.mem_size": "7 GB", - "JointGenotyping.SNPsVariantRecalibratorScattered.mem_size": "3.5 GB", - "JointGenotyping.SNPsVariantRecalibratorCreateModel.mem_size": "104 GB", - "JointGenotyping.DynamicallyCombineIntervals.mem_size": "3 GB", - "JointGenotyping.GatherMetrics.mem_size": "3 GB", - "JointGenotyping.HardFilterAndMakeSitesOnlyVcf.mem_size": "3.5 GB", - "JointGenotyping.SNPGatherTranches.mem_size": "7 GB", - - "##_COMMENT8": "DISK SIZE ALLOCATION", - "#JointGenotyping.small_disk_override": "Int? (optional)", - "#JointGenotyping.medium_disk_override": "Int? (optional)", - "#JointGenotyping.large_disk_override": "Int? (optional)", - "#JointGenotyping.huge_disk_override": "Int? (optional)", - - "##_COMMENT9": "PREEMPTIBLES", - "#JointGenotyping.preemptible_tries_override": "Int? (optional)" -} - diff --git a/joint-discovery-gatk4.wdl b/joint-discovery-gatk4.wdl deleted file mode 100644 index 2b2df1d..0000000 --- a/joint-discovery-gatk4.wdl +++ /dev/null @@ -1,1018 +0,0 @@ -## Copyright Broad Institute, 2018 -## -## This WDL implements the joint discovery and VQSR filtering portion of the GATK -## Best Practices (June 2016) for germline SNP and Indel discovery in human -## whole-genome sequencing (WGS) and exome sequencing data. -## -## Requirements/expectations : -## - One or more GVCFs produced by HaplotypeCaller in GVCF mode -## - Bare minimum 1 WGS sample or 30 Exome samples. Gene panels are not supported. -## -## Outputs : -## - A VCF file and its index, filtered using variant quality score recalibration -## (VQSR) with genotypes for all samples present in the input VCF. 
All sites that -## are present in the input VCF are retained; filtered sites are annotated as such -## in the FILTER field. -## -## Note about VQSR wiring : -## The SNP and INDEL models are built in parallel, but then the corresponding -## recalibrations are applied in series. Because the INDEL model is generally ready -## first (because there are fewer indels than SNPs) we set INDEL recalibration to -## be applied first to the input VCF, while the SNP model is still being built. By -## the time the SNP model is available, the indel-recalibrated file is available to -## serve as input to apply the SNP recalibration. If we did it the other way around, -## we would have to wait until the SNP recal file was available despite the INDEL -## recal file being there already, then apply SNP recalibration, then apply INDEL -## recalibration. This would lead to a longer wall clock time for complete workflow -## execution. Wiring the INDEL recalibration to be applied first solves the problem. -## -## Cromwell version support -## - Successfully tested on v31 -## - Does not work on versions < v23 due to output syntax -## -## Runtime parameters are optimized for Broad's Google Cloud Platform implementation. -## For program versions, see docker containers. -## -## LICENSING : -## This script is released under the WDL source code license (BSD-3) (see LICENSE in -## https://github.com/broadinstitute/wdl). Note however that the programs it calls may -## be subject to different licenses. Users are responsible for checking that they are -## authorized to run all programs before running this script. Please see the docker -## page at https://hub.docker.com/r/broadinstitute/genomes-in-the-cloud/ for detailed -## licensing information pertaining to the included programs. - -workflow JointGenotyping { - # Input Sample - String callset_name - File sample_name_map - - # Reference and Resources - File ref_fasta - File ref_fasta_index - File ref_dict - - File dbsnp_vcf - File dbsnp_vcf_index - - Array[String] snp_recalibration_tranche_values - Array[String] snp_recalibration_annotation_values - Array[String] indel_recalibration_tranche_values - Array[String] indel_recalibration_annotation_values - - File eval_interval_list - File hapmap_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf - File one_thousand_genomes_resource_vcf_index - File mills_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf = dbsnp_vcf - File dbsnp_resource_vcf_index = dbsnp_vcf_index - - File unpadded_intervals_file - - # Runtime attributes - String? gatk_docker_override - String gatk_docker = select_first([gatk_docker_override, "broadinstitute/gatk:4.1.0.0"]) - String? gatk_path_override - String gatk_path = select_first([gatk_path_override, "/gatk/gatk"]) - - Int? small_disk_override - Int small_disk = select_first([small_disk_override, "100"]) - Int? medium_disk_override - Int medium_disk = select_first([medium_disk_override, "200"]) - Int? large_disk_override - Int large_disk = select_first([large_disk_override, "300"]) - Int? huge_disk_override - Int huge_disk = select_first([huge_disk_override, "400"]) - - String? preemptible_tries_override - Int preemptible_tries = select_first([preemptible_tries_override, "3"]) - - # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme - # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 - Float excess_het_threshold = 54.69 - Float snp_filter_level - Float indel_filter_level - Int SNP_VQSR_downsampleFactor - - Int num_of_original_intervals = length(read_lines(unpadded_intervals_file)) - Int num_gvcfs = length(read_lines(sample_name_map)) - - # Make a 2.5:1 interval number to samples in callset ratio interval list - Int possible_merge_count = floor(num_of_original_intervals / num_gvcfs / 2.5) - Int merge_count = if possible_merge_count > 1 then possible_merge_count else 1 - - call DynamicallyCombineIntervals { - input: - intervals = unpadded_intervals_file, - merge_count = merge_count, - preemptible_tries = preemptible_tries - } - - Array[String] unpadded_intervals = read_lines(DynamicallyCombineIntervals.output_intervals) - - scatter (idx in range(length(unpadded_intervals))) { - # the batch_size value was carefully chosen here as it - # is the optimal value for the amount of memory allocated - # within the task; please do not change it without consulting - # the Hellbender (GATK engine) team! - call ImportGVCFs { - input: - sample_name_map = sample_name_map, - interval = unpadded_intervals[idx], - workspace_dir_name = "genomicsdb", - disk_size = medium_disk, - batch_size = 50, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call GenotypeGVCFs { - input: - workspace_tar = ImportGVCFs.output_genomicsdb, - interval = unpadded_intervals[idx], - output_vcf_filename = "output.vcf.gz", - ref_fasta = ref_fasta, - ref_fasta_index = ref_fasta_index, - ref_dict = ref_dict, - dbsnp_vcf = dbsnp_vcf, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call HardFilterAndMakeSitesOnlyVcf { - input: - vcf = GenotypeGVCFs.output_vcf, - vcf_index = GenotypeGVCFs.output_vcf_index, - excess_het_threshold = excess_het_threshold, - variant_filtered_vcf_filename = callset_name + "." + idx + ".variant_filtered.vcf.gz", - sites_only_vcf_filename = callset_name + "." 
+ idx + ".sites_only.variant_filtered.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - call GatherVcfs as SitesOnlyGatherVcf { - input: - input_vcfs_fofn = write_lines(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf), - output_vcf_name = callset_name + ".sites_only.vcf.gz", - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call IndelsVariantRecalibrator { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".indels.recal", - tranches_filename = callset_name + ".indels.tranches", - recalibration_tranche_values = indel_recalibration_tranche_values, - recalibration_annotation_values = indel_recalibration_annotation_values, - mills_resource_vcf = mills_resource_vcf, - mills_resource_vcf_index = mills_resource_vcf_index, - axiomPoly_resource_vcf = axiomPoly_resource_vcf, - axiomPoly_resource_vcf_index = axiomPoly_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - if (num_gvcfs > 10000) { - call SNPsVariantRecalibratorCreateModel { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - downsampleFactor = SNP_VQSR_downsampleFactor, - model_report_filename = callset_name + ".snps.model.report", - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.sites_only_vcf))) { - call SNPsVariantRecalibrator as SNPsVariantRecalibratorScattered { - input: - sites_only_variant_filtered_vcf = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf[idx], - sites_only_variant_filtered_vcf_index = HardFilterAndMakeSitesOnlyVcf.sites_only_vcf_index[idx], - recalibration_filename = callset_name + ".snps." + idx + ".recal", - tranches_filename = callset_name + ".snps." 
+ idx + ".tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - model_report = SNPsVariantRecalibratorCreateModel.model_report, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - call GatherTranches as SNPGatherTranches { - input: - input_fofn = write_lines(SNPsVariantRecalibratorScattered.tranches), - output_filename = callset_name + ".snps.gathered.tranches", - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - - if (num_gvcfs <= 10000){ - call SNPsVariantRecalibrator as SNPsVariantRecalibratorClassic { - input: - sites_only_variant_filtered_vcf = SitesOnlyGatherVcf.output_vcf, - sites_only_variant_filtered_vcf_index = SitesOnlyGatherVcf.output_vcf_index, - recalibration_filename = callset_name + ".snps.recal", - tranches_filename = callset_name + ".snps.tranches", - recalibration_tranche_values = snp_recalibration_tranche_values, - recalibration_annotation_values = snp_recalibration_annotation_values, - hapmap_resource_vcf = hapmap_resource_vcf, - hapmap_resource_vcf_index = hapmap_resource_vcf_index, - omni_resource_vcf = omni_resource_vcf, - omni_resource_vcf_index = omni_resource_vcf_index, - one_thousand_genomes_resource_vcf = one_thousand_genomes_resource_vcf, - one_thousand_genomes_resource_vcf_index = one_thousand_genomes_resource_vcf_index, - dbsnp_resource_vcf = dbsnp_resource_vcf, - dbsnp_resource_vcf_index = dbsnp_resource_vcf_index, - disk_size = small_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. - # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. - Boolean is_small_callset = num_gvcfs <= 1000 - - scatter (idx in range(length(HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf))) { - call ApplyRecalibration { - input: - recalibrated_vcf_filename = callset_name + ".filtered." 
+ idx + ".vcf.gz", - input_vcf = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf[idx], - input_vcf_index = HardFilterAndMakeSitesOnlyVcf.variant_filtered_vcf_index[idx], - indels_recalibration = IndelsVariantRecalibrator.recalibration, - indels_recalibration_index = IndelsVariantRecalibrator.recalibration_index, - indels_tranches = IndelsVariantRecalibrator.tranches, - snps_recalibration = if defined(SNPsVariantRecalibratorScattered.recalibration) then select_first([SNPsVariantRecalibratorScattered.recalibration])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration]), - snps_recalibration_index = if defined(SNPsVariantRecalibratorScattered.recalibration_index) then select_first([SNPsVariantRecalibratorScattered.recalibration_index])[idx] else select_first([SNPsVariantRecalibratorClassic.recalibration_index]), - snps_tranches = select_first([SNPGatherTranches.tranches, SNPsVariantRecalibratorClassic.tranches]), - indel_filter_level = indel_filter_level, - snp_filter_level = snp_filter_level, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - # for large callsets we need to collect metrics from the shards and gather them later - if (!is_small_callset) { - call CollectVariantCallingMetrics as CollectMetricsSharded { - input: - input_vcf = ApplyRecalibration.recalibrated_vcf, - input_vcf_index = ApplyRecalibration.recalibrated_vcf_index, - metrics_filename_prefix = callset_name + "." + idx, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - } - - # for small callsets we can gather the VCF shards and then collect metrics on it - if (is_small_callset) { - call GatherVcfs as FinalGatherVcf { - input: - input_vcfs_fofn = write_lines(ApplyRecalibration.recalibrated_vcf), - output_vcf_name = callset_name + ".vcf.gz", - disk_size = huge_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - - call CollectVariantCallingMetrics as CollectMetricsOnFullVcf { - input: - input_vcf = FinalGatherVcf.output_vcf, - input_vcf_index = FinalGatherVcf.output_vcf_index, - metrics_filename_prefix = callset_name, - dbsnp_vcf = dbsnp_vcf, - dbsnp_vcf_index = dbsnp_vcf_index, - interval_list = eval_interval_list, - ref_dict = ref_dict, - disk_size = large_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - # for large callsets we still need to gather the sharded metrics - if (!is_small_callset) { - call GatherMetrics { - input: - input_details_fofn = write_lines(select_all(CollectMetricsSharded.detail_metrics_file)), - input_summaries_fofn = write_lines(select_all(CollectMetricsSharded.summary_metrics_file)), - output_prefix = callset_name, - disk_size = medium_disk, - docker = gatk_docker, - gatk_path = gatk_path, - preemptible_tries = preemptible_tries - } - } - - output { - # outputs from the small callset path through the wdl - File? output_vcf = FinalGatherVcf.output_vcf - File? 
output_vcf_index = FinalGatherVcf.output_vcf_index - - # select metrics from the small callset path and the large callset path - File detail_metrics_file = select_first([CollectMetricsOnFullVcf.detail_metrics_file, GatherMetrics.detail_metrics_file]) - File summary_metrics_file = select_first([CollectMetricsOnFullVcf.summary_metrics_file, GatherMetrics.summary_metrics_file]) - - # output the interval list generated/used by this run workflow - File output_intervals = DynamicallyCombineIntervals.output_intervals - } -} - -task GetNumberOfSamples { - File sample_name_map - String docker - Int preemptible_tries - command <<< - wc -l ${sample_name_map} | awk '{print $1}' - >>> - runtime { - docker: docker - memory: "1 GB" - preemptible: preemptible_tries - } - output { - Int sample_count = read_int(stdout()) - } -} - -task ImportGVCFs { - File sample_name_map - String interval - - String workspace_dir_name - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - Int batch_size - - command <<< - set -e - - rm -rf ${workspace_dir_name} - - # The memory setting here is very important and must be several GB lower - # than the total memory allocated to the VM because this tool uses - # a significant amount of non-heap memory for native libraries. - # Also, testing has shown that the multithreaded reader initialization - # does not scale well beyond 5 threads, so don't increase beyond that. - ${gatk_path} --java-options "-Xmx4g -Xms4g" \ - GenomicsDBImport \ - --genomicsdb-workspace-path ${workspace_dir_name} \ - --batch-size ${batch_size} \ - -L ${interval} \ - --sample-name-map ${sample_name_map} \ - --reader-threads 5 \ - -ip 500 - - tar -cf ${workspace_dir_name}.tar ${workspace_dir_name} - - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_genomicsdb = "${workspace_dir_name}.tar" - } -} - -task GenotypeGVCFs { - File workspace_tar - String interval - - String output_vcf_filename - - String gatk_path - - File ref_fasta - File ref_fasta_index - File ref_dict - - String dbsnp_vcf - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - tar -xf ${workspace_tar} - WORKSPACE=$( basename ${workspace_tar} .tar) - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - GenotypeGVCFs \ - -R ${ref_fasta} \ - -O ${output_vcf_filename} \ - -D ${dbsnp_vcf} \ - -G StandardAnnotation \ - --only-output-calls-starting-in-intervals \ - --use-new-qual-calculator \ - -V gendb://$WORKSPACE \ - -L ${interval} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_filename}" - File output_vcf_index = "${output_vcf_filename}.tbi" - } -} - -task HardFilterAndMakeSitesOnlyVcf { - File vcf - File vcf_index - Float excess_het_threshold - - String variant_filtered_vcf_filename - String sites_only_vcf_filename - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - VariantFiltration \ - --filter-expression "ExcessHet > ${excess_het_threshold}" \ - --filter-name ExcessHet \ - -O ${variant_filtered_vcf_filename} \ - -V ${vcf} - - ${gatk_path} --java-options "-Xmx3g -Xms3g" \ - MakeSitesOnlyVcf \ - --INPUT ${variant_filtered_vcf_filename} \ - --OUTPUT ${sites_only_vcf_filename} - - } - runtime { - docker: docker - memory: "3.5 GB" - cpu: "1" - 
disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File variant_filtered_vcf = "${variant_filtered_vcf_filename}" - File variant_filtered_vcf_index = "${variant_filtered_vcf_filename}.tbi" - File sites_only_vcf = "${sites_only_vcf_filename}" - File sites_only_vcf_index = "${sites_only_vcf_filename}.tbi" - } -} - -task IndelsVariantRecalibrator { - String recalibration_filename - String tranches_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File mills_resource_vcf - File axiomPoly_resource_vcf - File dbsnp_resource_vcf - File mills_resource_vcf_index - File axiomPoly_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - Int? machine_mem_gb - Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, - mills_resource_vcf, - axiomPoly_resource_vcf, - dbsnp_resource_vcf], - "GiB")) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int java_mem = machine_mem-2 - - command { - ${gatk_path} --java-options "-Xmx${java_mem}g -Xms${java_mem}g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode INDEL \ - --max-gaussians 4 \ - --resource:mills,known=false,training=true,truth=true,prior=12 ${mills_resource_vcf} \ - --resource:axiomPoly,known=false,training=true,truth=false,prior=10 ${axiomPoly_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=2 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: machine_mem_gb + " GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task SNPsVariantRecalibratorCreateModel { - String recalibration_filename - String tranches_filename - Int downsampleFactor - String model_report_filename - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx100g -Xms100g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - --sample-every-Nth-variant ${downsampleFactor} \ - --output-model ${model_report_filename} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - 
--resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: "104 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File model_report = "${model_report_filename}" - } -} - -task SNPsVariantRecalibrator { - String recalibration_filename - String tranches_filename - File? model_report - - Array[String] recalibration_tranche_values - Array[String] recalibration_annotation_values - - File sites_only_variant_filtered_vcf - File sites_only_variant_filtered_vcf_index - - File hapmap_resource_vcf - File omni_resource_vcf - File one_thousand_genomes_resource_vcf - File dbsnp_resource_vcf - File hapmap_resource_vcf_index - File omni_resource_vcf_index - File one_thousand_genomes_resource_vcf_index - File dbsnp_resource_vcf_index - - String gatk_path - String docker - Int? machine_mem_gb - Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, - hapmap_resource_vcf, - omni_resource_vcf, - one_thousand_genomes_resource_vcf, - dbsnp_resource_vcf], - "GiB")) - Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) - Int java_mem = machine_mem-2 - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx${java_mem}g -Xms${java_mem}g" \ - VariantRecalibrator \ - -V ${sites_only_variant_filtered_vcf} \ - -O ${recalibration_filename} \ - --tranches-file ${tranches_filename} \ - --trust-all-polymorphic \ - -tranche ${sep=' -tranche ' recalibration_tranche_values} \ - -an ${sep=' -an ' recalibration_annotation_values} \ - -mode SNP \ - ${"--input-model " + model_report + " --output-tranches-for-scatter "} \ - --max-gaussians 6 \ - --resource:hapmap,known=false,training=true,truth=true,prior=15 ${hapmap_resource_vcf} \ - --resource:omni,known=false,training=true,truth=true,prior=12 ${omni_resource_vcf} \ - --resource:1000G,known=false,training=true,truth=false,prior=10 ${one_thousand_genomes_resource_vcf} \ - --resource:dbsnp,known=true,training=false,truth=false,prior=7 ${dbsnp_resource_vcf} - } - runtime { - docker: docker - memory: machine_mem + " GB" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibration = "${recalibration_filename}" - File recalibration_index = "${recalibration_filename}.idx" - File tranches = "${tranches_filename}" - } -} - -task GatherTranches { - File input_fofn - String output_filename - - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf tranches - - mkdir tranches - RETRY_LIMIT=5 - - count=0 - until cat ${input_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the tranches from the cloud' && exit 1 - fi - - cat ${input_fofn} | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherTranches \ - --input inputs.list \ - --output ${output_filename} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "2" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File tranches = "${output_filename}" - } -} - 
-task ApplyRecalibration { - String recalibrated_vcf_filename - File input_vcf - File input_vcf_index - File indels_recalibration - File indels_recalibration_index - File indels_tranches - File snps_recalibration - File snps_recalibration_index - File snps_tranches - - Float indel_filter_level - Float snp_filter_level - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - set -e - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O tmp.indel.recalibrated.vcf \ - -V ${input_vcf} \ - --recal-file ${indels_recalibration} \ - --tranches-file ${indels_tranches} \ - --truth-sensitivity-filter-level ${indel_filter_level} \ - --create-output-variant-index true \ - -mode INDEL - - ${gatk_path} --java-options "-Xmx5g -Xms5g" \ - ApplyVQSR \ - -O ${recalibrated_vcf_filename} \ - -V tmp.indel.recalibrated.vcf \ - --recal-file ${snps_recalibration} \ - --tranches-file ${snps_tranches} \ - --truth-sensitivity-filter-level ${snp_filter_level} \ - --create-output-variant-index true \ - -mode SNP - } - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File recalibrated_vcf = "${recalibrated_vcf_filename}" - File recalibrated_vcf_index = "${recalibrated_vcf_filename}.tbi" - } -} - -task GatherVcfs { - File input_vcfs_fofn - String output_vcf_name - String gatk_path - - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - - # Now using NIO to localize the vcfs but the input file must have a ".list" extension - mv ${input_vcfs_fofn} inputs.list - - # --ignore-safety-checks makes a big performance difference so we include it in our invocation. - # This argument disables expensive checks that the file headers contain the same set of - # genotyped samples and that files are in order by position of first record. 
- ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - GatherVcfsCloud \ - --ignore-safety-checks \ - --gather-type BLOCK \ - --input inputs.list \ - --output ${output_vcf_name} - - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - IndexFeatureFile \ - --feature-file ${output_vcf_name} - >>> - runtime { - docker: docker - memory: "7 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File output_vcf = "${output_vcf_name}" - File output_vcf_index = "${output_vcf_name}.tbi" - } -} - -task CollectVariantCallingMetrics { - File input_vcf - File input_vcf_index - - String metrics_filename_prefix - File dbsnp_vcf - File dbsnp_vcf_index - File interval_list - File ref_dict - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command { - ${gatk_path} --java-options "-Xmx6g -Xms6g" \ - CollectVariantCallingMetrics \ - --INPUT ${input_vcf} \ - --DBSNP ${dbsnp_vcf} \ - --SEQUENCE_DICTIONARY ${ref_dict} \ - --OUTPUT ${metrics_filename_prefix} \ - --THREAD_COUNT 8 \ - --TARGET_INTERVALS ${interval_list} - } - output { - File detail_metrics_file = "${metrics_filename_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${metrics_filename_prefix}.variant_calling_summary_metrics" - } - runtime { - docker: docker - memory: "7 GB" - cpu: 2 - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } -} - -task GatherMetrics { - File input_details_fofn - File input_summaries_fofn - - String output_prefix - - String gatk_path - String docker - Int disk_size - Int preemptible_tries - - command <<< - set -e - set -o pipefail - - # this is here to deal with the JES bug where commands may be run twice - rm -rf metrics - - mkdir metrics - RETRY_LIMIT=5 - - count=0 - until cat ${input_details_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - count=0 - until cat ${input_summaries_fofn} | /google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do - sleep 1 - ((count++)) && ((count >= $RETRY_LIMIT)) && break - done - if [ "$count" -ge "$RETRY_LIMIT" ]; then - echo 'Could not copy all the metrics from the cloud' && exit 1 - fi - - INPUT=`cat ${input_details_fofn} | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("-I=metrics/%s ", $1)}'` - - ${gatk_path} --java-options "-Xmx2g -Xms2g" \ - AccumulateVariantCallingMetrics \ - $INPUT \ - -O ${output_prefix} - >>> - runtime { - docker: docker - memory: "3 GB" - cpu: "1" - disks: "local-disk " + disk_size + " HDD" - preemptible: preemptible_tries - } - output { - File detail_metrics_file = "${output_prefix}.variant_calling_detail_metrics" - File summary_metrics_file = "${output_prefix}.variant_calling_summary_metrics" - } -} - -task DynamicallyCombineIntervals { - File intervals - Int merge_count - Int preemptible_tries - - command { - python << CODE - def parse_interval(interval): - colon_split = interval.split(":") - chromosome = colon_split[0] - dash_split = colon_split[1].split("-") - start = int(dash_split[0]) - end = int(dash_split[1]) - return chromosome, start, end - - def add_interval(chr, start, end): - lines_to_write.append(chr + ":" + str(start) + "-" + str(end)) - return chr, start, end - - count = 0 - chain_count = ${merge_count} - l_chr, l_start, l_end = "", 0, 0 - lines_to_write = [] - with 
open("${intervals}") as f: - with open("out.intervals", "w") as f1: - for line in f.readlines(): - # initialization - if count == 0: - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - # reached number to combine, so spit out and start over - if count == chain_count: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - continue - - c_chr, c_start, c_end = parse_interval(line) - # if adjacent keep the chain going - if c_chr == w_chr and c_start == w_end + 1: - w_end = c_end - count += 1 - continue - # not adjacent, end here and start a new chain - else: - l_char, l_start, l_end = add_interval(w_chr, w_start, w_end) - w_chr, w_start, w_end = parse_interval(line) - count = 1 - if l_char != w_chr or l_start != w_start or l_end != w_end: - add_interval(w_chr, w_start, w_end) - f1.writelines("\n".join(lines_to_write)) - CODE - } - - runtime { - memory: "3 GB" - preemptible: preemptible_tries - docker: "python:2.7" - } - - output { - File output_intervals = "out.intervals" - } -} diff --git a/tasks/JointGenotypingTasks-terra.wdl b/tasks/JointGenotypingTasks-terra.wdl new file mode 100644 index 0000000..bc7edf1 --- /dev/null +++ b/tasks/JointGenotypingTasks-terra.wdl @@ -0,0 +1,1103 @@ +version 1.0 + + +task CheckSamplesUnique { + input { + File sample_name_map + Int sample_num_threshold = 50 + } + + command { + set -euo pipefail + if [[ $(cut -f 1 ~{sample_name_map} | wc -l) -ne $(cut -f 1 ~{sample_name_map} | sort | uniq | wc -l) ]] + then + echo "Samples in the sample_name_map are not unique" 1>&2 + exit 1 + elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt ~{sample_num_threshold} ]] + then + echo "There are fewer than ~{sample_num_threshold} samples in the sample_name_map" 1>&2 + echo "Having fewer than ~{sample_num_threshold} samples means there likely isn't enough data to complete joint calling" 1>&2 + exit 1 + else + echo true + fi + } + + output { + Boolean samples_unique = read_boolean(stdout()) + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task SplitIntervalList { + + input { + File interval_list + Int scatter_count + File ref_fasta + File ref_fasta_index + File ref_dict + Boolean sample_names_unique_done + Int disk_size + String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + interval_list: { + localization_optional: true + } + } + + command <<< + ~{gatk_path} --java-options -Xms3g SplitIntervals \ + -L ~{interval_list} -O scatterDir -scatter ~{scatter_count} -R ~{ref_fasta} \ + -mode ~{scatter_mode} + >>> + + runtime { + memory: "3.75 GiB" + preemptible: preemptible_tries + disks: "local-disk " + disk_size + " HDD" + docker: gatk_docker + } + + output { + Array[File] output_intervals = glob("scatterDir/*") + } +} + +task ImportGVCFs { + + input { + File sample_name_map + File interval + File ref_fasta + File ref_fasta_index + File ref_dict + + String workspace_dir_name + + Int disk_size + Int batch_size + + # Using a nightly version of GATK containing fixes for GenomicsDB + # https://github.com/broadinstitute/gatk/pull/5899 + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk-nightly:2019-05-07-4.1.2.0-5-g53d015e4f-NIGHTLY-SNAPSHOT" + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + rm -rf ~{workspace_dir_name} + + # We've seen some GenomicsDB 
performance regressions related to intervals, so we're going to pretend we only have a single
+    # interval by using the --merge-input-intervals arg. There's no data in between, since we
+    # didn't run HaplotypeCaller over those loci, so we're not wasting any compute.
+
+    # The memory setting here is very important and must be several GiB lower
+    # than the total memory allocated to the VM because this tool uses
+    # a significant amount of non-heap memory for native libraries.
+    # Also, testing has shown that the multithreaded reader initialization
+    # does not scale well beyond 5 threads, so don't increase beyond that.
+    ~{gatk_path} --java-options -Xms8g \
+      GenomicsDBImport \
+      --genomicsdb-workspace-path ~{workspace_dir_name} \
+      --batch-size ~{batch_size} \
+      -L ~{interval} \
+      --sample-name-map ~{sample_name_map} \
+      --reader-threads 5 \
+      --merge-input-intervals \
+      --consolidate
+
+    tar -cf ~{workspace_dir_name}.tar ~{workspace_dir_name}
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 4
+    disks: "local-disk " + disk_size + " HDD"
+    docker: gatk_docker
+    preemptible: preemptible_tries
+  }
+
+  output {
+    File output_genomicsdb = "~{workspace_dir_name}.tar"
+  }
+}
+
+task GenotypeGVCFs {
+
+  input {
+    File workspace_tar
+    File interval
+
+    String output_vcf_filename
+
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+
+    String dbsnp_vcf
+
+    Int disk_size
+    # This is needed for gVCFs generated with GATK3 HaplotypeCaller
+    Boolean allow_old_rms_mapping_quality_annotation_data = false
+    String gatk_docker
+    String gatk_path
+    String preemptible_tries
+  }
+
+  parameter_meta {
+    interval: {
+      localization_optional: true
+    }
+  }
+
+  command <<<
+    set -euo pipefail
+
+    tar -xf ~{workspace_tar}
+    WORKSPACE=$(basename ~{workspace_tar} .tar)
+
+    ~{gatk_path} --java-options -Xms8g \
+      GenotypeGVCFs \
+      -R ~{ref_fasta} \
+      -O ~{output_vcf_filename} \
+      -D ~{dbsnp_vcf} \
+      -G StandardAnnotation -G AS_StandardAnnotation \
+      --only-output-calls-starting-in-intervals \
+      --use-new-qual-calculator \
+      -V gendb://$WORKSPACE \
+      -L ~{interval} \
+      ~{true='--allow-old-rms-mapping-quality-annotation-data' false='' allow_old_rms_mapping_quality_annotation_data} \
+      --merge-input-intervals
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 2
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: preemptible_tries
+    docker: gatk_docker
+  }
+
+  output {
+    File output_vcf = "~{output_vcf_filename}"
+    File output_vcf_index = "~{output_vcf_filename}.tbi"
+  }
+}
+
+task GnarlyGenotyper {
+
+  input {
+    File workspace_tar
+    File interval
+    String output_vcf_filename
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+    String dbsnp_vcf
+
+    String gatk_docker = "us.gcr.io/broad-gotc-prod/gnarly_genotyper:fixNegativeRefCount"
+    String preemptible_tries
+  }
+
+  parameter_meta {
+    interval: {
+      localization_optional: true
+    }
+  }
+
+  Int disk_size = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3)
+
+  command <<<
+    set -e
+
+    tar -xf ~{workspace_tar}
+    WORKSPACE=$(basename ~{workspace_tar} .tar)
+
+    # Use a query.json to set some params that aren't exposed as tool arguments -- ewwwww
+    cat <<EOF > $WORKSPACE/query.json
+    {
+      "scan_full": true,
+      "workspace": "genomicsdb",
+      "array": "genomicsdb_array",
+      "vid_mapping_file": "genomicsdb/vidmap.json",
+      "callset_mapping_file": "genomicsdb/callset.json",
+      "reference_genome": "/cromwell_root/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta",
+      "max_diploid_alt_alleles_that_can_be_genotyped": 6,
+      "produce_GT_field": true
+    }
+    EOF
+
+    gatk
--java-options -Xms8g \ + GnarlyGenotyper \ + -R ~{ref_fasta} \ + -O ~{output_vcf_filename} \ + --output-database-name annotationDB.vcf.gz \ + -D ~{dbsnp_vcf} \ + --only-output-calls-starting-in-intervals \ + --use-new-qual-calculator \ + -V gendb://$WORKSPACE \ + -L ~{interval} \ + -stand-call-conf 10 \ + --merge-input-intervals + >>> + + runtime { + memory: "26 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_filename}" + File output_vcf_index = "~{output_vcf_filename}.tbi" + File output_database = "annotationDB.vcf.gz" + File output_database_index = "annotationDB.vcf.gz.tbi" + } +} + +task HardFilterAndMakeSitesOnlyVcf { + + input { + File vcf + File vcf_index + Float excess_het_threshold + + String variant_filtered_vcf_filename + String sites_only_vcf_filename + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms3g \ + VariantFiltration \ + --filter-expression "ExcessHet > ~{excess_het_threshold}" \ + --filter-name ExcessHet \ + -O ~{variant_filtered_vcf_filename} \ + -V ~{vcf} + + ~{gatk_path} --java-options -Xms3g \ + MakeSitesOnlyVcf \ + -I ~{variant_filtered_vcf_filename} \ + -O ~{sites_only_vcf_filename} + >>> + + runtime { + memory: "3.75 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File variant_filtered_vcf = "~{variant_filtered_vcf_filename}" + File variant_filtered_vcf_index = "~{variant_filtered_vcf_filename}.tbi" + File sites_only_vcf = "~{sites_only_vcf_filename}" + File sites_only_vcf_index = "~{sites_only_vcf_filename}.tbi" + } +} + +task IndelsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File mills_resource_vcf + File axiomPoly_resource_vcf + File dbsnp_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 4 + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms24g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode INDEL \ + --max-gaussians ~{max_gaussians} \ + -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \ + -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "26 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task 
SNPsVariantRecalibratorCreateModel { + + input { + String recalibration_filename + String tranches_filename + Int downsampleFactor + String model_report_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms100g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + --sample-every-Nth-variant ~{downsampleFactor} \ + --output-model ~{model_report_filename} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "104 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File model_report = "~{model_report_filename}" + } +} + +task SNPsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + File? model_report + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + Int? 
machine_mem_gb + + } + + Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, + hapmap_resource_vcf, + omni_resource_vcf, + one_thousand_genomes_resource_vcf, + dbsnp_resource_vcf], + "GiB")) + Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) + Int java_mem = machine_mem - 1 + + + String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else "" + + command <<< + set -euo pipefail + + MODEL_REPORT=~{model_report} + + ~{gatk_path} --java-options -Xms~{java_mem}g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + ~{model_report_arg} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "~{machine_mem} GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task GatherTranches { + + input { + Array[File] tranches + String output_filename + Int disk_size + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" + String gatk_path + String preemptible_tries + } + + parameter_meta { + tranches: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + tranches_fofn=~{write_lines(tranches)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
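+    # For example (hypothetical path), a line of the tranches FOFN might look like
+    #   gs://my-workspace-bucket/JointGenotyping/shard-0/mycallset.snps.0.tranches
+    # and the loop below copies each listed file into the local tranches/ directory.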
+ + # This is here to deal with the JES bug where commands may be run twice + rm -rf tranches + mkdir tranches + RETRY_LIMIT=5 + + count=0 + until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the tranches from the cloud' && exit 1 + fi + + cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list + + /usr/gitc/gatk --java-options -Xms6g \ + GatherTranches \ + --input inputs.list \ + --output ~{output_filename} + >>> + + runtime { + memory: "7.5 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File tranches = "~{output_filename}" + } +} + +task ApplyRecalibration { + + input { + String recalibrated_vcf_filename + File input_vcf + File input_vcf_index + File indels_recalibration + File indels_recalibration_index + File indels_tranches + File snps_recalibration + File snps_recalibration_index + File snps_tranches + Float indel_filter_level + Float snp_filter_level + Boolean use_allele_specific_annotations + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms5g \ + ApplyVQSR \ + -O tmp.indel.recalibrated.vcf \ + -V ~{input_vcf} \ + --recal-file ~{indels_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{indels_tranches} \ + --truth-sensitivity-filter-level ~{indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + ~{gatk_path} --java-options -Xms5g \ + ApplyVQSR \ + -O ~{recalibrated_vcf_filename} \ + -V tmp.indel.recalibrated.vcf \ + --recal-file ~{snps_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{snps_tranches} \ + --truth-sensitivity-filter-level ~{snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File recalibrated_vcf = "~{recalibrated_vcf_filename}" + File recalibrated_vcf_index = "~{recalibrated_vcf_filename}.tbi" + } +} + +task GatherVcfs { + + input { + Array[File] input_vcfs + String output_vcf_name + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + input_vcfs: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + # --ignore-safety-checks makes a big performance difference so we include it in our invocation. + # This argument disables expensive checks that the file headers contain the same set of + # genotyped samples and that files are in order by position of first record. 
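+    # --gather-type BLOCK concatenates the shards at the compressed-block level instead of
+    # re-parsing every record, which is why the inputs must be bgzipped VCF shards that
+    # already appear in genomic order.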
+ ~{gatk_path} --java-options -Xms6g \ + GatherVcfsCloud \ + --ignore-safety-checks \ + --gather-type BLOCK \ + --input ~{sep=" --input " input_vcfs} \ + --output ~{output_vcf_name} + + tabix ~{output_vcf_name} + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_name}" + File output_vcf_index = "~{output_vcf_name}.tbi" + } +} + +task SelectFingerprintSiteVariants { + + input { + File input_vcf + File haplotype_database + String base_output_name + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + parameter_meta { + input_vcf: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + function hdb_to_interval_list() { + input=$1 + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + ~{gatk_path} --java-options -Xms6g \ + SelectVariants \ + --variant ~{input_vcf} \ + --intervals hdb.interval_list \ + --output ~{base_output_name}.vcf.gz + >>> + + runtime { + memory: "7.5 GiB" + cpu: 1 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File output_vcf = "~{base_output_name}.vcf.gz" + File output_vcf_index = "~{base_output_name}.vcf.gz.tbi" + } +} + +task CollectVariantCallingMetrics { + + input { + File input_vcf + File input_vcf_index + String metrics_filename_prefix + File dbsnp_vcf + File dbsnp_vcf_index + File interval_list + File ref_dict + Int disk_size + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -euo pipefail + + ~{gatk_path} --java-options -Xms6g \ + CollectVariantCallingMetrics \ + --INPUT ~{input_vcf} \ + --DBSNP ~{dbsnp_vcf} \ + --SEQUENCE_DICTIONARY ~{ref_dict} \ + --OUTPUT ~{metrics_filename_prefix} \ + --THREAD_COUNT 8 \ + --TARGET_INTERVALS ~{interval_list} + >>> + + output { + File detail_metrics_file = "~{metrics_filename_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{metrics_filename_prefix}.variant_calling_summary_metrics" + } + + runtime { + memory: "7.5 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } +} + +task GatherVariantCallingMetrics { + + input { + Array[File] input_details + Array[File] input_summaries + String output_prefix + Int disk_size + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" + String gatk_path + String preemptible_tries + } + + parameter_meta { + input_details: { + localization_optional: true + } + input_summaries: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + input_details_fofn=~{write_lines(input_details)} + input_summaries_fofn=~{write_lines(input_summaries)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
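+    # Note: AccumulateVariantCallingMetrics takes the common prefix of each detail/summary
+    # metrics pair, so a detail file such as (hypothetical path)
+    #   gs://bucket/shard-0/callset.variant_calling_detail_metrics
+    # is reduced further below to the argument "--INPUT metrics/callset".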
+ + # This is here to deal with the JES bug where commands may be run twice + rm -rf metrics + + mkdir metrics + RETRY_LIMIT=5 + + count=0 + until cat $input_details_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + count=0 + until cat $input_summaries_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the metrics from the cloud' && exit 1 + fi + + INPUT=$(cat $input_details_fofn | rev | cut -d '/' -f 1 | rev | sed s/.variant_calling_detail_metrics//g | awk '{printf("--INPUT metrics/%s ", $1)}') + + /usr/gitc/gatk --java-options -Xms2g \ + AccumulateVariantCallingMetrics \ + $INPUT \ + --OUTPUT ~{output_prefix} + >>> + + runtime { + memory: "3 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: preemptible_tries + docker: gatk_docker + } + + output { + File detail_metrics_file = "~{output_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{output_prefix}.variant_calling_summary_metrics" + } +} + +task CrossCheckFingerprint { + + input { + Array[File] gvcf_paths + Array[File] vcf_paths + File sample_name_map + File haplotype_database + String output_base_name + Boolean scattered = false + Array[String] expected_inconclusive_samples = [] + String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map" + String preemptible_tries + } + + parameter_meta { + gvcf_paths: { + localization_optional: true + } + vcf_paths: { + localization_optional: true + } + } + + Int num_gvcfs = length(gvcf_paths) + Int cpu = if num_gvcfs < 32 then num_gvcfs else 32 + # Compute memory to use based on the CPU count, following the pattern of + # 3.75GiB / cpu used by GCP's pricing: https://cloud.google.com/compute/pricing + Int memMb = round(cpu * 3.75 * 1024) + Int disk = 100 + + String output_name = output_base_name + ".fingerprintcheck" + + command <<< + set -eu + + gvcfInputsList=~{write_lines(gvcf_paths)} + vcfInputsList=~{write_lines(vcf_paths)} + + cp $gvcfInputsList gvcf_inputs.list + cp $vcfInputsList vcf_inputs.list + + java -Dpicard.useLegacyParser=false -Xms~{memMb - 512}m \ + -jar /usr/gitc/PicardPublicWithCrosscheckNIOandSampleMapping.jar \ + CrosscheckFingerprints \ + --INPUT gvcf_inputs.list \ + --SECOND_INPUT vcf_inputs.list \ + --HAPLOTYPE_MAP ~{haplotype_database} \ + --INPUT_SAMPLE_FILE_MAP ~{sample_name_map} \ + --CROSSCHECK_BY SAMPLE \ + --CROSSCHECK_MODE CHECK_SAME_SAMPLE \ + --NUM_THREADS ~{cpu} \ + --SKIP_INPUT_READABLITY_TEST \ + ~{true='--EXIT_CODE_WHEN_MISMATCH 0' false='' scattered} \ + --OUTPUT ~{output_name} + + if ~{scattered}; then + # UNEXPECTED_MATCH is not possible with CHECK_SAME_SAMPLE + matches=$(grep "EXPECTED_MATCH" ~{output_name} | wc -l) + + # check inconclusive samples + expectedInconclusiveSamples=("~{sep='" "' expected_inconclusive_samples}") + inconclusiveSamplesCount=0 + inconclusiveSamples=($(grep 'INCONCLUSIVE' ~{output_name} | cut -f 1)) + for sample in ${inconclusiveSamples[@]}; do + if printf '%s\n' ${expectedInconclusiveSamples[@]} | grep -P '^'${sample}'$'; then + inconclusiveSamplesCount=$((inconclusiveSamplesCount+1)) + fi + done + + total_matches=$((inconclusiveSamplesCount + 
matches)) + if [[ ${total_matches} -eq ~{num_gvcfs} ]]; then + >&2 echo "Found the correct number of matches (~{num_gvcfs}) for this shard" + else + >&2 echo "ERROR: Found $total_matches 'EXPECTED_MATCH' records, but expected ~{num_gvcfs}" + exit 1 + fi + fi + >>> + + runtime { + memory: memMb + " MiB" + disks: "local-disk " + disk + " HDD" + preemptible: preemptible_tries + docker: picard_docker + } + + output { + File crosscheck_metrics = output_name + } +} + +task GatherPicardMetrics { + + input { + Array[File] metrics_files + String output_file_name + Int disk_size + } + + command { + # Don't use this task to gather tens of thousands of files. + # Cromwell can't handle it. + + # This cannot gather metrics with histograms + + head -n 7 ~{metrics_files[0]} > ~{output_file_name} + + for metrics_file in ~{sep=' ' metrics_files}; do + sed -n '1,7d;p' $metrics_file | grep -v '^$' >> ~{output_file_name} + done + } + + output { + File gathered_metrics = "~{output_file_name}" + } + + runtime { + cpu: 1 + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk " + disk_size + " HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task GetFingerprintingIntervalIndices { + + input { + Array[File] unpadded_intervals + File haplotype_database + String gatk_docker + String gatk_path + String preemptible_tries + } + + command <<< + set -xeo pipefail + + function rename_intervals(){ + interval_list=$1 + name=$2 + + awk 'BEGIN{FS=IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {$5="'$name'"; print}' $interval_list + } + export -f rename_intervals + + function hdb_to_interval_list(){ + input=$1 + + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + function rename_scatter(){ + file=$1 + number=$(echo $file | sed -E 's|([0-9]+)-scattered\.interval.*|\1|') + rename_intervals $file $number > scattered.renamed.$number.interval_list + } + export -f rename_scatter + + # rename the intervals within each interval_list according to the number in the name of the list + + cp ~{sep=' ' unpadded_intervals} ./ + + cat ~{write_lines(unpadded_intervals)} | xargs -n1 basename | xargs -I{} bash -c 'rename_scatter $@' _ {} + + #find the first header + find . -name "scattered.renamed.*.interval_list" | head -n1 | xargs cat | grep '^@' > all.interval_list + + # concatenate the resulting intervals (with no header) + find . 
-name "scattered.renamed.*.interval_list" | xargs cat | grep -v '^@' >> all.interval_list + + # convert the Haplotype_database to an interval_list + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + # find the intervals that overlap the haplotype_database + ~{gatk_path} IntervalListTools \ + -ACTION OVERLAPS \ + -O all.sorted.interval_list \ + -I all.interval_list \ + -SI hdb.interval_list + + if grep -v '^@' all.sorted.interval_list; then + grep -v '^@' all.sorted.interval_list | awk '{FS="\t"; print $5}' | uniq > indices.out + else + touch indices.out + fi + >>> + + output { + Array[String] indices_to_fingerprint = read_lines("indices.out") + File all_sorted_interval_list = "all.sorted.interval_list" + File all_interval_list = "all.interval_list" + File hdb_interval_list = "hdb.interval_list" + } + + runtime { + cpu: 2 + memory: "3.75 GiB" + preemptible: preemptible_tries + disks: "local-disk 10 HDD" + docker: gatk_docker + } +} + +task PartitionSampleNameMap { + + input { + File sample_name_map + Int line_limit + } + + command { + + cut -f 2 ~{sample_name_map} > sample_paths + split -l ~{line_limit} -d sample_paths partition_ + + # Let the OS catch up with creation of files for glob command + sleep 1 + } + + output { + Array[File] partitions = glob("partition_*") + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} diff --git a/tasks/JointGenotypingTasks.wdl b/tasks/JointGenotypingTasks.wdl new file mode 100644 index 0000000..e15f19c --- /dev/null +++ b/tasks/JointGenotypingTasks.wdl @@ -0,0 +1,1073 @@ +version 1.0 + + +task CheckSamplesUnique { + input { + File sample_name_map + Int sample_num_threshold = 50 + } + + command { + set -euo pipefail + if [[ $(cut -f 1 ~{sample_name_map} | wc -l) -ne $(cut -f 1 ~{sample_name_map} | sort | uniq | wc -l) ]] + then + echo "Samples in the sample_name_map are not unique" 1>&2 + exit 1 + elif [[ $(cut -f 1 ~{sample_name_map} | wc -l) -lt ~{sample_num_threshold} ]] + then + echo "There are fewer than ~{sample_num_threshold} samples in the sample_name_map" 1>&2 + echo "Having fewer than ~{sample_num_threshold} samples means there likely isn't enough data to complete joint calling" 1>&2 + exit 1 + else + echo true + fi + } + + output { + Boolean samples_unique = read_boolean(stdout()) + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +} + +task SplitIntervalList { + + input { + File interval_list + Int scatter_count + File ref_fasta + File ref_fasta_index + File ref_dict + Boolean sample_names_unique_done + Int disk_size + String scatter_mode = "BALANCING_WITHOUT_INTERVAL_SUBDIVISION_WITH_OVERFLOW" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + parameter_meta { + interval_list: { + localization_optional: true + } + } + + command <<< + gatk --java-options -Xms3g SplitIntervals \ + -L ~{interval_list} -O scatterDir -scatter ~{scatter_count} -R ~{ref_fasta} \ + -mode ~{scatter_mode} + >>> + + runtime { + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk " + disk_size + " HDD" + docker: gatk_docker + } + + output { + Array[File] output_intervals = glob("scatterDir/*") + } +} + +task ImportGVCFs { + + input { + File sample_name_map + File interval + File ref_fasta + File ref_fasta_index + File ref_dict + + String workspace_dir_name + + Int disk_size + Int batch_size + + # Using a nightly version of GATK containing fixes for GenomicsDB + # 
https://github.com/broadinstitute/gatk/pull/5899
+    String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk-nightly:2019-05-07-4.1.2.0-5-g53d015e4f-NIGHTLY-SNAPSHOT"
+  }
+
+  command <<<
+    set -euo pipefail
+
+    rm -rf ~{workspace_dir_name}
+
+    # We've seen some GenomicsDB performance regressions related to intervals, so we're going to
+    # pretend we only have a single interval by using the --merge-input-intervals arg. There's no
+    # data in between, since we didn't run HaplotypeCaller over those loci, so we're not wasting
+    # any compute.
+
+    # The memory setting here is very important and must be several GiB lower
+    # than the total memory allocated to the VM because this tool uses
+    # a significant amount of non-heap memory for native libraries.
+    # Also, testing has shown that the multithreaded reader initialization
+    # does not scale well beyond 5 threads, so don't increase beyond that.
+    gatk --java-options -Xms8g \
+      GenomicsDBImport \
+      --genomicsdb-workspace-path ~{workspace_dir_name} \
+      --batch-size ~{batch_size} \
+      -L ~{interval} \
+      --sample-name-map ~{sample_name_map} \
+      --reader-threads 5 \
+      --merge-input-intervals \
+      --consolidate
+
+    tar -cf ~{workspace_dir_name}.tar ~{workspace_dir_name}
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 4
+    disks: "local-disk " + disk_size + " HDD"
+    docker: gatk_docker
+    preemptible: 1
+  }
+
+  output {
+    File output_genomicsdb = "~{workspace_dir_name}.tar"
+  }
+}
+
+task GenotypeGVCFs {
+
+  input {
+    File workspace_tar
+    File interval
+
+    String output_vcf_filename
+
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+
+    String dbsnp_vcf
+
+    Int disk_size
+    # This is needed for gVCFs generated with GATK3 HaplotypeCaller
+    Boolean allow_old_rms_mapping_quality_annotation_data = false
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.4.0"
+  }
+
+  parameter_meta {
+    interval: {
+      localization_optional: true
+    }
+  }
+
+  command <<<
+    set -euo pipefail
+
+    tar -xf ~{workspace_tar}
+    WORKSPACE=$(basename ~{workspace_tar} .tar)
+
+    gatk --java-options -Xms8g \
+      GenotypeGVCFs \
+      -R ~{ref_fasta} \
+      -O ~{output_vcf_filename} \
+      -D ~{dbsnp_vcf} \
+      -G StandardAnnotation -G AS_StandardAnnotation \
+      --only-output-calls-starting-in-intervals \
+      --use-new-qual-calculator \
+      -V gendb://$WORKSPACE \
+      -L ~{interval} \
+      ~{true='--allow-old-rms-mapping-quality-annotation-data' false='' allow_old_rms_mapping_quality_annotation_data} \
+      --merge-input-intervals
+  >>>
+
+  runtime {
+    memory: "26 GiB"
+    cpu: 2
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: 1
+    docker: gatk_docker
+  }
+
+  output {
+    File output_vcf = "~{output_vcf_filename}"
+    File output_vcf_index = "~{output_vcf_filename}.tbi"
+  }
+}
+
+task GnarlyGenotyper {
+
+  input {
+    File workspace_tar
+    File interval
+    String output_vcf_filename
+    File ref_fasta
+    File ref_fasta_index
+    File ref_dict
+    String dbsnp_vcf
+
+    String gatk_docker = "us.gcr.io/broad-gotc-prod/gnarly_genotyper:fixNegativeRefCount"
+  }
+
+  parameter_meta {
+    interval: {
+      localization_optional: true
+    }
+  }
+
+  Int disk_size = ceil(size(workspace_tar, "GiB") + size(ref_fasta, "GiB") + size(dbsnp_vcf, "GiB") * 3)
+
+  command <<<
+    set -e
+
+    tar -xf ~{workspace_tar}
+    WORKSPACE=$(basename ~{workspace_tar} .tar)
+
+    # Use a query.json to set some params that aren't exposed as tool arguments -- ewwwww
+    cat <<EOF > $WORKSPACE/query.json
+    {
+      "scan_full": true,
+      "workspace": "genomicsdb",
+      "array": "genomicsdb_array",
+      "vid_mapping_file": "genomicsdb/vidmap.json",
+      "callset_mapping_file": "genomicsdb/callset.json",
+      "reference_genome":
"/cromwell_root/broad-references/hg38/v0/Homo_sapiens_assembly38.fasta", + "max_diploid_alt_alleles_that_can_be_genotyped": 6, + "produce_GT_field": true + } + EOF + + gatk --java-options -Xms8g \ + GnarlyGenotyper \ + -R ~{ref_fasta} \ + -O ~{output_vcf_filename} \ + --output-database-name annotationDB.vcf.gz \ + -D ~{dbsnp_vcf} \ + --only-output-calls-starting-in-intervals \ + --use-new-qual-calculator \ + -V gendb://$WORKSPACE \ + -L ~{interval} \ + -stand-call-conf 10 \ + --merge-input-intervals + >>> + + runtime { + memory: "26 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_filename}" + File output_vcf_index = "~{output_vcf_filename}.tbi" + File output_database = "annotationDB.vcf.gz" + File output_database_index = "annotationDB.vcf.gz.tbi" + } +} + +task HardFilterAndMakeSitesOnlyVcf { + + input { + File vcf + File vcf_index + Float excess_het_threshold + + String variant_filtered_vcf_filename + String sites_only_vcf_filename + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms3g \ + VariantFiltration \ + --filter-expression "ExcessHet > ~{excess_het_threshold}" \ + --filter-name ExcessHet \ + -O ~{variant_filtered_vcf_filename} \ + -V ~{vcf} + + gatk --java-options -Xms3g \ + MakeSitesOnlyVcf \ + -I ~{variant_filtered_vcf_filename} \ + -O ~{sites_only_vcf_filename} + >>> + + runtime { + memory: "3.75 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File variant_filtered_vcf = "~{variant_filtered_vcf_filename}" + File variant_filtered_vcf_index = "~{variant_filtered_vcf_filename}.tbi" + File sites_only_vcf = "~{sites_only_vcf_filename}" + File sites_only_vcf_index = "~{sites_only_vcf_filename}.tbi" + } +} + +task IndelsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File mills_resource_vcf + File axiomPoly_resource_vcf + File dbsnp_resource_vcf + File mills_resource_vcf_index + File axiomPoly_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 4 + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms24g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode INDEL \ + --max-gaussians ~{max_gaussians} \ + -resource:mills,known=false,training=true,truth=true,prior=12 ~{mills_resource_vcf} \ + -resource:axiomPoly,known=false,training=true,truth=false,prior=10 ~{axiomPoly_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=2 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "26 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + 
File tranches = "~{tranches_filename}" + } +} + +task SNPsVariantRecalibratorCreateModel { + + input { + String recalibration_filename + String tranches_filename + Int downsampleFactor + String model_report_filename + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms100g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + --sample-every-Nth-variant ~{downsampleFactor} \ + --output-model ~{model_report_filename} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "104 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File model_report = "~{model_report_filename}" + } +} + +task SNPsVariantRecalibrator { + + input { + String recalibration_filename + String tranches_filename + File? model_report + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + File sites_only_variant_filtered_vcf + File sites_only_variant_filtered_vcf_index + + File hapmap_resource_vcf + File omni_resource_vcf + File one_thousand_genomes_resource_vcf + File dbsnp_resource_vcf + File hapmap_resource_vcf_index + File omni_resource_vcf_index + File one_thousand_genomes_resource_vcf_index + File dbsnp_resource_vcf_index + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + Int? 
machine_mem_gb + + } + + Int auto_mem = ceil(2 * size([sites_only_variant_filtered_vcf, + hapmap_resource_vcf, + omni_resource_vcf, + one_thousand_genomes_resource_vcf, + dbsnp_resource_vcf], + "GiB")) + Int machine_mem = select_first([machine_mem_gb, if auto_mem < 7 then 7 else auto_mem]) + Int java_mem = machine_mem - 1 + + + String model_report_arg = if defined(model_report) then "--input-model $MODEL_REPORT --output-tranches-for-scatter" else "" + + command <<< + set -euo pipefail + + MODEL_REPORT=~{model_report} + + gatk --java-options -Xms~{java_mem}g \ + VariantRecalibrator \ + -V ~{sites_only_variant_filtered_vcf} \ + -O ~{recalibration_filename} \ + --tranches-file ~{tranches_filename} \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + ~{model_report_arg} \ + --max-gaussians ~{max_gaussians} \ + -resource:hapmap,known=false,training=true,truth=true,prior=15 ~{hapmap_resource_vcf} \ + -resource:omni,known=false,training=true,truth=true,prior=12 ~{omni_resource_vcf} \ + -resource:1000G,known=false,training=true,truth=false,prior=10 ~{one_thousand_genomes_resource_vcf} \ + -resource:dbsnp,known=true,training=false,truth=false,prior=7 ~{dbsnp_resource_vcf} + >>> + + runtime { + memory: "~{machine_mem} GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File recalibration = "~{recalibration_filename}" + File recalibration_index = "~{recalibration_filename}.idx" + File tranches = "~{tranches_filename}" + } +} + +task GatherTranches { + + input { + Array[File] tranches + String output_filename + Int disk_size + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" + } + + parameter_meta { + tranches: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + tranches_fofn=~{write_lines(tranches)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
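+    # rev | cut -d '/' -f 1 | rev below keeps only the text after the final '/',
+    # i.e. the basename of each copied tranches file, when building inputs.list.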
+ + # This is here to deal with the JES bug where commands may be run twice + rm -rf tranches + mkdir tranches + RETRY_LIMIT=5 + + count=0 + until cat $tranches_fofn | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I tranches/; do + sleep 1 + ((count++)) && ((count >= $RETRY_LIMIT)) && break + done + if [ "$count" -ge "$RETRY_LIMIT" ]; then + echo 'Could not copy all the tranches from the cloud' && exit 1 + fi + + cat $tranches_fofn | rev | cut -d '/' -f 1 | rev | awk '{print "tranches/" $1}' > inputs.list + + /usr/gitc/gatk --java-options -Xms6g \ + GatherTranches \ + --input inputs.list \ + --output ~{output_filename} + >>> + + runtime { + memory: "7.5 GiB" + cpu: "2" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File tranches = "~{output_filename}" + } +} + +task ApplyRecalibration { + + input { + String recalibrated_vcf_filename + File input_vcf + File input_vcf_index + File indels_recalibration + File indels_recalibration_index + File indels_tranches + File snps_recalibration + File snps_recalibration_index + File snps_tranches + Float indel_filter_level + Float snp_filter_level + Boolean use_allele_specific_annotations + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms5g \ + ApplyVQSR \ + -O tmp.indel.recalibrated.vcf \ + -V ~{input_vcf} \ + --recal-file ~{indels_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{indels_tranches} \ + --truth-sensitivity-filter-level ~{indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + gatk --java-options -Xms5g \ + ApplyVQSR \ + -O ~{recalibrated_vcf_filename} \ + -V tmp.indel.recalibrated.vcf \ + --recal-file ~{snps_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{snps_tranches} \ + --truth-sensitivity-filter-level ~{snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File recalibrated_vcf = "~{recalibrated_vcf_filename}" + File recalibrated_vcf_index = "~{recalibrated_vcf_filename}.tbi" + } +} + +task GatherVcfs { + + input { + Array[File] input_vcfs + String output_vcf_name + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + parameter_meta { + input_vcfs: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + # --ignore-safety-checks makes a big performance difference so we include it in our invocation. + # This argument disables expensive checks that the file headers contain the same set of + # genotyped samples and that files are in order by position of first record. 
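+    # tabix (run after the gather below) writes the .tbi index that the output block
+    # declares, so the gathered output is expected to be a bgzipped .vcf.gz.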
+ gatk --java-options -Xms6g \ + GatherVcfsCloud \ + --ignore-safety-checks \ + --gather-type BLOCK \ + --input ~{sep=" --input " input_vcfs} \ + --output ~{output_vcf_name} + + tabix ~{output_vcf_name} + >>> + + runtime { + memory: "7 GiB" + cpu: "1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{output_vcf_name}" + File output_vcf_index = "~{output_vcf_name}.tbi" + } +} + +task SelectFingerprintSiteVariants { + + input { + File input_vcf + File haplotype_database + String base_output_name + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + parameter_meta { + input_vcf: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + function hdb_to_interval_list() { + input=$1 + awk 'BEGIN{IFS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' $1 + } + + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + gatk --java-options -Xms6g \ + SelectVariants \ + --variant ~{input_vcf} \ + --intervals hdb.interval_list \ + --output ~{base_output_name}.vcf.gz + >>> + + runtime { + memory: "7.5 GiB" + cpu: 1 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } + + output { + File output_vcf = "~{base_output_name}.vcf.gz" + File output_vcf_index = "~{base_output_name}.vcf.gz.tbi" + } +} + +task CollectVariantCallingMetrics { + + input { + File input_vcf + File input_vcf_index + String metrics_filename_prefix + File dbsnp_vcf + File dbsnp_vcf_index + File interval_list + File ref_dict + Int disk_size + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0" + } + + command <<< + set -euo pipefail + + gatk --java-options -Xms6g \ + CollectVariantCallingMetrics \ + --INPUT ~{input_vcf} \ + --DBSNP ~{dbsnp_vcf} \ + --SEQUENCE_DICTIONARY ~{ref_dict} \ + --OUTPUT ~{metrics_filename_prefix} \ + --THREAD_COUNT 8 \ + --TARGET_INTERVALS ~{interval_list} + >>> + + output { + File detail_metrics_file = "~{metrics_filename_prefix}.variant_calling_detail_metrics" + File summary_metrics_file = "~{metrics_filename_prefix}.variant_calling_summary_metrics" + } + + runtime { + memory: "7.5 GiB" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + docker: gatk_docker + } +} + +task GatherVariantCallingMetrics { + + input { + Array[File] input_details + Array[File] input_summaries + String output_prefix + Int disk_size + String gatk_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:1.3.0-1527875152" + } + + parameter_meta { + input_details: { + localization_optional: true + } + input_summaries: { + localization_optional: true + } + } + + command <<< + set -euo pipefail + + input_details_fofn=~{write_lines(input_details)} + input_summaries_fofn=~{write_lines(input_summaries)} + + # Jose says: + # Cromwell will fall over if we have it try to localize tens of thousands of files, + # so we manually localize files using gsutil. + # Using gsutil also lets us parallelize the localization, which (as far as we can tell) + # PAPI doesn't do. 
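+    # The until-loops below retry the bulk gsutil copy up to RETRY_LIMIT (5) times and
+    # fail the task if the metrics still haven't all been copied.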
+
+    # This is here to deal with the JES bug where commands may be run twice
+    rm -rf metrics
+    mkdir metrics
+    RETRY_LIMIT=5
+
+    count=0
+    until cat "$input_details_fofn" | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do
+      sleep 1
+      ((count++)) && ((count >= $RETRY_LIMIT)) && break
+    done
+    if [ "$count" -ge "$RETRY_LIMIT" ]; then
+      echo 'Could not copy all the metrics from the cloud' && exit 1
+    fi
+
+    count=0
+    until cat "$input_summaries_fofn" | /root/google-cloud-sdk/bin/gsutil -m cp -L cp.log -c -I metrics/; do
+      sleep 1
+      ((count++)) && ((count >= $RETRY_LIMIT)) && break
+    done
+    if [ "$count" -ge "$RETRY_LIMIT" ]; then
+      echo 'Could not copy all the metrics from the cloud' && exit 1
+    fi
+
+    # Build one "--INPUT metrics/<prefix>" argument per localized metrics file.
+    INPUT=$(cat "$input_details_fofn" | rev | cut -d '/' -f 1 | rev | sed 's/\.variant_calling_detail_metrics//g' | awk '{printf("--INPUT metrics/%s ", $1)}')
+
+    /usr/gitc/gatk --java-options "-Xms2g" \
+      AccumulateVariantCallingMetrics \
+      $INPUT \
+      --OUTPUT ~{output_prefix}
+  >>>
+
+  runtime {
+    memory: "3 GiB"
+    cpu: "1"
+    disks: "local-disk " + disk_size + " HDD"
+    preemptible: 1
+    docker: gatk_docker
+  }
+
+  output {
+    File detail_metrics_file = "~{output_prefix}.variant_calling_detail_metrics"
+    File summary_metrics_file = "~{output_prefix}.variant_calling_summary_metrics"
+  }
+}
+
+task CrossCheckFingerprint {
+
+  input {
+    Array[File] gvcf_paths
+    Array[File] vcf_paths
+    File sample_name_map
+    File haplotype_database
+    String output_base_name
+    Boolean scattered = false
+    Array[String] expected_inconclusive_samples = []
+    String picard_docker = "us.gcr.io/broad-gotc-prod/gatk4-joint-genotyping:yf_fire_crosscheck_picard_with_nio_fast_fail_fast_sample_map"
+  }
+
+  parameter_meta {
+    gvcf_paths: {
+      localization_optional: true
+    }
+    vcf_paths: {
+      localization_optional: true
+    }
+  }
+
+  Int num_gvcfs = length(gvcf_paths)
+  Int cpu = if num_gvcfs < 32 then num_gvcfs else 32
+  # Compute memory to use based on the CPU count, following the 3.75 GiB / CPU
+  # pattern of GCP's pricing: https://cloud.google.com/compute/pricing
+  Int memMb = round(cpu * 3.75 * 1024)
+  Int disk = 100
+
+  String output_name = output_base_name + ".fingerprintcheck"
+
+  command <<<
+    set -eu
+
+    gvcfInputsList=~{write_lines(gvcf_paths)}
+    vcfInputsList=~{write_lines(vcf_paths)}
+
+    cp "$gvcfInputsList" gvcf_inputs.list
+    cp "$vcfInputsList" vcf_inputs.list
+
+    java -Dpicard.useLegacyParser=false -Xms~{memMb - 512}m \
+      -jar /usr/gitc/PicardPublicWithCrosscheckNIOandSampleMapping.jar \
+      CrosscheckFingerprints \
+      --INPUT gvcf_inputs.list \
+      --SECOND_INPUT vcf_inputs.list \
+      --HAPLOTYPE_MAP ~{haplotype_database} \
+      --INPUT_SAMPLE_FILE_MAP ~{sample_name_map} \
+      --CROSSCHECK_BY SAMPLE \
+      --CROSSCHECK_MODE CHECK_SAME_SAMPLE \
+      --NUM_THREADS ~{cpu} \
+      --SKIP_INPUT_READABLITY_TEST \
+      ~{true='--EXIT_CODE_WHEN_MISMATCH 0' false='' scattered} \
+      --OUTPUT ~{output_name}
+
+    if ~{scattered}; then
+      # UNEXPECTED_MATCH is not possible with CHECK_SAME_SAMPLE
+      matches=$(grep "EXPECTED_MATCH" ~{output_name} | wc -l)
+
+      # Count the INCONCLUSIVE results that belong to samples we expected to be
+      # inconclusive.
+      expectedInconclusiveSamples=("~{sep='" "' expected_inconclusive_samples}")
+      inconclusiveSamplesCount=0
+      inconclusiveSamples=($(grep 'INCONCLUSIVE' ~{output_name} | cut -f 1))
+      for sample in "${inconclusiveSamples[@]}"; do
+        if printf '%s\n' "${expectedInconclusiveSamples[@]}" | grep -qP "^${sample}$"; then
+          inconclusiveSamplesCount=$((inconclusiveSamplesCount+1))
+        fi
+      done
+
+      total_matches=$((inconclusiveSamplesCount + matches))
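+
+      # The shard passes only when every sample is accounted for, either as an
+      # EXPECTED_MATCH or as a sample on the expected-inconclusive allow-list.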
+      if [[ ${total_matches} -eq ~{num_gvcfs} ]]; then
+        >&2 echo "Found the correct number of matches (~{num_gvcfs}) for this shard"
+      else
+        >&2 echo "ERROR: Found ${total_matches} matched or expected-inconclusive samples, but expected ~{num_gvcfs}"
+        exit 1
+      fi
+    fi
+  >>>
+
+  runtime {
+    memory: memMb + " MiB"
+    disks: "local-disk " + disk + " HDD"
+    preemptible: 0
+    docker: picard_docker
+  }
+
+  output {
+    File crosscheck_metrics = output_name
+  }
+}
+
+task GatherPicardMetrics {
+
+  input {
+    Array[File] metrics_files
+    String output_file_name
+    Int disk_size
+  }
+
+  command {
+    # Don't use this task to gather tens of thousands of files;
+    # Cromwell can't handle it.
+
+    # Note: this cannot gather metrics files that contain histograms.
+
+    # Keep the seven-line metrics header from the first file, then append the
+    # non-blank data rows from every file.
+    head -n 7 ~{metrics_files[0]} > ~{output_file_name}
+
+    for metrics_file in ~{sep=' ' metrics_files}; do
+      sed -n '1,7d;p' "$metrics_file" | grep -v '^$' >> ~{output_file_name}
+    done
+  }
+
+  output {
+    File gathered_metrics = "~{output_file_name}"
+  }
+
+  runtime {
+    cpu: 1
+    memory: "3.75 GiB"
+    preemptible: 1
+    disks: "local-disk " + disk_size + " HDD"
+    docker: "us.gcr.io/broad-gotc-prod/python:2.7"
+  }
+}
+
+task GetFingerprintingIntervalIndices {
+
+  input {
+    Array[File] unpadded_intervals
+    File haplotype_database
+    String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.1.1.0"
+  }
+
+  command <<<
+    set -xeo pipefail
+
+    function rename_intervals(){
+      interval_list=$1
+      name=$2
+
+      # Rewrite the name column (field 5) of every interval record. The name is
+      # passed in with -v rather than spliced into the awk program, and awk's
+      # field separator is FS, not IFS.
+      awk -v name="$name" 'BEGIN{FS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {$5=name; print}' "$interval_list"
+    }
+    export -f rename_intervals
+
+    function hdb_to_interval_list(){
+      input=$1
+
+      # Emit one single-base interval per haplotype-database site, keeping the
+      # '@' header lines and dropping the '#CHROM' column-header line.
+      awk 'BEGIN{FS="\t";OFS="\t";} $0~"^@"{print;next;} $0~"#CHROM"{next;} {print $1,$2,$2,"+","interval-"NR}' "$input"
+    }
+
+    function rename_scatter(){
+      file=$1
+      number=$(echo "$file" | sed -E 's|([0-9]+)-scattered\.interval.*|\1|')
+      rename_intervals "$file" "$number" > scattered.renamed.$number.interval_list
+    }
+    export -f rename_scatter
+
+    # rename the intervals within each interval_list according to the number in the name of the list
+    cp ~{sep=' ' unpadded_intervals} ./
+
+    cat ~{write_lines(unpadded_intervals)} | xargs -n1 basename | xargs -I{} bash -c 'rename_scatter $@' _ {}
+
+    # take the header from the first renamed interval_list
+    find . -name "scattered.renamed.*.interval_list" | head -n1 | xargs cat | grep '^@' > all.interval_list
+
+    # concatenate the resulting intervals (with no header)
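+    # (interval_list files are SAM-style: '@' header lines followed by one
+    # tab-separated CHROM/START/END/STRAND/NAME record per line, so the header
+    # and the records can safely be gathered in separate passes.)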
-name "scattered.renamed.*.interval_list" | xargs cat | grep -v '^@' >> all.interval_list + + # convert the Haplotype_database to an interval_list + hdb_to_interval_list ~{haplotype_database} > hdb.interval_list + + # find the intervals that overlap the haplotype_database + gatk IntervalListTools \ + -ACTION OVERLAPS \ + -O all.sorted.interval_list \ + -I all.interval_list \ + -SI hdb.interval_list + + if grep -v '^@' all.sorted.interval_list; then + grep -v '^@' all.sorted.interval_list | awk '{FS="\t"; print $5}' | uniq > indices.out + else + touch indices.out + fi + >>> + + output { + Array[String] indices_to_fingerprint = read_lines("indices.out") + File all_sorted_interval_list = "all.sorted.interval_list" + File all_interval_list = "all.interval_list" + File hdb_interval_list = "hdb.interval_list" + } + + runtime { + cpu: 2 + memory: "3.75 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: gatk_docker + } +} + +task PartitionSampleNameMap { + + input { + File sample_name_map + Int line_limit + } + + command { + + cut -f 2 ~{sample_name_map} > sample_paths + split -l ~{line_limit} -d sample_paths partition_ + + # Let the OS catch up with creation of files for glob command + sleep 1 + } + + output { + Array[File] partitions = glob("partition_*") + } + + runtime { + memory: "1 GiB" + preemptible: 1 + disks: "local-disk 10 HDD" + docker: "us.gcr.io/broad-gotc-prod/python:2.7" + } +}