From 517a88255e79af339fb625dd472f075ec6521d9e Mon Sep 17 00:00:00 2001
From: Mark Walker
Date: Mon, 19 Jun 2023 11:12:38 -0400
Subject: [PATCH] SVConcordance workflows update (#540)

---
 .../CleanVcf.SingleBatch.json.tmpl            |   2 +-
 .../CleanVcf.json.tmpl                        |   2 +-
 ...otypeComplexVariants.SingleBatch.json.tmpl |   2 +-
 .../GenotypeComplexVariants.json.tmpl         |   2 +-
 .../FormatVcfForGatk.json.tmpl                |  12 +
 .../test/JoinRawCalls/JoinRawCalls.json.tmpl  |  13 +-
 .../test/MakeCohortVcf/CleanVcf.json.tmpl     |   2 +-
 .../GenotypeComplexVariants.json.tmpl         |   2 +-
 .../SVConcordance/SVConcordance.json.tmpl     |  16 +-
 inputs/templates/test/Vapor/Vapor.json.tmpl   |  24 +-
 inputs/values/dockers.json                    |   5 +-
 inputs/values/dockers_azure.json              |   3 +-
 inputs/values/hgdp.json                       | 147 ++++++++
 inputs/values/ref_panel_1kg.json              |   6 +-
 inputs/values/resources_hg38.json             |   1 +
 scripts/inputs/build_default_inputs.sh        |   4 +
 scripts/inputs/build_inputs.py                |   8 +-
 scripts/test/validate.sh                      |   2 +-
 .../scripts/format_gatk_vcf_for_svtk.py       |  43 ++-
 .../scripts/format_svtk_vcf_for_gatk.py       | 326 +++++-------------
 wdl/CleanVcf.wdl                              |  23 +-
 wdl/CleanVcfChromosome.wdl                    |  24 +-
 wdl/ClusterBatch.wdl                          |   1 +
 wdl/FormatVcfForGatk.wdl                      | 145 ++++++++
 wdl/GATKSVPipelineSingleSample.wdl            |   4 +-
 wdl/GenotypeComplexVariants.wdl               |   6 +-
 wdl/GenotypeCpxCnvs.wdl                       |   4 +-
 wdl/JoinRawCalls.wdl                          |  70 +++-
 wdl/MakeCohortVcf.wdl                         |   8 +-
 wdl/PESRClustering.wdl                        |   9 +-
 wdl/SVConcordance.wdl                         | 183 +---------
 wdl/ScatterCpxGenotyping.wdl                  |   4 +-
 wdl/TasksClusterBatch.wdl                     |  21 +-
 wdl/TasksMakeCohortVcf.wdl                    |  83 -----
 34 files changed, 581 insertions(+), 626 deletions(-)
 create mode 100644 inputs/templates/test/FormatVcfForGatk/FormatVcfForGatk.json.tmpl
 create mode 100644 inputs/values/hgdp.json
 create mode 100644 wdl/FormatVcfForGatk.wdl

diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl
index 4de4c554d..dc37d432f 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.SingleBatch.json.tmpl
@@ -20,7 +20,7 @@
   "CleanVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}",
 
   "CleanVcf.cohort_name": "${this.sample_set_id}",
-  "CleanVcf.merged_ped_file": "${workspace.cohort_ped_file}",
+  "CleanVcf.ped_file": "${workspace.cohort_ped_file}",
   "CleanVcf.complex_genotype_vcfs": "${this.complex_genotype_vcfs}",
   "CleanVcf.complex_resolve_bothside_pass_lists": "${this.complex_resolve_bothside_pass_lists}",
   "CleanVcf.complex_resolve_background_fail_lists": "${this.complex_resolve_background_fail_lists}"
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl
index a71e75b17..d2c5d265e 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/CleanVcf.json.tmpl
@@ -20,7 +20,7 @@
   "CleanVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}",
 
   "CleanVcf.cohort_name": "${this.sample_set_set_id}",
-  "CleanVcf.merged_ped_file": "${workspace.cohort_ped_file}",
+  "CleanVcf.ped_file": "${workspace.cohort_ped_file}",
   "CleanVcf.complex_genotype_vcfs": "${this.complex_genotype_vcfs}",
   "CleanVcf.complex_resolve_bothside_pass_lists": "${this.complex_resolve_bothside_pass_lists}",
   "CleanVcf.complex_resolve_background_fail_lists": "${this.complex_resolve_background_fail_lists}"
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl
index 2120cff6c..53f0c6d3d 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.SingleBatch.json.tmpl
@@ -15,7 +15,7 @@
   "GenotypeComplexVariants.depth_vcfs": "${this.regenotyped_depth_vcfs}",
   "GenotypeComplexVariants.complex_resolve_vcfs": "${this.complex_resolve_vcfs}",
   "GenotypeComplexVariants.complex_resolve_vcf_indexes": "${this.complex_resolve_vcf_indexes}",
-  "GenotypeComplexVariants.merged_ped_file": "${workspace.cohort_ped_file}",
+  "GenotypeComplexVariants.ped_file": "${workspace.cohort_ped_file}",
   "GenotypeComplexVariants.bincov_files": "${this.merged_bincov}",
   "GenotypeComplexVariants.depth_gt_rd_sep_files": "${this.trained_genotype_depth_depth_sepcutoff}",
   "GenotypeComplexVariants.median_coverage_files": "${this.median_cov}"
diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.json.tmpl
index 91dec1548..0b8ca397a 100644
--- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.json.tmpl
+++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/GenotypeComplexVariants.json.tmpl
@@ -15,7 +15,7 @@
   "GenotypeComplexVariants.depth_vcfs": "${this.regenotyped_depth_vcfs}",
   "GenotypeComplexVariants.complex_resolve_vcfs": "${this.complex_resolve_vcfs}",
   "GenotypeComplexVariants.complex_resolve_vcf_indexes": "${this.complex_resolve_vcf_indexes}",
-  "GenotypeComplexVariants.merged_ped_file": "${workspace.cohort_ped_file}",
+  "GenotypeComplexVariants.ped_file": "${workspace.cohort_ped_file}",
   "GenotypeComplexVariants.bincov_files": "${this.sample_sets.merged_bincov}",
   "GenotypeComplexVariants.depth_gt_rd_sep_files": "${this.sample_sets.trained_genotype_depth_depth_sepcutoff}",
   "GenotypeComplexVariants.median_coverage_files": "${this.sample_sets.median_cov}"
diff --git a/inputs/templates/test/FormatVcfForGatk/FormatVcfForGatk.json.tmpl b/inputs/templates/test/FormatVcfForGatk/FormatVcfForGatk.json.tmpl
new file mode 100644
index 000000000..e81f46244
--- /dev/null
+++ b/inputs/templates/test/FormatVcfForGatk/FormatVcfForGatk.json.tmpl
@@ -0,0 +1,12 @@
+{
+  "FormatVcfForGatk.vcf": {{ test_batch.clean_vcf | tojson }},
+  "FormatVcfForGatk.prefix": {{ test_batch.name | tojson }},
+  "FormatVcfForGatk.ped_file": {{ test_batch.ped_file | tojson }},
+  "FormatVcfForGatk.formatter_args": {{ test_batch.clean_vcf_gatk_formatter_args | tojson }},
+  "FormatVcfForGatk.contig_list": {{ reference_resources.primary_contigs_list | tojson }},
+  "FormatVcfForGatk.chr_x": {{ reference_resources.chr_x | tojson }},
+  "FormatVcfForGatk.chr_y": {{ reference_resources.chr_y | tojson }},
+  "FormatVcfForGatk.contigs_header": {{ reference_resources.contigs_header | tojson }},
+  "FormatVcfForGatk.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }},
+  "FormatVcfForGatk.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}
+}
diff --git a/inputs/templates/test/JoinRawCalls/JoinRawCalls.json.tmpl b/inputs/templates/test/JoinRawCalls/JoinRawCalls.json.tmpl
index 332634643..732d238ef 100644
--- a/inputs/templates/test/JoinRawCalls/JoinRawCalls.json.tmpl
+++ b/inputs/templates/test/JoinRawCalls/JoinRawCalls.json.tmpl
@@ -1,19 +1,26 @@
 {
-  "JoinRawCalls.gatk_docker":{{ dockers.gatk_docker | tojson }},
+  "JoinRawCalls.gatk_docker": {{ dockers.gatk_docker | tojson }},
   "JoinRawCalls.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }},
   "JoinRawCalls.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
 
   "JoinRawCalls.clustered_depth_vcfs" : [{{ test_batch.merged_depth_vcf | tojson }}],
+  "JoinRawCalls.clustered_depth_vcf_indexes" : [{{ test_batch.merged_depth_vcf_index | tojson }}],
+  "JoinRawCalls.clustered_manta_vcfs" : [{{ test_batch.merged_manta_vcf | tojson }}],
+  "JoinRawCalls.clustered_manta_vcf_indexes" : [{{ test_batch.merged_manta_vcf_index | tojson }}],
+  "JoinRawCalls.clustered_wham_vcfs" : [{{ test_batch.merged_wham_vcf | tojson }}],
+  "JoinRawCalls.clustered_wham_vcf_indexes" : [{{ test_batch.merged_wham_vcf_index | tojson }}],
+  "JoinRawCalls.clustered_melt_vcfs" : [{{ test_batch.merged_melt_vcf | tojson }}],
+  "JoinRawCalls.clustered_melt_vcf_indexes" : [{{ test_batch.merged_melt_vcf_index | tojson }}],
 
-  "JoinRawCalls.ploidy_table": {{ test_batch.ploidy_table | tojson }},
+  "JoinRawCalls.ped_file": {{ test_batch.ped_file | tojson }},
 
   "JoinRawCalls.contig_list": {{ reference_resources.primary_contigs_list | tojson }},
   "JoinRawCalls.reference_fasta": {{ reference_resources.reference_fasta | tojson }},
   "JoinRawCalls.reference_fasta_fai": {{ reference_resources.reference_index | tojson }},
   "JoinRawCalls.reference_dict": {{ reference_resources.reference_dict | tojson }},
 
-  "JoinRawCalls.cohort": {{ test_batch.name | tojson }}
+  "JoinRawCalls.prefix": {{ test_batch.name | tojson }}
 }
diff --git a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl
index 2b2f36962..f79f52fc6 100644
--- a/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl
+++ b/inputs/templates/test/MakeCohortVcf/CleanVcf.json.tmpl
@@ -20,7 +20,7 @@
   "CleanVcf.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }},
 
   "CleanVcf.cohort_name": {{ test_batch.name | tojson }},
-  "CleanVcf.merged_ped_file": {{ test_batch.ped_file | tojson }},
+  "CleanVcf.ped_file": {{ test_batch.ped_file | tojson }},
   "CleanVcf.complex_genotype_vcfs": {{ test_batch.complex_genotype_vcfs | tojson }},
   "CleanVcf.complex_resolve_bothside_pass_lists": {{ test_batch.complex_resolve_bothside_pass_lists | tojson }},
   "CleanVcf.complex_resolve_background_fail_lists": {{ test_batch.complex_resolve_background_fail_lists | tojson }}
diff --git a/inputs/templates/test/MakeCohortVcf/GenotypeComplexVariants.json.tmpl b/inputs/templates/test/MakeCohortVcf/GenotypeComplexVariants.json.tmpl
index e4df9a9df..16eaeceb3 100644
--- a/inputs/templates/test/MakeCohortVcf/GenotypeComplexVariants.json.tmpl
+++ b/inputs/templates/test/MakeCohortVcf/GenotypeComplexVariants.json.tmpl
@@ -19,7 +19,7 @@
   ],
   "GenotypeComplexVariants.complex_resolve_vcfs": {{ test_batch.complex_resolve_vcfs | tojson }},
   "GenotypeComplexVariants.complex_resolve_vcf_indexes": {{ test_batch.complex_resolve_vcf_indexes | tojson }},
-  "GenotypeComplexVariants.merged_ped_file": {{ test_batch.ped_file | tojson }},
+  "GenotypeComplexVariants.ped_file": {{ test_batch.ped_file | tojson }},
   "GenotypeComplexVariants.bincov_files": [
     {{ test_batch.merged_coverage_file | tojson }}
   ],
diff --git a/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl b/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl
index 07e98a4c5..0540305cd 100644
--- a/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl
+++ b/inputs/templates/test/SVConcordance/SVConcordance.json.tmpl
@@ -1,21 +1,11 @@
 {
-  "SVConcordance.gatk_docker":{{ dockers.gatk_docker_concordance | tojson }},
+  "SVConcordance.gatk_docker": {{ dockers.gatk_docker | tojson }},
   "SVConcordance.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }},
-  "SVConcordance.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }},
-  "SVConcordance.sv_utils_docker": {{ dockers.sv_utils_docker | tojson }},
 
-  "SVConcordance.eval_vcf" : {{ test_batch.clean_vcf | tojson }},
+  "SVConcordance.eval_vcf" : {{ test_batch.gatk_formatted_vcf | tojson }},
   "SVConcordance.truth_vcf" : {{ test_batch.joined_raw_calls_vcf | tojson }},
-  "SVConcordance.ploidy_table": {{ test_batch.ploidy_table | tojson }},
-  "SVConcordance.cohort": {{ test_batch.name | tojson }},
-
-  "SVConcordance.run_svutils_truth_vcf": "false",
-  "SVConcordance.run_formatter_truth_vcf": "false",
-
-  "SVConcordance.run_svutils_eval_vcf": "true",
-  "SVConcordance.run_formatter_eval_vcf": "true",
-  "SVConcordance.formatter_eval_args": "--only-add-cn-fields --replace-ev-format --filter-unsupported-types",
+  "SVConcordance.output_prefix": {{ test_batch.name | tojson }},
 
   "SVConcordance.contig_list": {{ reference_resources.primary_contigs_list | tojson }},
   "SVConcordance.reference_dict": {{ reference_resources.reference_dict | tojson }}
diff --git a/inputs/templates/test/Vapor/Vapor.json.tmpl b/inputs/templates/test/Vapor/Vapor.json.tmpl
index 582e1e9e7..0c7adf9c7 100644
--- a/inputs/templates/test/Vapor/Vapor.json.tmpl
+++ b/inputs/templates/test/Vapor/Vapor.json.tmpl
@@ -1,15 +1,15 @@
 {
-  "VaporBatch.sv_base_mini_docker" : {{ dockers.sv_base_mini_docker | tojson }},
-  "VaporBatch.sv_pipeline_docker" : {{ dockers.sv_pipeline_docker | tojson }},
-  "VaporBatch.vapor_docker": {{ dockers.vapor_docker | tojson }},
-  "VaporBatch.contigs": {{ reference_resources.primary_contigs_list | tojson }},
-  "VaporBatch.ref_fasta" : {{ reference_resources.reference_fasta | tojson }},
-  "VaporBatch.ref_fai" : {{ reference_resources.reference_index | tojson }},
-  "VaporBatch.ref_dict": {{ reference_resources.reference_dict | tojson }},
+  "Vapor.sv_base_mini_docker" : {{ dockers.sv_base_mini_docker | tojson }},
+  "Vapor.sv_pipeline_docker" : {{ dockers.sv_pipeline_docker | tojson }},
+  "Vapor.vapor_docker": {{ dockers.vapor_docker | tojson }},
+  "Vapor.contigs": {{ reference_resources.primary_contigs_list | tojson }},
+  "Vapor.ref_fasta" : {{ reference_resources.reference_fasta | tojson }},
+  "Vapor.ref_fai" : {{ reference_resources.reference_index | tojson }},
+  "Vapor.ref_dict": {{ reference_resources.reference_dict | tojson }},
 
-  "VaporBatch.prefix": {{ test_batch.example_pacbio_sample_id | tojson }},
-  "VaporBatch.sample_id": {{ test_batch.example_pacbio_sample_id | tojson }},
-  "VaporBatch.bam_or_cram_file": {{ test_batch.example_pacbio_cram | tojson }},
-  "VaporBatch.bam_or_cram_index": {{ test_batch.example_pacbio_cram_index | tojson }},
-  "VaporBatch.bed_file": {{ test_batch.clean_bed | tojson }}
+  "Vapor.prefix": {{ test_batch.example_pacbio_sample_id | tojson }},
+  "Vapor.sample_id": {{ test_batch.example_pacbio_sample_id | tojson }},
+  "Vapor.bam_or_cram_file": {{ test_batch.example_pacbio_cram | tojson }},
+  "Vapor.bam_or_cram_index": {{ test_batch.example_pacbio_cram_index | tojson }},
+  "Vapor.bed_file": {{ test_batch.clean_bed | tojson }}
 }
diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json
index 4c3c2d5bf..4cffa8b4d 100644
--- a/inputs/values/dockers.json
+++ b/inputs/values/dockers.json
@@ -2,9 +2,8 @@
   "name": "dockers",
   "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2023-02-01-v0.26.8-beta-9b25c72d",
   "condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
-  "gatk_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
+  "gatk_docker": "us.gcr.io/broad-dsde-methods/markw/gatk:2023-05-16-4.4.0.0-17-g18edcd3e6-NIGHTLY-SNAPSHOT",
   "gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432",
-  "gatk_docker_concordance": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-sv-concordance-937c81",
   "genomes_in_the_cloud_docker": "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135",
   "linux_docker": "marketplace.gcr.io/google/ubuntu1804",
   "manta_docker": "us.gcr.io/broad-dsde-methods/vjalili/manta:5994670",
@@ -33,4 +32,4 @@
   "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-03-16-v0.27-beta-906c6272",
   "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f",
   "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2023-05-23-v0.27.3-beta-e537bdd6"
-}
\ No newline at end of file
+}
diff --git a/inputs/values/dockers_azure.json b/inputs/values/dockers_azure.json
index c324d341d..4b7aed7a8 100644
--- a/inputs/values/dockers_azure.json
+++ b/inputs/values/dockers_azure.json
@@ -2,9 +2,8 @@
   "name": "dockers",
   "cnmops_docker": "vahid.azurecr.io/gatk-sv/cnmops:2023-02-01-v0.26.8-beta-9b25c72d",
   "condense_counts_docker": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432",
-  "gatk_docker": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432",
+  "gatk_docker": "vahid.azurecr.io/markw/gatk:2023-05-16-4.4.0.0-17-g18edcd3e6-NIGHTLY-SNAPSHOT",
   "gatk_docker_pesr_override": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432",
-  "gatk_docker_concordance": "vahid.azurecr.io/markw/gatk:mw-sv-concordance-937c81",
   "genomes_in_the_cloud_docker": "vahid.azurecr.io/genomes-in-the-cloud:2.3.2-1510681135",
   "linux_docker": "vahid.azurecr.io/google/ubuntu1804",
   "manta_docker": "vahid.azurecr.io/vjalili/manta:5994670",
diff --git a/inputs/values/hgdp.json b/inputs/values/hgdp.json
new file mode 100644
index 000000000..5eae7726e
--- /dev/null
+++ b/inputs/values/hgdp.json
@@ -0,0 +1,147 @@
+{
+  "name": "hgdp",
+  "ped_file": "gs://gatk-sv-hgdp/mw-sv-concordance-update/HGDP_1KGP.ped",
+  "ploidy_table": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.ploidy_table.tsv",
+
+  "del_bed": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.DEL.bed.gz",
+  "dup_bed": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.DUP.bed.gz",
+  "std_wham_vcf_tar": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.wham.std_vcfs.37_missing.tar.gz",
+  "std_manta_vcf_tar": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.manta.std_vcfs.37_missing.tar.gz",
+  "std_melt_vcf_tar": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.melt.std_vcfs.37_missing.tar.gz",
+
+  "merged_depth_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.depth.vcf.gz",
+  "merged_depth_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.depth.vcf.gz.tbi",
+  "merged_manta_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.manta.vcf.gz",
+  "merged_manta_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.manta.vcf.gz.tbi",
+  "merged_melt_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.melt.vcf.gz",
+  "merged_melt_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.melt.vcf.gz.tbi",
+  "merged_wham_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.wham.vcf.gz",
+  "merged_wham_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.cluster_batch.wham.vcf.gz.tbi",
+
+  "clean_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp_and_hgsv.cleaned.vcf.gz",
+  "clean_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp_and_hgsv.cleaned.vcf.gz.tbi",
+  "clean_vcf_qc": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp_SV_VCF_QC_output.tar.gz",
+  "clean_bed": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.vcf2bed.bed.gz",
+
+  "clean_vcf_gatk_formatter_args": "--scale-down-gq",
+  "gatk_formatted_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.gatk_formatted.vcf.gz",
+  "gatk_formatted_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.gatk_formatted.vcf.gz.tbi",
+
+  "joined_raw_calls_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.join_raw_calls.vcf.gz",
+  "joined_raw_calls_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.join_raw_calls.vcf.gz.tbi",
+  "concordance_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.vcf.gz",
+  "concordance_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.vcf.gz.tbi",
+
+  "pacbio_sample_concordance_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/training/hgdp.concordance.subset.vcf.gz",
+  "pacbio_sample_concordance_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/training/hgdp.concordance.subset.vcf.gz.tbi",
+  "recalibrate_gq_truth_json": "gs://gatk-sv-hgdp/mw-sv-concordance-update/training/hgdp.gq_training_labels.json",
+
+  "hgdp_recalibrate_gq_model_file": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.gq_recalibrator.model",
+  "hgdp_recalibrated_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.hgdp_gq_recalibrated.vcf.gz",
+  "hgdp_recalibrated_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.hgdp_gq_recalibrated.vcf.gz.tbi",
+  "hgdp_sl_filtered_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.hgdp_gq_recalibrated.filtered.vcf.gz",
+  "hgdp_sl_filtered_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.hgdp_gq_recalibrated.filtered.vcf.gz.tbi",
+
+  "aou_recalibrate_gq_model_file": "gs://broad-dsde-methods-markw/gq-filter/aou.gq_recalibrator.v2.model",
+  "aou_recalibrated_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.aou_gq_recalibrated.vcf.gz",
+  "aou_recalibrated_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.concordance.aou_gq_recalibrated.vcf.gz.tbi",
+  "aou_sl_filtered_vcf": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.aou_gq_recalibrated.filtered.vcf.gz",
+  "aou_sl_filtered_vcf_index": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.aou_gq_recalibrated.filtered.vcf.gz.tbi",
+
+  "example_pacbio_sample_id": "HG00512",
+  "example_pacbio_cram": "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00512/HG00512.cram",
+  "example_pacbio_cram_index": "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00512/HG00512.cram.crai",
+
+  "pacbio_samples_list": "gs://gatk-sv-hgdp/mw-sv-concordance-update/hgdp.pacbio_samples.list",
+  "pacbio_samples": [
+    "HG00512",
+    "HG00513",
+    "HG00514",
+    "HG00731",
+    "HG00732",
+    "HG00733",
+    "NA12878",
+    "NA19238",
+    "NA19239",
+    "NA19240",
+    "NA24385"
+  ],
+  "vapor_files": [
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/HG00512.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/HG00513.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/HG00514.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/HG00731.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/HG00732.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/HG00733.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/NA12878.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/NA19238.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/NA19239.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/NA19240.bed.gz",
+    "gs://gatk-sv-hgdp/mw-sv-concordance-update/vapor/NA24385.bed.gz"
+  ],
+  "pacbio_crams": [
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00512/HG00512.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00513/HG00513.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00514/HG00514.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00731/HG00731.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00732/HG00732.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00733/HG00733.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA12878/NA12878.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA19238/NA19238.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA19239/NA19239.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA19240/NA19240.cram",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA24385/NA24385.cram"
+  ],
+  "pacbio_cram_indexes": [
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00512/HG00512.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00513/HG00513.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00514/HG00514.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00731/HG00731.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00732/HG00732.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/HG00733/HG00733.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA12878/NA12878.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA19238/NA19238.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA19239/NA19239.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA19240/NA19240.cram.crai",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/CompressLongreadsBam/NA24385/NA24385.cram.crai"
+  ],
+  "pacbio_pav_vcfs": [
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/HG00512/pav_HG00512.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/HG00513/pav_HG00513.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/HG00514/pav_HG00514.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/HG00731/pav_HG00731.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/HG00732/pav_HG00732.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/HG00733/pav_HG00733.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/NA12878/pav_NA12878.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/NA19238/pav_NA19238.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/NA19239/pav_NA19239.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/NA19240/pav_NA19240.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PAV/NA24385/pav_NA24385.vcf.gz"
+  ],
+  "pacbio_pbsv_vcfs": [
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00512/variants/sv/HG00512.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00513/variants/sv/HG00513.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00514/variants/sv/HG00514.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00731/variants/sv/HG00731.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00732/variants/sv/HG00732.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00733/variants/sv/HG00733.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA12878/variants/sv/NA12878.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA19238/variants/sv/NA19238.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA19239/variants/sv/NA19239.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA19240/variants/sv/NA19240.pbsv.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA24385/variants/sv/NA24385.pbsv.vcf.gz"
+  ],
+  "pacbio_sniffles_vcfs": [
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00512/variants/sv/HG00512.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00513/variants/sv/HG00513.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00514/variants/sv/HG00514.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00731/variants/sv/HG00731.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00732/variants/sv/HG00732.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/HG00733/variants/sv/HG00733.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA12878/variants/sv/NA12878.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA19238/variants/sv/NA19238.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA19239/variants/sv/NA19239.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA19240/variants/sv/NA19240.sniffles.vcf.gz",
+    "gs://fc-7891e5cf-0a7a-4c2f-8a18-0d05b27c53ab/GRCh38/PBCCSWholeGenome/NA24385/variants/sv/NA24385.sniffles.vcf.gz"
+  ]
+}
"gs://gatk-sv-ref-panel-1kg/outputs/GATKSVPipelineBatch/38c65ca4-2a07-4805-86b6-214696075fef/call-GenotypeBatch/GenotypeBatch/ad17f522-0950-4f0a-9148-a13f689082ed/call-GenotypePESRPart2/GenotypePESRPart2/ce1f4075-1a3e-44b5-9cfe-bfb701327616/call-ConcatGenotypedVcfs/cacheCopy/ref_panel_1kg.pesr.vcf.gz", - "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/JoinRawCalls/a613865b-f7ec-4edb-8a2e-21508335249e/ref_panel_1kg.join_raw_calls.vcf.gz", + "joined_raw_calls_vcf": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz", + "joined_raw_calls_vcf_index": "gs://gatk-sv-ref-panel-1kg/outputs/mw_sv_concordance_update/ref_panel_1kg.join_raw_calls.vcf.gz.tbi", "manta_vcfs": [ "gs://broad-dsde-methods-markw/tws-no-cram-conversion/GatherSampleEvidenceBatch/HG00096.manta.vcf.gz", "gs://broad-dsde-methods-markw/tws-no-cram-conversion/GatherSampleEvidenceBatch/HG00129.manta.vcf.gz", diff --git a/inputs/values/resources_hg38.json b/inputs/values/resources_hg38.json index 3ee4cf632..8713900c5 100644 --- a/inputs/values/resources_hg38.json +++ b/inputs/values/resources_hg38.json @@ -26,6 +26,7 @@ "preprocessed_intervals" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/preprocessed_intervals.interval_list", "primary_contigs_fai" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/contig.fai", "primary_contigs_list" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/primary_contigs.list", + "contigs_header": "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/hg38_contigs_header.vcf", "protein_coding_gtf" : "gs://gatk-sv-resources-public/hg38/v0/sv-resources/resources/v1/MANE.GRCh38.v0.95.select_ensembl_genomic.gtf", "reference_build" : "hg38", "reference_dict" : "gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.dict", diff --git a/scripts/inputs/build_default_inputs.sh b/scripts/inputs/build_default_inputs.sh index b8ca3e7d5..6397de18a 100755 --- a/scripts/inputs/build_default_inputs.sh +++ b/scripts/inputs/build_default_inputs.sh @@ -51,6 +51,10 @@ echo "########## Building ref_panel_1kg cohort Terra workspace ##########" scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/terra_workspaces/cohort_mode ${BASE_DIR}/inputs/build/ref_panel_1kg/terra \ -a '{ "test_batch" : "ref_panel_1kg", "cloud_env" : "'$CLOUD_ENV'" }' +echo "########## Building hgdp test ##########" +scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/test ${BASE_DIR}/inputs/build/hgdp/test \ + -a '{ "test_batch" : "hgdp", "cloud_env" : "'$CLOUD_ENV'" }' + # Note CLOUD_ENV is not currently required for the single-sample workflow echo "########## Building NA19240 single-sample test ##########" scripts/inputs/build_inputs.py ${BASE_DIR}/inputs/values ${BASE_DIR}/inputs/templates/test/GATKSVPipelineSingleSample ${BASE_DIR}/inputs/build/NA19240/test \ diff --git a/scripts/inputs/build_inputs.py b/scripts/inputs/build_inputs.py index 0dc9e1f06..e097f4d46 100755 --- a/scripts/inputs/build_inputs.py +++ b/scripts/inputs/build_inputs.py @@ -42,7 +42,7 @@ # # Will cause the "test_batch_small" input value set to be aliased to the "test_batch" resource bundle. # -# If a template refers to missing property from a resource bundle, it will be skipped, with a warning message listing which +# If a template refers to missing property from a resource bundle, it will be skipped, with an info message listing which # properties are missing. 
This feature can be used purposefully to generate different sets of input files from the same sets # of templates depending on which properties are present in the input value files. For example, the build_default_inputs.sh # script generates inputs three times from the test_input_templates directory, with the test_batch bundle aliased to the @@ -96,7 +96,7 @@ def main(): parser.add_argument('-a', '--aliases', type=json.loads, default={}, help="Aliases for input value bundles") parser.add_argument('--log-info', action='store_true', - help="Show INFO-level logging messages") + help="Show INFO-level logging messages. Use for troubleshooting.") args = parser.parse_args() # Set logger @@ -212,8 +212,8 @@ def process_file(input_dict, template_subdir, template_file, target_subdir): # Transpose the TSV data in processed_content processed_content = transpose_tsv(processed_content) if len(undefined_names) > 0: - logging.warning("skipping file " + template_file_path + - " due to missing values " + str(undefined_names)) + logging.info("skipping file " + template_file_path + + " due to missing values " + str(undefined_names)) else: os.makedirs(target_subdir, exist_ok=True) target_file = open(target_file_path, "w") diff --git a/scripts/test/validate.sh b/scripts/test/validate.sh index 89c8888ae..b40467c7f 100755 --- a/scripts/test/validate.sh +++ b/scripts/test/validate.sh @@ -63,7 +63,7 @@ COUNTER=0 for wdl in "${WDLS[@]}" do name=$(basename $wdl .wdl) - JSONS=(`find inputs/build/ref_panel_1kg/test -name "${name}.*json"` `find inputs/build/NA12878/test -name "${name}.*json"`) + JSONS=(`find inputs/build/ref_panel_1kg/test -name "${name}.*json"` `find inputs/build/hgdp/test -name "${name}.*json"` `find inputs/build/NA12878/test -name "${name}.*json"`) for json in "${JSONS[@]}" do cmd="java -jar ${WOMTOOL_JAR} validate ${wdl} -i ${json}" diff --git a/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py b/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py index ddae71da0..64e42b18e 100644 --- a/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py +++ b/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py @@ -5,6 +5,18 @@ import sys from typing import Optional, List, Text, Set +_gt_sum_map = dict() + + +def _cache_gt_sum(gt): + if gt is None: + return 0 + s = _gt_sum_map.get(gt, None) + if s is None: + s = sum([1 for a in gt if a is not None and a > 0]) + _gt_sum_map[gt] = s + return s + def create_header(header_in: pysam.VariantHeader, source: Text, @@ -91,25 +103,19 @@ def convert(record: pysam.VariantRecord, new_record.info[key] = record.info[key] # fix END, CHR2, SVLEN, STRANDS if svtype == 'INS': - new_record.info['CHR2'] = contig - if 'SVLEN' not in record.info: - new_record.info['SVLEN'] = -1 + new_record.info['SVLEN'] = record.info.get('SVLEN', -1) new_record.info['STRANDS'] = '+-' - elif svtype == 'BND': + elif svtype == 'BND' or svtype == 'CTX': new_record.stop = record.info['END2'] new_record.info['SVLEN'] = -1 + elif svtype == 'CPX': + new_record.info['SVLEN'] = record.info.get('SVLEN', -1) elif svtype == 'DEL': - new_record.info['CHR2'] = contig - new_record.info['SVLEN'] = record.stop - record.start new_record.info['STRANDS'] = '+-' elif svtype == 'DUP': - new_record.info['CHR2'] = contig - new_record.info['SVLEN'] = record.stop - record.start new_record.info['STRANDS'] = '-+' elif svtype == 'INV': - new_record.info['CHR2'] = contig - new_record.info['SVLEN'] = record.stop - record.start - new_record.info['STRANDS'] = record.info['STRANDS'] + new_record.info['STRANDS'] = 
diff --git a/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py b/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py
index ddae71da0..64e42b18e 100644
--- a/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py
+++ b/src/sv-pipeline/scripts/format_gatk_vcf_for_svtk.py
@@ -5,6 +5,18 @@
 import sys
 from typing import Optional, List, Text, Set
 
+_gt_sum_map = dict()
+
+
+def _cache_gt_sum(gt):
+    if gt is None:
+        return 0
+    s = _gt_sum_map.get(gt, None)
+    if s is None:
+        s = sum([1 for a in gt if a is not None and a > 0])
+        _gt_sum_map[gt] = s
+    return s
+
 
 def create_header(header_in: pysam.VariantHeader,
                   source: Text,
@@ -91,25 +103,19 @@
             new_record.info[key] = record.info[key]
     # fix END, CHR2, SVLEN, STRANDS
     if svtype == 'INS':
-        new_record.info['CHR2'] = contig
-        if 'SVLEN' not in record.info:
-            new_record.info['SVLEN'] = -1
+        new_record.info['SVLEN'] = record.info.get('SVLEN', -1)
         new_record.info['STRANDS'] = '+-'
-    elif svtype == 'BND':
+    elif svtype == 'BND' or svtype == 'CTX':
         new_record.stop = record.info['END2']
         new_record.info['SVLEN'] = -1
+    elif svtype == 'CPX':
+        new_record.info['SVLEN'] = record.info.get('SVLEN', -1)
     elif svtype == 'DEL':
-        new_record.info['CHR2'] = contig
-        new_record.info['SVLEN'] = record.stop - record.start
         new_record.info['STRANDS'] = '+-'
     elif svtype == 'DUP':
-        new_record.info['CHR2'] = contig
-        new_record.info['SVLEN'] = record.stop - record.start
         new_record.info['STRANDS'] = '-+'
     elif svtype == 'INV':
-        new_record.info['CHR2'] = contig
-        new_record.info['SVLEN'] = record.stop - record.start
-        new_record.info['STRANDS'] = record.info['STRANDS']
+        new_record.info['STRANDS'] = record.info.get('STRANDS', None)
 
     for sample in record.samples:
         new_genotype = new_record.samples[sample]
@@ -119,17 +125,10 @@
             if key not in remove_formats:
                 new_genotype[key] = genotype[key]
         # fix GT, always assuming diploid
-        if svtype == 'DUP':
-            if genotype['ECN'] < genotype['CN']:
-                new_genotype['GT'] = (0, 1)
-            else:
-                new_genotype['GT'] = (0, 0)
+        if _cache_gt_sum(genotype.get('GT', None)) > 0:
+            new_genotype['GT'] = (0, 1)
         else:
-            called_gt = [g for g in genotype['GT'] if g is not None] if 'GT' in genotype else []
-            if sum(called_gt) > 0:
-                new_genotype['GT'] = (0, 1)
-            else:
-                new_genotype['GT'] = (0, 0)
+            new_genotype['GT'] = (0, 0)
     return new_record
 
 
@@ -148,7 +147,7 @@
 def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
     # noinspection PyTypeChecker
     parser = argparse.ArgumentParser(
-        description="Convert a GATK-style SV VCF to SVTK-style",
+        description="Convert a GATK-style SV VCF from ClusterBatch for consumption by GenerateBatchMetrics.",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument("--vcf", type=str, required=True,
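(The new _cache_gt_sum helper above memoizes the count of called non-reference alleles per distinct GT tuple, and the diploid GT fix collapses any non-ref genotype to (0, 1). A runnable sketch of that logic, with plain tuples standing in for pysam genotype values:)

_gt_sum_map = {}

def gt_sum(gt):
    # Number of called, non-reference alleles; memoized per GT tuple
    if gt is None:
        return 0
    s = _gt_sum_map.get(gt)
    if s is None:
        s = sum(1 for a in gt if a is not None and a > 0)
        _gt_sum_map[gt] = s
    return s

for gt in [(0, 0), (0, 1), (1, 1), (None, None), (1,), None]:
    print(gt, "->", (0, 1) if gt_sum(gt) > 0 else (0, 0))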
diff --git a/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py b/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py
index e2d32fbca..6bc550b37 100644
--- a/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py
+++ b/src/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py
@@ -4,9 +4,10 @@
 import pysam
 import sys
 import gzip
-from typing import Any, List, Text, Set, Dict, Optional
+from math import floor
+from typing import Any, List, Text, Dict, Optional
 
-_gt_sum_map = dict()
+GQ_FIELDS = ["GQ", "PE_GQ", "SR_GQ", "RD_GQ"]
 
 
 def _parse_bnd_ends(vcf_path: Text) -> Dict[Text, int]:
@@ -32,7 +33,7 @@
             columns = line.split('\t', 8)
             vid = columns[2]
             info = columns[7]
-            if 'SVTYPE=BND' not in info and 'SVTYPE=CTX' not in info and 'SVTYPE=CPX' not in info:
+            if 'SVTYPE=BND' not in info:
                 continue
             info_tokens = info.split(';')
             end_field_list = [x for x in info_tokens if x.startswith("END=")]
@@ -69,234 +70,103 @@
     return ploidy_dict
 
 
-def create_header(header_in: pysam.VariantHeader,
-                  replace_ev_format: bool,
-                  remove_infos: Set[Text],
-                  remove_formats: Set[Text]) -> pysam.VariantHeader:
+def update_header(header: pysam.VariantHeader) -> None:
     """
     Ingests the given header, removes specified fields, and adds necessary fields.
 
     Parameters
     ----------
-    header_in: pysam.VariantHeader
-        input header
-    remove_infos: Set[Text]
-        set of info fields to remove
-    remove_formats: Set[Text]
-        set of format fields to remove
-
-    Returns
-    -------
     header: pysam.VariantHeader
-        gatk-style header
+        input header
     """
-    header = pysam.VariantHeader()
-    for sample in header_in.samples:
-        header.add_sample(sample)
-    for line in header_in.records:
-        # remove fields
-        if len(line.attrs) > 0 and 'ID' in line.keys() and (line['ID'] in remove_infos or line['ID'] in remove_formats):
-            continue
-        line_str = str(line)
-        # remove source line
-        if line_str.startswith('##source='):
-            continue
-        header.add_line(line_str)
-    # new fields
-    header.add_line('##INFO=<ID=END2,Number=1,Type=Integer,Description="Second position">')
-    header.add_line('##INFO=<ID=CHR2,Number=1,Type=String,Description="Second contig">')
-    header.add_line('##FORMAT=<ID=CN,Number=1,Type=Integer,Description="Copy number">')
     header.add_line('##FORMAT=<ID=ECN,Number=1,Type=Integer,Description="Expected copy number for ref genotype">')
-    if replace_ev_format:
-        header.add_line('##FORMAT=<ID=EV,Number=.,Type=String,Description="Classes of evidence supporting final genotype">')
-    return header
+    # Add these just in case (no effect if they exist)
+    header.add_line('##INFO=<ID=END2,Number=1,Type=Integer,Description="Second position">')
+    header.add_line('##INFO=<ID=CHR2,Number=1,Type=String,Description="Second contig">')
+
+
+def rescale_gq(record):
+    for sample in record.samples:
+        for gq_field in GQ_FIELDS:
+            if gq_field in record.samples[sample] and record.samples[sample][gq_field] is not None:
+                record.samples[sample][gq_field] = floor(record.samples[sample][gq_field] / 10)
 
 
 def convert(record: pysam.VariantRecord,
-            vcf_out: pysam.VariantFile,
-            remove_infos: Set[Text],
-            remove_formats: Set[Text],
             bnd_end_dict: Optional[Dict[Text, int]],
-            ploidy_dict: Dict[Text, Dict[Text, int]]) -> pysam.VariantRecord:
+            ploidy_dict: Dict[Text, Dict[Text, int]],
+            scale_down_gq: bool) -> pysam.VariantRecord:
     """
-    Converts a record from svtk to gatk style. This includes updating all GT fields with proper ploidy, and adding
-    necessary fields such as ECN and CN.
+    Converts a record from svtk to gatk style. This includes updating END/END2 and adding
+    necessary fields such as ECN.
 
     Parameters
     ----------
     record: pysam.VariantRecord
         svtk-style record
-    vcf_out: pysam.VariantFile
-        new vcf, to which the converted record will be written
-    remove_infos: Set[Text]
-        info fields to remove
-    remove_formats: Set[Text]
-        format fields to remove
     bnd_end_dict: Optional[Dict[Text, int]]
         map from BND variant ID to END coordinate
     ploidy_dict: Dict[Text, Dict[Text, int]]
         map from sample to contig to ploidy
+    scale_down_gq: bool
+        scale GQs to 0-99 range
 
     Returns
     -------
     header: pysam.VariantRecord
         gatk-style record
     """
+
+    def is_null(val):
+        return val is None or val == "."
+
     svtype = record.info['SVTYPE']
-    # Force symbolic BND alleles
-    if svtype == 'BND':
-        alleles = (record.alleles[0], '<BND>')
-    else:
-        alleles = record.alleles
     contig = record.contig
-    new_record = vcf_out.new_record(id=record.id, contig=contig, start=record.start, stop=record.stop, alleles=alleles,
-                                    info={key: value for key, value in record.info.items() if key not in remove_infos})
     # Version of htsjdk currently in gatk only supports these base alleles
-    if new_record.ref in ['A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N', 'n']:
-        new_record.ref = new_record.ref
+    if record.ref in ['A', 'C', 'G', 'T', 'a', 'c', 'g', 't', 'N', 'n']:
+        record.ref = record.ref
     else:
-        new_record.ref = 'N'
-    # fix SVLEN, STRANDS, CHR2, and END2 where needed
-    if svtype == 'INS':
-        new_record.info['SVLEN'] = record.info['SVLEN']
-    elif svtype == 'BND' or svtype == 'CTX':
-        if svtype == 'CTX':
-            svtype = 'BND'
-            new_record.info['OSVTYPE'] = record.info['SVTYPE']
-            new_record.info['SVTYPE'] = svtype
-            new_record.info['STRANDS'] = record.info.get('STRANDS', '++')
-        else:
-            new_record.info['STRANDS'] = record.info['STRANDS']
-        new_record.info['CHR2'] = record.info['CHR2']
-        new_record.info['END2'] = bnd_end_dict[record.id] if bnd_end_dict is not None else record.info.get('END2', record.pos + record.info['SVLEN'])
-        new_record.stop = record.start + 1
-    elif svtype == 'INV':
-        new_record.info['STRANDS'] = record.info.get('STRANDS', '++')
-    elif svtype == 'CTX':
-        new_record.info['STRANDS'] = record.info.get('STRANDS', '++')
-        new_record.info['OSVTYPE'] = record.info['SVTYPE']
-        new_record.info['SVTYPE'] = 'BND'
-    elif svtype == 'CPX':
-        svtype = 'INV'
-        new_record.info['STRANDS'] = record.info.get('STRANDS', '++')
-        new_record.info['OSVTYPE'] = record.info['SVTYPE']
-        new_record.info['SVTYPE'] = svtype
+        record.ref = 'N'
+    if svtype == 'BND' or svtype == 'CTX':
+        record.info['END2'] = bnd_end_dict[record.id] if bnd_end_dict is not None \
+            else record.info.get('END2', record.stop)
+    # Fix this weird edge case (may be from CPX review workflow)
+    if svtype == 'INV' and '<CPX>' in record.alleles[1]:
+        svtype = 'CPX'
+        record.info['SVTYPE'] = svtype
+    is_ddup = svtype == 'CPX' and 'dDUP' in record.info.get('CPX_TYPE', '')
+    if svtype == 'BND' or svtype == 'INS' or svtype == 'CTX' or is_ddup:
+        record.stop = record.start + 1
+    if is_ddup:
+        # e.g. SOURCE=DUP_chrX:49151588-49151850
+        source = record.info.get('SOURCE', None)
+        if source is not None:
+            tokens = source.split(':')
+            chr2 = tokens[0].split('_')[-1]
+            end2 = int(tokens[-1].split('-')[0])
+            record.info['CHR2'] = chr2
+            record.info['END2'] = end2
+        else:
+            # Sometimes SOURCE is not set (may be from CPX review workflow)
+            record.info['CHR2'] = record.chrom
+            record.info['END2'] = record.stop
+    # Delete empty INFO fields (GATK does not like "." for non-String types)
+    keys = record.info.keys()
+    for k in keys:
+        val = record.info[k]
+        if is_null(val) or (isinstance(val, tuple) and len(val) == 1 and is_null(val[0])):
+            del record.info[k]
     # copy FORMAT fields
-    for sample in record.samples:
-        genotype = record.samples[sample]
-        new_genotype = new_record.samples[sample]
-        for key in genotype.keys():
-            if key not in remove_formats:
-                new_genotype[key] = genotype[key]
-        new_genotype['ECN'] = ploidy_dict[sample][contig]
-        if new_genotype['ECN'] == 0:
-            new_genotype['GT'] = ()
-            new_genotype['CN'] = 0
-        elif new_genotype['ECN'] == 1:
-            if svtype == 'DUP':
-                new_genotype['CN'] = 1 + _cache_gt_sum(genotype['GT'])
-                new_genotype['GT'] = (None,)
-            elif _cache_gt_sum(genotype['GT']) == 0:
-                new_genotype['GT'] = (0,)
-                if svtype == 'DEL':
-                    new_genotype['CN'] = 1
-            else:
-                new_genotype['GT'] = (1,)
-                if svtype == 'DEL':
-                    new_genotype['CN'] = 0
-        else:
-            gt_sum = _cache_gt_sum(genotype['GT'])
-            if svtype == 'DUP':
-                new_genotype['CN'] = 2 + gt_sum
-                new_genotype['GT'] = (None, None)
-            elif gt_sum == 0:
-                new_genotype['GT'] = (0, 0)
-                if svtype == 'DEL':
-                    new_genotype['CN'] = 2
-            elif gt_sum == 1:
-                new_genotype['GT'] = (0, 1)
-                if svtype == 'DEL':
-                    new_genotype['CN'] = 1
-            else:
-                new_genotype['GT'] = (1, 1)
-                if svtype == 'DEL':
-                    new_genotype['CN'] = 0
-        if svtype == 'CNV':
-            new_genotype['GT'] = (None,) * new_genotype['ECN']
-    return new_record
-
-
-def _cache_gt_sum(gt):
-    s = _gt_sum_map.get(gt, None)
-    if s is None:
-        s = sum([1 for a in gt if a is not None and a > 0])
-        _gt_sum_map[gt] = s
-    return s
-
-
-def add_cn_ecn(record: pysam.VariantRecord,
-               vcf_out: pysam.VariantFile,
-               ploidy_dict: Dict[Text, Dict[Text, int]]) -> pysam.VariantRecord:
-    """"
-    Only modifies records by adding CN and ECN INFO fields, e.g. for 'fixed' VCFs that just need
-    this metadata for certain GATK tools such as SVCluster and SVConcordance
-
-    Parameters
-    ----------
-    record: pysam.VariantRecord
-        input record
-    vcf_out: pysam.VariantFile
-        new vcf, to which the converted record will be written
-    ploidy_dict: Dict[Text, Dict[Text, int]]
-        map from sample to contig to ploidy
-
-    Returns
-    -------
-    header: pysam.VariantRecord
-        record with CN and ECN fields added"""
-    svtype = record.info['SVTYPE']
-    contig = record.contig
-    new_record = vcf_out.new_record(id=record.id, contig=contig, start=record.start, stop=record.stop,
-                                    alleles=record.alleles, info=record.info)
-
-    # copy FORMAT fields
-    for sample in record.samples:
-        genotype = record.samples[sample]
-        new_genotype = new_record.samples[sample]
-        for key in genotype.keys():
-            new_genotype[key] = genotype[key]
-        ecn = ploidy_dict[sample][contig]
-        new_genotype['ECN'] = ecn
-        if svtype == 'DEL':
-            new_genotype['CN'] = max(0, ecn - _cache_gt_sum(genotype['GT']))
-        elif svtype == 'DUP':
-            new_genotype['CN'] = ecn + _cache_gt_sum(genotype['GT'])
-        elif svtype == 'CNV':
-            # Disambiguates non-existent and empty (i.e. ".") CN
-            cn = genotype.get('CN', None)
-            if cn is None:
-                cn = ecn
-            new_genotype['CN'] = cn
-    return new_record
-
-
-def filter_unsupported_type(record: pysam.VariantRecord) -> bool:
-    svtype = record.info['SVTYPE']
-    return svtype == 'CPX' or svtype == 'CTX'
-
-
-def _parse_arg_list(arg: Text) -> List[Text]:
-    if arg is None:
-        return set()
-    else:
-        return arg.split(',')
+    for sample, genotype in record.samples.items():
+        genotype['ECN'] = ploidy_dict[sample][contig]
+    if scale_down_gq:
+        rescale_gq(record)
+    return record
 
 
 def _process(vcf_in: pysam.VariantFile,
              vcf_out: pysam.VariantFile,
-             arguments: Dict[Text, Any],
-             vcf_filter: Optional[pysam.VariantFile] = None) -> None:
+             arguments: Dict[Text, Any]) -> None:
     """"
     Master function for processing the given input vcf and writing output
@@ -308,50 +178,27 @@
         output vcf
     arguments: Dict[Text, Any]
         commandline arguments
-    vcf_filter: Optional[pysam.VariantFile]
-        if provided, write filtered records to this vcf
 
     Returns
     -------
     header: pysam.VariantRecord
-        record with CN and ECN fields added"""
-    remove_formats = set(_parse_arg_list(arguments.remove_formats))
-    remove_infos = set(_parse_arg_list(arguments.remove_infos))
-    if not arguments.only_add_cn_fields and not arguments.use_end2:
+        record with ECN fields added"""
+    if arguments.fix_end:
        bnd_end_dict = _parse_bnd_ends(arguments.vcf)
     else:
         bnd_end_dict = None
     ploidy_dict = _parse_ploidy_table(arguments.ploidy_table)
-    # info fields we drop by default (unless needed for certain SV types)
-    default_remove_infos = set(["SVLEN", "STRANDS", "CHR2"])
-    if bnd_end_dict is not None:
-        default_remove_infos.add("END2")
-    remove_infos = remove_infos.union(default_remove_infos)
-    for record in vcf_in:
-        if arguments.filter_unsupported_types and filter_unsupported_type(record):
-            if vcf_filter is not None:
-                vcf_filter.write(record)
-        else:
-            if arguments.only_add_cn_fields:
-                out = add_cn_ecn(record=record, vcf_out=vcf_out, ploidy_dict=ploidy_dict)
-            else:
-                out = convert(
-                    record=record,
-                    vcf_out=vcf_out,
-                    remove_infos=remove_infos,
-                    remove_formats=remove_formats,
-                    bnd_end_dict=bnd_end_dict,
-                    ploidy_dict=ploidy_dict
-                )
-            vcf_out.write(out)
+    for record in vcf_in:
+        out = convert(record=record, bnd_end_dict=bnd_end_dict,
+                      ploidy_dict=ploidy_dict, scale_down_gq=arguments.scale_down_gq)
+        vcf_out.write(out)
 
 
 def _parse_arguments(argv: List[Text]) -> argparse.Namespace:
     # noinspection PyTypeChecker
     parser = argparse.ArgumentParser(
-        description="Convert a GATK-style SV VCF to SVTK-style",
+        description="Convert an SVTK-style SV VCF to GATK-style",
         formatter_class=argparse.ArgumentDefaultsHelpFormatter
     )
     parser.add_argument("--vcf", type=str, required=True,
@@ -363,20 +210,10 @@
                              "first column is SAMPLE, and the remaining columns are contig names. For each row "
                              "thereafter, the first column is the sample name, and remaining columns are the contig "
                              "ploidy values for that sample.")
-    parser.add_argument("--only-add-cn-fields", action='store_true',
-                        help="Only add CN and ECN info fields. All other corrections are skipped.")
-    parser.add_argument("--use-end2", action='store_true',
-                        help="Use existing END2 fields rather than getting them from END")
-    parser.add_argument("--filter-unsupported-types", action='store_true',
-                        help="Filter CPX and CTX types, which are not currently supported by GATK")
-    parser.add_argument("--filter-out", type=str,
-                        help="Write any filtered variants to the specified VCF")
-    parser.add_argument("--replace-ev-format", action='store_true',
-                        help="Adds EV FORMAT field with unbounded Number to header")
-    parser.add_argument("--remove-formats", type=str,
-                        help="Comma-delimited list of FORMAT fields to remove")
-    parser.add_argument("--remove-infos", type=str,
-                        help="Comma-delimited list of INFO fields to remove")
+    parser.add_argument("--fix-end", action='store_true',
+                        help="Fix END tags and assign END2 to END")
+    parser.add_argument("--scale-down-gq", action='store_true',
+                        help="Scales all GQs down from [0-999] to [0-99]")
     if len(argv) <= 1:
         parser.parse_args(["--help"])
         sys.exit(0)
@@ -388,23 +225,14 @@
     if argv is None:
         argv = sys.argv
     arguments = _parse_arguments(argv)
-    remove_formats = set(_parse_arg_list(arguments.remove_formats))
-    remove_infos = set(_parse_arg_list(arguments.remove_infos))
 
     # convert vcf header and records
     with pysam.VariantFile(arguments.vcf) as vcf_in:
-        header = create_header(
-            header_in=vcf_in.header,
-            replace_ev_format=arguments.replace_ev_format,
-            remove_infos=remove_infos,
-            remove_formats=remove_formats
+        update_header(
+            header=vcf_in.header
         )
-        with pysam.VariantFile(arguments.out, mode='w', header=header) as vcf_out:
-            vcf_filter = pysam.VariantFile(arguments.filter_out, mode='w', header=vcf_in.header) if \
-                arguments.filter_out is not None else None
-            _process(vcf_in, vcf_out, arguments, vcf_filter=vcf_filter)
-            if vcf_filter is not None:
-                vcf_filter.close()
+        with pysam.VariantFile(arguments.out, mode='w', header=vcf_in.header) as vcf_out:
+            _process(vcf_in, vcf_out, arguments)
 
 
 if __name__ == "__main__":
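(The --ploidy-table format described in the help text above is a plain TSV: a SAMPLE header column followed by contig names, then one row of per-contig ploidies per sample. A re-implementation sketch of that parse — the script's own _parse_ploidy_table is authoritative, and the file name below is invented:)

import gzip

def parse_ploidy_table(path):
    # Returns {sample: {contig: ploidy}} from a tab-delimited table whose
    # header row is: SAMPLE <contig1> <contig2> ...
    open_fn = gzip.open if path.endswith('.gz') else open
    ploidy = {}
    with open_fn(path, 'rt') as f:
        contigs = f.readline().rstrip('\n').split('\t')[1:]
        for line in f:
            tokens = line.rstrip('\n').split('\t')
            ploidy[tokens[0]] = {c: int(p) for c, p in zip(contigs, tokens[1:])}
    return ploidy

# e.g. parse_ploidy_table("hgdp.ploidy_table.tsv")["HG00512"]["chrX"] -> 1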
diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl
index 257580fef..0694bc9a6 100644
--- a/wdl/CleanVcf.wdl
+++ b/wdl/CleanVcf.wdl
@@ -1,6 +1,7 @@
 version 1.0
 
 import "CleanVcfChromosome.wdl" as CleanVcfChromosome
+import "TasksClusterBatch.wdl" as TasksCluster
 import "TasksMakeCohortVcf.wdl" as MiniTasks
 import "HailMerge.wdl" as HailMerge
 import "MakeCohortVcfMetrics.wdl" as metrics
@@ -12,7 +13,7 @@
     Array[File] complex_genotype_vcfs
     Array[File] complex_resolve_bothside_pass_lists
     Array[File] complex_resolve_background_fail_lists
-    File merged_ped_file
+    File ped_file
 
     File contig_list
     File allosome_fai
@@ -55,7 +56,7 @@
     RuntimeAttr? runtime_override_hail_merge_clean_final
     RuntimeAttr? runtime_override_fix_header_clean_final
     RuntimeAttr? runtime_override_concat_cleaned_vcfs
-    RuntimeAttr? runtime_override_fix_bad_ends
+    RuntimeAttr? runtime_attr_create_ploidy
 
     # overrides for CleanVcfContig
     RuntimeAttr? runtime_override_clean_vcf_1a
@@ -68,6 +69,7 @@
     RuntimeAttr? runtime_override_clean_vcf_5_polish
     RuntimeAttr? runtime_override_stitch_fragmented_cnvs
     RuntimeAttr? runtime_override_final_cleanup
+    RuntimeAttr? runtime_attr_format
 
     # Clean vcf 1b
     RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b
@@ -98,6 +100,18 @@
     RuntimeAttr? runtime_override_sort_drop_redundant_cnvs
   }
 
+  call TasksCluster.CreatePloidyTableFromPed {
+    input:
+      ped_file=ped_file,
+      contig_list=contig_list,
+      retain_female_chr_y=false,
+      chr_x=chr_x,
+      chr_y=chr_y,
+      output_prefix="~{cohort_name}.ploidy",
+      sv_pipeline_docker=sv_pipeline_docker,
+      runtime_attr_override=runtime_attr_create_ploidy
+  }
+
   #Scatter per chromosome
   Array[String] contigs = transpose(read_tsv(contig_list))[0]
   scatter ( i in range(length(contigs)) ) {
@@ -108,7 +122,7 @@
         vcf=complex_genotype_vcfs[i],
         contig=contig,
         background_list=complex_resolve_background_fail_lists[i],
-        ped_file=merged_ped_file,
+        ped_file=ped_file,
         bothsides_pass_list=complex_resolve_bothside_pass_lists[i],
         allosome_fai=allosome_fai,
         prefix="~{cohort_name}.~{contig}",
@@ -121,6 +135,7 @@
         gcs_project=gcs_project,
         clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard,
         clean_vcf5_records_per_shard=clean_vcf5_records_per_shard,
+        ploidy_table=CreatePloidyTableFromPed.out,
         chr_x=chr_x,
         chr_y=chr_y,
         linux_docker=linux_docker,
@@ -159,7 +174,7 @@
         runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b,
         runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b,
         runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b,
-        runtime_override_fix_bad_ends=runtime_override_fix_bad_ends
+        runtime_attr_format=runtime_attr_format,
     }
   }
 
diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl
index 298ca5fd0..a3bbfcd37 100644
--- a/wdl/CleanVcfChromosome.wdl
+++ b/wdl/CleanVcfChromosome.wdl
@@ -2,6 +2,7 @@
 
 import "Structs.wdl"
 import "TasksMakeCohortVcf.wdl" as MiniTasks
+import "FormatVcfForGatk.wdl" as fvcf
 import "CleanVcf1b.wdl" as c1b
 import "CleanVcf5.wdl" as c5
 import "HailMerge.wdl" as HailMerge
@@ -24,9 +25,12 @@
     File? outlier_samples_list
     Int? max_samples_per_shard_step3
 
+    File ploidy_table
     String chr_x
     String chr_y
 
+    File? svtk_to_gatk_script  # For debugging
+
     Boolean use_hail
     String? gcs_project
 
@@ -76,7 +80,7 @@
     RuntimeAttr? runtime_override_drop_redundant_cnvs
     RuntimeAttr? runtime_override_combine_step_1_vcfs
     RuntimeAttr? runtime_override_sort_drop_redundant_cnvs
-    RuntimeAttr? runtime_override_fix_bad_ends
+    RuntimeAttr? runtime_attr_format
   }
 
@@ -293,20 +297,22 @@
       prefix="~{prefix}.final_cleanup",
       sv_pipeline_docker=sv_pipeline_docker,
       runtime_attr_override=runtime_override_final_cleanup
   }
 
-  call MiniTasks.FixEndsRescaleGQ {
+  call fvcf.FormatVcf {
     input:
-      vcf = FinalCleanup.final_cleaned_shard,
-      prefix = prefix + ".cleaned",
-      sv_pipeline_docker = sv_pipeline_docker,
-      runtime_attr_override = runtime_override_fix_bad_ends
+      vcf=FinalCleanup.final_cleaned_shard,
+      ploidy_table=ploidy_table,
+      args="--fix-end --scale-down-gq",
+      output_prefix="~{prefix}.final_format",
+      script=svtk_to_gatk_script,
+      sv_pipeline_docker=sv_pipeline_docker,
+      runtime_attr_override=runtime_attr_format
   }
 
   output {
-    File out=FixEndsRescaleGQ.out
-    File out_idx=FixEndsRescaleGQ.out_idx
+    File out = FormatVcf.out
+    File out_idx = FormatVcf.out_index
   }
 }
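(CleanVcfChromosome now delegates its final formatting to FormatVcf with args="--fix-end --scale-down-gq". The GQ flag maps CleanVcf-style qualities in [0, 999] onto GATK's [0, 99] range by integer division, per rescale_gq above. A self-contained sketch, with a plain dict standing in for a pysam genotype:)

from math import floor

GQ_FIELDS = ["GQ", "PE_GQ", "SR_GQ", "RD_GQ"]

def rescale(genotype):
    # Scale each present, non-null GQ-like field from [0, 999] down to [0, 99]
    for field in GQ_FIELDS:
        if genotype.get(field) is not None:
            genotype[field] = floor(genotype[field] / 10)
    return genotype

print(rescale({"GQ": 999, "SR_GQ": 45, "PE_GQ": None}))
# {'GQ': 99, 'SR_GQ': 4, 'PE_GQ': None}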
runtime_attr_format } @@ -293,20 +297,22 @@ workflow CleanVcfChromosome { prefix="~{prefix}.final_cleanup", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_final_cleanup - } - call MiniTasks.FixEndsRescaleGQ { + call fvcf.FormatVcf { input: - vcf = FinalCleanup.final_cleaned_shard, - prefix = prefix + ".cleaned", - sv_pipeline_docker = sv_pipeline_docker, - runtime_attr_override = runtime_override_fix_bad_ends + vcf=FinalCleanup.final_cleaned_shard, + ploidy_table=ploidy_table, + args="--fix-end --scale-down-gq", + output_prefix="~{prefix}.final_format", + script=svtk_to_gatk_script, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format } output { - File out=FixEndsRescaleGQ.out - File out_idx=FixEndsRescaleGQ.out_idx + File out = FormatVcf.out + File out_idx = FormatVcf.out_index } } diff --git a/wdl/ClusterBatch.wdl b/wdl/ClusterBatch.wdl index f340c4880..f1ce387c9 100644 --- a/wdl/ClusterBatch.wdl +++ b/wdl/ClusterBatch.wdl @@ -97,6 +97,7 @@ workflow ClusterBatch { ped_file=ped_file, script=ploidy_table_script, contig_list=contig_list, + retain_female_chr_y=true, chr_x=chr_x, chr_y=chr_y, output_prefix="~{batch}.ploidy", diff --git a/wdl/FormatVcfForGatk.wdl b/wdl/FormatVcfForGatk.wdl new file mode 100644 index 000000000..e6461b841 --- /dev/null +++ b/wdl/FormatVcfForGatk.wdl @@ -0,0 +1,145 @@ +version 1.0 + +import "Structs.wdl" +import "TasksClusterBatch.wdl" as tasks_cluster +import "TasksMakeCohortVcf.wdl" as tasks + +workflow FormatVcfForGatk { + input { + File vcf + String prefix + File ped_file + Int records_per_shard = 40000 + + File contig_list + File? contigs_header # Replaces vcf contig dictionary if provided + String? formatter_args + + String? chr_x + String? chr_y + + File? svtk_to_gatk_script # For debugging + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_create_ploidy + RuntimeAttr? runtime_attr_scatter + RuntimeAttr? runtime_attr_format + RuntimeAttr? runtime_override_concat + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? 
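On the --fix-end behavior that replaces the old FixEndsRescaleGQ task: the help text says it fixes END tags and assigns END2 from END. A loose sketch of what such a pass could look like; the SVTYPE handling and fallbacks here are guesses, not the script's actual logic:

    import sys
    import pysam

    # Loose sketch (assumptions, not the pipeline script): enforce
    # END = POS + SVLEN for symbolic interval records, and give
    # BND-like records an END2 when one is missing.
    INTERVAL_TYPES = {"DEL", "DUP", "INV"}

    with pysam.VariantFile(sys.argv[1]) as vcf_in, \
            pysam.VariantFile(sys.argv[2], mode="w", header=vcf_in.header) as vcf_out:
        for record in vcf_in:
            svtype = record.info.get("SVTYPE")
            svlen = record.info.get("SVLEN")
            if isinstance(svlen, tuple):  # Number=. INFO fields come back as tuples
                svlen = svlen[0]
            if svtype in INTERVAL_TYPES and isinstance(svlen, int) and svlen > 0:
                record.stop = record.pos + svlen  # rewrites END
            elif svtype in ("BND", "CTX") and "END2" not in record.info:
                # Fall back to END; assumes END2 is declared in the header.
                record.info["END2"] = record.stop
            vcf_out.write(record)
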
runtime_override_fix_header_step1 + } + + call tasks_cluster.CreatePloidyTableFromPed { + input: + ped_file=ped_file, + contig_list=contig_list, + retain_female_chr_y=false, + chr_x=chr_x, + chr_y=chr_y, + output_prefix="~{prefix}.ploidy", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_create_ploidy + } + + call tasks.ScatterVcf { + input: + vcf=vcf, + records_per_shard = records_per_shard, + prefix = "~{prefix}.scatter_vcf", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_scatter + } + + scatter ( i in range(length(ScatterVcf.shards)) ) { + call FormatVcf { + input: + vcf=ScatterVcf.shards[i], + ploidy_table=CreatePloidyTableFromPed.out, + args=formatter_args, + output_prefix="~{prefix}.format.shard_~{i}", + contigs_header=contigs_header, + script=svtk_to_gatk_script, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_format + } + } + + Boolean shards_unsorted = defined(contigs_header) + call tasks.ConcatVcfs { + input: + vcfs=FormatVcf.out, + vcfs_idx=FormatVcf.out_index, + naive=!shards_unsorted, + allow_overlaps=shards_unsorted, + outfile_prefix="~{prefix}.gatk_formatted", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat + } + + output { + File gatk_formatted_vcf = ConcatVcfs.concat_vcf + File gatk_formatted_vcf_index = ConcatVcfs.concat_vcf_idx + } +} + +task FormatVcf { + input { + File vcf + File ploidy_table + File? script + String? args + File? contigs_header # Overwrites contig dictionary, in case they are out of order + String output_prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3.75, + disk_gb: ceil(50 + size(vcf, "GB") * 3), + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + output { + File out = "~{output_prefix}.vcf.gz" + File out_index = "~{output_prefix}.vcf.gz.tbi" + } + command <<< + set -euo pipefail + + # Convert format + python ~{default="/opt/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py" script} \ + --vcf ~{vcf} \ + --out tmp.vcf.gz \ + --ploidy-table ~{ploidy_table} \ + ~{args} + + if ~{defined(contigs_header)}; then + bcftools view --no-version -h tmp.vcf.gz > original_header.vcf + grep -v "^##contig=" original_header.vcf | grep -v "^#CHROM" > header.vcf + cat ~{contigs_header} >> header.vcf + grep "^#CHROM" original_header.vcf >> header.vcf + bcftools reheader -h header.vcf tmp.vcf.gz | bcftools sort -Oz -o ~{output_prefix}.vcf.gz + else + mv tmp.vcf.gz ~{output_prefix}.vcf.gz + fi + + tabix ~{output_prefix}.vcf.gz + >>> + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } +} \ No newline at end of file diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index bf107c48c..464638389 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -487,7 +487,7 @@ workflow GATKSVPipelineSingleSample { 
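The contigs_header option in the FormatVcf task above swaps the VCF's ##contig records via bcftools reheader and then re-sorts, since a new contig dictionary can change record order; that is also why the downstream concat switches from --naive to --allow-overlaps when contigs_header is set. A rough pysam equivalent of the header swap, as an analogy rather than the task's command:

    import pysam

    # Rough pysam analogy (not the task's bcftools command): replace a
    # VCF's ##contig records with a supplied set, keeping every other
    # header line and the sample columns intact.
    def replace_contig_lines(vcf_path, contig_lines):
        """contig_lines: iterable of '##contig=<ID=...,length=...>' strings."""
        with pysam.VariantFile(vcf_path) as vcf:
            new_header = pysam.VariantHeader()
            for rec in vcf.header.records:
                if rec.key in ("contig", "fileformat"):
                    continue
                new_header.add_record(rec)
            for line in contig_lines:
                new_header.add_line(line)
            for sample in vcf.header.samples:
                new_header.add_sample(sample)
            return new_header
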
RuntimeAttr? runtime_override_preconcat_clean_final RuntimeAttr? runtime_override_hail_merge_clean_final RuntimeAttr? runtime_override_fix_header_clean_final - RuntimeAttr? runtime_override_fix_bad_ends + RuntimeAttr? runtime_attr_format_clean RuntimeAttr? runtime_override_clean_vcf_1a RuntimeAttr? runtime_override_clean_vcf_2 @@ -1296,7 +1296,7 @@ workflow GATKSVPipelineSingleSample { runtime_override_benchmark_samples=runtime_override_benchmark_samples, runtime_override_split_shuffled_list=runtime_override_split_shuffled_list, runtime_override_merge_and_tar_shard_benchmarks=runtime_override_merge_and_tar_shard_benchmarks, - runtime_override_fix_bad_ends=runtime_override_fix_bad_ends + runtime_attr_format_clean=runtime_attr_format_clean } diff --git a/wdl/GenotypeComplexVariants.wdl b/wdl/GenotypeComplexVariants.wdl index 22e89c68a..73e28415d 100644 --- a/wdl/GenotypeComplexVariants.wdl +++ b/wdl/GenotypeComplexVariants.wdl @@ -8,7 +8,7 @@ workflow GenotypeComplexVariants { input { String cohort_name Array[String] batches - File merged_ped_file + File ped_file Array[File] depth_vcfs Boolean merge_vcfs = false @@ -65,7 +65,7 @@ workflow GenotypeComplexVariants { } call util.SubsetPedFile { input: - ped_file = merged_ped_file, + ped_file = ped_file, sample_list = GetSampleIdsFromVcf.out_file, subset_name = batches[i], sv_base_mini_docker = sv_base_mini_docker, @@ -87,7 +87,7 @@ workflow GenotypeComplexVariants { batches=batches, coverage_files=bincov_files, rd_depth_sep_cutoff_files=depth_gt_rd_sep_files, - merged_ped_file=merged_ped_file, + ped_file=ped_file, median_coverage_files=median_coverage_files, n_per_split_small=2500, n_per_split_large=250, diff --git a/wdl/GenotypeCpxCnvs.wdl b/wdl/GenotypeCpxCnvs.wdl index ab9f69e9a..7a4c2331d 100644 --- a/wdl/GenotypeCpxCnvs.wdl +++ b/wdl/GenotypeCpxCnvs.wdl @@ -20,7 +20,7 @@ workflow GenotypeCpxCnvs { Int n_per_split_large Int n_rd_test_bins String prefix - File merged_ped_file + File ped_file String contig File ref_dict @@ -96,7 +96,7 @@ workflow GenotypeCpxCnvs { intervals=GetCpxCnvIntervals.cpx_cnv_bed, genotypes=MergeMeltedGts.outfile, prefix=contig_prefix, - ped_file=merged_ped_file, + ped_file=ped_file, contig=contig, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_parse_genotypes diff --git a/wdl/JoinRawCalls.wdl b/wdl/JoinRawCalls.wdl index aae4f50e3..3bc70a766 100644 --- a/wdl/JoinRawCalls.wdl +++ b/wdl/JoinRawCalls.wdl @@ -1,25 +1,30 @@ version 1.0 import "Structs.wdl" +import "FormatVcfForGatk.wdl" as format import "TasksClusterBatch.wdl" as tasks_cluster import "TasksMakeCohortVcf.wdl" as tasks_cohort -import "SVConcordance.wdl" as svc # Clusters raw call VCFs across batches - to be used for preparing raw calls for SV concordance analysis workflow JoinRawCalls { input { - String cohort + String prefix # ClusterBatch outputs Array[File]? clustered_manta_vcfs + Array[File]? clustered_manta_vcf_indexes Array[File]? clustered_melt_vcfs + Array[File]? clustered_melt_vcf_indexes Array[File]? clustered_scramble_vcfs + Array[File]? clustered_scramble_vcf_indexes Array[File]? clustered_wham_vcfs + Array[File]? clustered_wham_vcf_indexes Array[File]? clustered_depth_vcfs + Array[File]? clustered_depth_vcf_indexes - File ploidy_table + File ped_file String? preprocess_args @@ -28,37 +33,66 @@ workflow JoinRawCalls { File reference_fasta_fai File reference_dict + String? chr_x + String? chr_y + String gatk_docker String sv_base_mini_docker String sv_pipeline_docker Float? java_mem_fraction + RuntimeAttr? 
runtime_attr_create_ploidy + RuntimeAttr? runtime_override_concat_input_vcfs RuntimeAttr? runtime_attr_prepare_truth RuntimeAttr? runtime_attr_svcluster RuntimeAttr? runtime_override_concat_vcfs_pesr } - Array[File] vcfs_ = flatten(select_all([clustered_manta_vcfs, clustered_melt_vcfs, clustered_scramble_vcfs, clustered_wham_vcfs, clustered_depth_vcfs])) - scatter (i in range(length(vcfs_))) { - call svc.PreprocessVcf { + call tasks_cluster.CreatePloidyTableFromPed { + input: + ped_file=ped_file, + contig_list=contig_list, + retain_female_chr_y=false, + chr_x=chr_x, + chr_y=chr_y, + output_prefix="~{prefix}.ploidy", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_create_ploidy + } + + Array[Array[File]] vcf_matrix = transpose(select_all([clustered_manta_vcfs, clustered_melt_vcfs, clustered_scramble_vcfs, clustered_wham_vcfs, clustered_depth_vcfs])) + Array[Array[File]] vcf_index_matrix = transpose(select_all([clustered_manta_vcf_indexes, clustered_melt_vcf_indexes, clustered_scramble_vcf_indexes, clustered_wham_vcf_indexes, clustered_depth_vcf_indexes])) + scatter (i in range(length(vcf_matrix))) { + call tasks_cohort.ConcatVcfs as ConcatInputVcfs { + input: + vcfs=vcf_matrix[i], + vcfs_idx=vcf_index_matrix[i], + allow_overlaps=true, + outfile_prefix="~{prefix}.join_raw_calls.concat_batch_~{i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_input_vcfs + } + } + + scatter (i in range(length(ConcatInputVcfs.concat_vcf))) { + call format.FormatVcfForGatk { input: - vcf=vcfs_[i], - ploidy_table=ploidy_table, - args=preprocess_args, - output_prefix="~{cohort}.join_raw_calls.preprocess_~{i}", + vcf=ConcatInputVcfs.concat_vcf[i], + ped_file=ped_file, + contig_list=contig_list, + prefix="~{prefix}.join_raw_calls.format_~{i}", sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_prepare_truth + sv_base_mini_docker=sv_base_mini_docker } } - Array[String] contigs = transpose(read_tsv(contig_list))[0] - scatter (contig in contigs) { + scatter (contig in read_lines(contig_list)) { call tasks_cluster.SVCluster { input: - vcfs=PreprocessVcf.out, - ploidy_table=ploidy_table, - output_prefix="~{cohort}.join_raw_calls.~{contig}", + vcfs=FormatVcfForGatk.gatk_formatted_vcf, + ploidy_table=CreatePloidyTableFromPed.out, + output_prefix="~{prefix}.join_raw_calls.~{contig}", contig=contig, fast_mode=true, algorithm="SINGLE_LINKAGE", @@ -69,7 +103,7 @@ workflow JoinRawCalls { reference_fasta_fai=reference_fasta_fai, reference_dict=reference_dict, java_mem_fraction=java_mem_fraction, - variant_prefix="~{cohort}_~{contig}_", + variant_prefix="~{prefix}_~{contig}_", gatk_docker=gatk_docker, runtime_attr_override=runtime_attr_svcluster } @@ -80,7 +114,7 @@ workflow JoinRawCalls { vcfs=SVCluster.out, vcfs_idx=SVCluster.out_index, naive=true, - outfile_prefix="~{cohort}.join_raw_calls", + outfile_prefix="~{prefix}.join_raw_calls", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_concat_vcfs_pesr } diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index 7a8fd10b2..a3b40f8e2 100644 --- a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -170,7 +170,6 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_preconcat_clean_final RuntimeAttr? runtime_override_hail_merge_clean_final RuntimeAttr? runtime_override_fix_header_clean_final - RuntimeAttr? runtime_override_fix_bad_ends RuntimeAttr? runtime_override_clean_vcf_1a RuntimeAttr? 
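A note on the regrouping in JoinRawCalls: each per-caller input array is indexed by batch, so transpose(select_all([...])) turns the caller-major arrays into one row per batch holding that batch's VCF from every caller that was run. Each row is concatenated, formatted for GATK, and only then clustered per contig. In toy Python terms:

    # Toy illustration of the WDL transpose/select_all regrouping:
    # caller-major arrays in, batch-major rows out. File names are
    # placeholders.
    manta = ["manta.batch0.vcf.gz", "manta.batch1.vcf.gz"]
    wham = ["wham.batch0.vcf.gz", "wham.batch1.vcf.gz"]
    depth = ["depth.batch0.vcf.gz", "depth.batch1.vcf.gz"]
    melt = None  # caller not run; select_all drops it

    per_caller = [v for v in (manta, melt, wham, depth) if v is not None]
    per_batch = [list(row) for row in zip(*per_caller)]  # WDL transpose()

    assert per_batch[0] == ["manta.batch0.vcf.gz",
                            "wham.batch0.vcf.gz",
                            "depth.batch0.vcf.gz"]
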
runtime_override_clean_vcf_2 @@ -182,6 +181,7 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + RuntimeAttr? runtime_attr_format_clean RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b RuntimeAttr? runtime_attr_override_sort_bed_1b @@ -340,7 +340,7 @@ workflow MakeCohortVcf { complex_resolve_vcfs=ResolveComplexVariants.complex_resolve_vcfs, complex_resolve_vcf_indexes=ResolveComplexVariants.complex_resolve_vcf_indexes, depth_vcfs=depth_vcfs, - merged_ped_file=ped_file, + ped_file=ped_file, bincov_files=bincov_files, depth_gt_rd_sep_files=depth_gt_rd_sep_files, median_coverage_files=median_coverage_files, @@ -375,7 +375,7 @@ workflow MakeCohortVcf { complex_genotype_vcfs=GenotypeComplexVariants.complex_genotype_vcfs, complex_resolve_bothside_pass_lists=ResolveComplexVariants.complex_resolve_bothside_pass_lists, complex_resolve_background_fail_lists=ResolveComplexVariants.complex_resolve_background_fail_lists, - merged_ped_file=ped_file, + ped_file=ped_file, contig_list=contig_list, allosome_fai=allosome_fai, chr_x=chr_x, @@ -441,7 +441,7 @@ workflow MakeCohortVcf { runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, - runtime_override_fix_bad_ends=runtime_override_fix_bad_ends + runtime_attr_format=runtime_attr_format_clean } call VcfQc.MainVcfQc { diff --git a/wdl/PESRClustering.wdl b/wdl/PESRClustering.wdl index 1405e4106..617ac0b00 100644 --- a/wdl/PESRClustering.wdl +++ b/wdl/PESRClustering.wdl @@ -128,8 +128,6 @@ task PreparePESRVcfs { File ploidy_table Int min_size File? script - String? remove_infos - String? remove_formats String output_prefix String sv_pipeline_docker RuntimeAttr? runtime_attr_override @@ -162,17 +160,16 @@ task PreparePESRVcfs { --vcf $VCF \ --out tmp.vcf.gz \ --ploidy-table ~{ploidy_table} \ - ~{"--remove-infos " + remove_infos} \ - ~{"--remove-formats " + remove_formats} + --fix-end # Interval, contig, and size filtering bcftools query -f '%CHROM\t%POS\t%POS\t%ID\t%SVTYPE\n%CHROM\t%END\t%END\t%ID\t%SVTYPE\n%CHR2\t%END2\t%END2\t%ID\t%SVTYPE\n' tmp.vcf.gz \ - | awk '$1!="."' \ + | awk '$1!="." && $2!="."' \ | sort -k1,1V -k2,2n -k3,3n \ > ends.bed bedtools intersect -sorted -u -wa -g genome.file -wa -a ends.bed -b ~{exclude_intervals} | cut -f4 | sort | uniq \ > excluded_vids.list - bcftools view -i 'ID!=@excluded_vids.list && (INFO/SVLEN="." || INFO/SVLEN>=~{min_size})' tmp.vcf.gz \ + bcftools view -i 'ID!=@excluded_vids.list && (INFO/SVLEN="." || INFO/SVLEN=-1 || INFO/SVLEN>=~{min_size})' tmp.vcf.gz \ -Oz -o out/$SAMPLE_NUM.$NAME.vcf.gz tabix out/$SAMPLE_NUM.$NAME.vcf.gz i=$((i+1)) diff --git a/wdl/SVConcordance.wdl b/wdl/SVConcordance.wdl index 3e144cc19..cd50b40f0 100644 --- a/wdl/SVConcordance.wdl +++ b/wdl/SVConcordance.wdl @@ -5,99 +5,32 @@ import "TasksMakeCohortVcf.wdl" as tasks_cohort workflow SVConcordance { input { + # Vcfs must be formatted using FormatVcfForGatk (if unsure, check for ECN FORMAT field) File eval_vcf File truth_vcf - - File ploidy_table - String cohort - - Boolean? run_svutils_truth_vcf - Boolean? run_formatter_truth_vcf - String? formatter_truth_args - - Boolean? run_svutils_eval_vcf - Boolean? run_formatter_eval_vcf - String? formatter_eval_args - - # For testing - File? 
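The updated filter in PreparePESRVcfs now also drops BED rows whose start coordinate is missing and admits SVLEN of -1 alongside ".". The size predicate, restated as plain Python for clarity (treating -1 as an unknown-length sentinel, e.g. for insertions, is an assumption about the encoding):

    # The bcftools expression 'INFO/SVLEN="." || INFO/SVLEN=-1 ||
    # INFO/SVLEN>=min_size' as a Python predicate.
    def passes_min_size(svlen, min_size):
        if svlen is None or svlen == -1:  # '.' or unknown-length sentinel
            return True
        return svlen >= min_size

    assert passes_min_size(None, 50)   # SVLEN="."
    assert passes_min_size(-1, 50)     # unknown length
    assert not passes_min_size(30, 50)
    assert passes_min_size(5000, 50)
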
svtk_to_gatk_script + String output_prefix File contig_list File reference_dict String gatk_docker String sv_base_mini_docker - String sv_pipeline_docker - String sv_utils_docker Float? java_mem_fraction - RuntimeAttr? runtime_attr_svutils_truth - RuntimeAttr? runtime_attr_format_truth - RuntimeAttr? runtime_attr_svutils_eval RuntimeAttr? runtime_attr_format_eval + RuntimeAttr? runtime_attr_format_truth RuntimeAttr? runtime_attr_sv_concordance RuntimeAttr? runtime_attr_postprocess RuntimeAttr? runtime_override_concat_shards } - Boolean run_svutils_truth_vcf_ = select_first([run_svutils_truth_vcf, true]) - Boolean run_formatter_truth_vcf_ = select_first([run_formatter_truth_vcf, true]) - - Boolean run_svutils_eval_vcf_ = select_first([run_svutils_eval_vcf, true]) - Boolean run_formatter_eval_vcf_ = select_first([run_formatter_eval_vcf, true]) - - if (run_svutils_truth_vcf_) { - call SvutilsFixVcf as SvutilsTruth { - input: - vcf=truth_vcf, - output_prefix="~{cohort}.svutils_truth", - sv_utils_docker=sv_utils_docker, - runtime_attr_override=runtime_attr_svutils_truth - } - } - if (run_formatter_truth_vcf_) { - call PreprocessVcf as FormatTruth { - input: - vcf=select_first([SvutilsTruth.out, truth_vcf]), - ploidy_table=ploidy_table, - args=formatter_truth_args, - output_prefix="~{cohort}.format_truth", - script=svtk_to_gatk_script, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_format_truth - } - } - - if (run_svutils_eval_vcf_) { - call SvutilsFixVcf as SvutilsEval { - input: - vcf=eval_vcf, - output_prefix="~{cohort}.svutils_eval", - sv_utils_docker=sv_utils_docker, - runtime_attr_override=runtime_attr_svutils_eval - } - } - if (run_formatter_eval_vcf_) { - call PreprocessVcf as FormatEval { - input: - vcf=select_first([SvutilsEval.out, eval_vcf]), - ploidy_table=ploidy_table, - args=formatter_eval_args, - output_prefix="~{cohort}.format_eval", - script=svtk_to_gatk_script, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_attr_format_eval - } - } - - Array[String] contigs = transpose(read_tsv(contig_list))[0] - scatter (contig in contigs) { + scatter (contig in read_lines(contig_list)) { call SVConcordanceTask { input: - eval_vcf=select_first([FormatEval.out, SvutilsEval.out, eval_vcf]), - truth_vcf=select_first([FormatTruth.out, SvutilsTruth.out, truth_vcf]), - output_prefix="~{cohort}.concordance.~{contig}", + eval_vcf=eval_vcf, + truth_vcf=truth_vcf, + output_prefix="~{output_prefix}.concordance.~{contig}", contig=contig, reference_dict=reference_dict, java_mem_fraction=java_mem_fraction, @@ -111,7 +44,7 @@ workflow SVConcordance { vcfs=SVConcordanceTask.out, vcfs_idx=SVConcordanceTask.out_index, naive=true, - outfile_prefix="~{cohort}.concordance", + outfile_prefix="~{output_prefix}.concordance", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_concat_shards } @@ -119,102 +52,6 @@ workflow SVConcordance { output { File concordance_vcf = ConcatVcfs.concat_vcf File concordance_vcf_index = ConcatVcfs.concat_vcf_idx - File? filtered_eval_records_vcf = FormatEval.filtered - File? filtered_eval_records_index =FormatEval.filtered_index - File? filtered_truth_records_vcf = FormatTruth.filtered - File? filtered_truth_records_index = FormatTruth.filtered_index - } -} - -task SvutilsFixVcf { - input { - File vcf - String output_prefix - String sv_utils_docker - RuntimeAttr? 
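Per the new comment at the top of SVConcordance, inputs are now expected to be pre-formatted with FormatVcfForGatk rather than formatted inside the workflow, and the ECN FORMAT field is the telltale. A quick check along those lines:

    import sys
    import pysam

    # Quick sanity check suggested by the workflow comment: a
    # GATK-formatted VCF should declare the ECN FORMAT field.
    def is_gatk_formatted(vcf_path):
        with pysam.VariantFile(vcf_path) as vcf:
            return "ECN" in vcf.header.formats

    if __name__ == "__main__":
        print(is_gatk_formatted(sys.argv[1]))
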
runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: ceil(10 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - output { - File out = "~{output_prefix}.vcf.gz" - File out_index = "~{output_prefix}.vcf.gz.tbi" - } - command <<< - set -euo pipefail - sv-utils fix-vcf ~{vcf} ~{output_prefix}.vcf.gz - >>> - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_utils_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - -task PreprocessVcf { - input { - File vcf - File ploidy_table - File? script - String? args - String output_prefix - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: ceil(10 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - output { - File out = "~{output_prefix}.vcf.gz" - File out_index = "~{output_prefix}.vcf.gz.tbi" - File filtered = "~{output_prefix}.filtered_records.vcf.gz" - File filtered_index = "~{output_prefix}.filtered_records.vcf.gz.tbi" - } - command <<< - set -euo pipefail - - # Convert format - python ~{default="/opt/sv-pipeline/scripts/format_svtk_vcf_for_gatk.py" script} \ - --vcf ~{vcf} \ - --out tmp.vcf.gz \ - --filter-out ~{output_prefix}.filtered_records.vcf.gz \ - --ploidy-table ~{ploidy_table} \ - ~{args} - - # TODO Filter invalid records with SVLEN=0, only needed for legacy runs that used svtk cluster in ClusterBatch - bcftools view --no-version -i 'INFO/SVLEN="." || INFO/SVLEN>0' tmp.vcf.gz -Oz -o ~{output_prefix}.vcf.gz - - tabix ~{output_prefix}.vcf.gz - tabix ~{output_prefix}.filtered_records.vcf.gz - >>> - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } } @@ -226,6 +63,7 @@ task SVConcordanceTask { File reference_dict String? contig String? additional_args + Float? java_mem_fraction String gatk_docker RuntimeAttr? 
runtime_attr_override @@ -242,7 +80,7 @@ task SVConcordanceTask { RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 3.75, + mem_gb: 16, disk_gb: ceil(10 + size(eval_vcf, "GB") * 2 + size(truth_vcf, "GB")), boot_disk_gb: 10, preemptible_tries: 3, @@ -276,7 +114,6 @@ task SVConcordanceTask { --eval ~{eval_vcf} \ --truth ~{truth_vcf} \ -O ~{output_prefix}.vcf.gz \ - --force-biallelic-dups \ ~{additional_args} >>> runtime { diff --git a/wdl/ScatterCpxGenotyping.wdl b/wdl/ScatterCpxGenotyping.wdl index 694059784..2315f79ec 100644 --- a/wdl/ScatterCpxGenotyping.wdl +++ b/wdl/ScatterCpxGenotyping.wdl @@ -22,7 +22,7 @@ workflow ScatterCpxGenotyping { Int n_per_split_large Int n_rd_test_bins String prefix - File merged_ped_file + File ped_file String contig File ref_dict @@ -82,7 +82,7 @@ workflow ScatterCpxGenotyping { n_per_split_small=n_per_split_small, n_rd_test_bins=n_rd_test_bins, prefix=prefix, - merged_ped_file=merged_ped_file, + ped_file=ped_file, contig=contig, ref_dict=ref_dict, linux_docker=linux_docker, diff --git a/wdl/TasksClusterBatch.wdl b/wdl/TasksClusterBatch.wdl index 3bf35e9de..cab695ca9 100644 --- a/wdl/TasksClusterBatch.wdl +++ b/wdl/TasksClusterBatch.wdl @@ -28,12 +28,15 @@ task SVCluster { Float? depth_sample_overlap Float? depth_interval_overlap + Float? depth_size_similarity Int? depth_breakend_window Float? mixed_sample_overlap Float? mixed_interval_overlap + Float? mixed_size_similarity Int? mixed_breakend_window Float? pesr_sample_overlap Float? pesr_interval_overlap + Float? pesr_size_similarity Int? pesr_breakend_window File reference_fasta @@ -98,24 +101,27 @@ task SVCluster { --arguments_file arguments.txt \ --output ~{output_prefix}.vcf.gz \ --ploidy-table ~{ploidy_table} \ - --variant-prefix ~{variant_prefix} \ --reference ~{reference_fasta} \ ~{"-L " + contig} \ ~{true="--fast-mode" false="" fast_mode} \ ~{true="--enable-cnv" false="" enable_cnv} \ ~{true="--omit-members" false="" omit_members} \ ~{true="--default-no-call" false="" default_no_call} \ + ~{"--variant-prefix " + variant_prefix} \ ~{"--algorithm " + algorithm} \ ~{"--defrag-padding-fraction " + defrag_padding_fraction} \ ~{"--defrag-sample-overlap " + defrag_sample_overlap} \ ~{"--depth-sample-overlap " + depth_sample_overlap} \ ~{"--depth-interval-overlap " + depth_interval_overlap} \ + ~{"--depth-size-similarity " + depth_size_similarity} \ ~{"--depth-breakend-window " + depth_breakend_window} \ ~{"--mixed-sample-overlap " + mixed_sample_overlap} \ ~{"--mixed-interval-overlap " + mixed_interval_overlap} \ + ~{"--mixed-size-similarity " + mixed_size_similarity} \ ~{"--mixed-breakend-window " + mixed_breakend_window} \ ~{"--pesr-sample-overlap " + pesr_sample_overlap} \ ~{"--pesr-interval-overlap " + pesr_interval_overlap} \ + ~{"--pesr-size-similarity " + pesr_size_similarity} \ ~{"--pesr-breakend-window " + pesr_breakend_window} \ ~{"--insertion-length-summary-strategy " + insertion_length_summary_strategy} \ ~{"--breakpoint-summary-strategy " + breakpoint_summary_strategy} \ @@ -333,6 +339,7 @@ task CreatePloidyTableFromPed { File ped_file File? script File contig_list + Boolean retain_female_chr_y = false String? chr_x String? 
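The new --depth/--mixed/--pesr-size-similarity parameters expose another SVCluster linkage criterion alongside interval overlap and breakend windows. A hedged sketch of the reciprocal size-similarity metric such thresholds typically gate on; the exact definition used by GATK's SVCluster is not shown in this patch:

    # Hedged sketch of a reciprocal size-similarity metric in [0, 1]:
    # min length over max length. The precise GATK definition may differ.
    def size_similarity(len_a, len_b):
        a, b = abs(len_a), abs(len_b)
        if a == 0 or b == 0:
            return 0.0
        return min(a, b) / max(a, b)

    assert size_similarity(100, 100) == 1.0
    assert size_similarity(50, 100) == 0.5
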
chr_y String output_prefix @@ -350,20 +357,26 @@ task CreatePloidyTableFromPed { } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + String output_file = if retain_female_chr_y then "~{output_prefix}.FEMALE_chrY_1.tsv" else "~{output_prefix}.tsv" + output { - File out = "~{output_prefix}.tsv" + File out = "~{output_file}" } command <<< set -euo pipefail python ~{default="/opt/sv-pipeline/scripts/ploidy_table_from_ped.py" script} \ --ped ~{ped_file} \ - --out ~{output_prefix}.tsv.tmp \ + --out tmp.tsv \ --contigs ~{contig_list} \ ~{"--chr-x " + chr_x} \ ~{"--chr-y " + chr_y} # TODO : For now we retain female Y genotypes for metric generation - sed -e 's/\t0/\t1/g' ~{output_prefix}.tsv.tmp > ~{output_prefix}.tsv + if ~{retain_female_chr_y}; then + sed -e 's/\t0/\t1/g' tmp.tsv > ~{output_file} + else + mv tmp.tsv ~{output_file} + fi >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index dba6b41e1..1cf9237d7 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -1006,86 +1006,3 @@ task ScatterVcf { Array[File] shards = glob("~{prefix}.shard_*.vcf.gz") } } - -task FixEndsRescaleGQ { - input { - File vcf - String prefix - - Boolean? fix_ends - Boolean? rescale_gq - - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: ceil(10 + size(vcf, "GB") * 2), - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - String outfile = "~{prefix}.vcf.gz" - Boolean fix_ends_ = select_first([fix_ends, true]) - Boolean rescale_gq_ = select_first([rescale_gq, true]) - - output { - File out = "~{outfile}" - File out_idx = "~{outfile}.tbi" - } - command <<< - - set -euo pipefail - - python <>> - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - -
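Finally, on the retain_female_chr_y branch in CreatePloidyTableFromPed above: the `sed -e 's/\t0/\t1/g'` lifts every ploidy of 0 in the table to 1, which in a standard table only affects female chrY entries, so those samples still receive chrY genotypes for metric generation. The same transform field-wise in Python (equivalent to the sed for single-digit ploidies):

    # Field-wise restatement of `sed -e 's/\t0/\t1/g'` on the ploidy
    # table: lift ploidy 0 (female chrY in a standard table) to 1.
    def retain_female_chr_y(table_text):
        lines = table_text.splitlines()
        fixed = [lines[0]]  # header row unchanged
        for line in lines[1:]:
            fields = line.split("\t")
            fixed.append("\t".join("1" if f == "0" else f for f in fields))
        return "\n".join(fixed)
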