From e2e76a8018c61e1733c993680ea21a105273a1d9 Mon Sep 17 00:00:00 2001 From: kirtanav98 <123595850+kirtanav98@users.noreply.github.com> Date: Wed, 16 Aug 2023 14:59:12 -0400 Subject: [PATCH] Add PlotSVCountsPerSample subworkflow to the end of ClusterBatch and FilterBatchSites (#567) * Update README to link to SV callers used. * Imported PlotSVCountsPerSample.wdl into ClusterBatch.wdl and FilterBatchSites.wdl. Added the N_IQR_cutoff input to the workflow with a default value of 6 to both wdls. Call PlotSVCountsPerSample as a subworkflow at the end of each workflow, passing the final VCFs as the input and the batch as the prefix. Added the outputs of PlotSVCountsPerSample to the workflows' outputs with unique names. Updated the JSON templates for ClusterBatch and FilterBatchSites in test and terra to include the N_IQR_cutoff input with a value of 6. Then validated ClusterBatch and FilterBatchSites workflows with womtool and the Terra validation script, and ran the updated workflows on the ref_panel_1kg test data. The runs completed successfully and produced reasonable outputs. 
* integrate PlotSVCountsPerSample into ClusterBatch and FilterBatchSites directly * integrate PlotSVCountsPerSample into ClusterBatch and FilterBatchSites directly * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * fixed issues from miniwdl * make edits to location of parameters called * make edits to location of parameters called --------- Co-authored-by: Kirtana Veeraraghavan --- .../ClusterBatch.json.tmpl | 3 +- .../FilterBatchSites.json.tmpl | 3 +- .../test/ClusterBatch/ClusterBatch.json.tmpl | 3 +- .../FilterBatch/FilterBatchSites.json.tmpl | 3 +- .../GATKSVPipelineBatch.json.tmpl | 3 +- .../GATKSVPipelinePhase1.json.tmpl | 1 + wdl/ClusterBatch.wdl | 30 ++++++++++++- wdl/FilterBatchSites.wdl | 43 ++++++++++++++----- wdl/GATKSVPipelineBatch.wdl | 11 ++++- wdl/GATKSVPipelinePhase1.wdl | 16 ++++++- 10 files changed, 96 insertions(+), 20 deletions(-) diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ClusterBatch.json.tmpl b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ClusterBatch.json.tmpl index fe71d0bea..714e889cd 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ClusterBatch.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/ClusterBatch.json.tmpl @@ -28,5 +28,6 @@ "ClusterBatch.manta_vcf_tar": "${this.std_manta_vcf_tar}", "ClusterBatch.melt_vcf_tar": "${this.std_melt_vcf_tar}", "ClusterBatch.scramble_vcf_tar": "${this.std_scramble_vcf_tar}", - "ClusterBatch.ped_file": "${workspace.cohort_ped_file}" + "ClusterBatch.ped_file": "${workspace.cohort_ped_file}", + "ClusterBatch.N_IQR_cutoff_plotting": "6" } diff --git a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/FilterBatchSites.json.tmpl 
b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/FilterBatchSites.json.tmpl index 5761fabb3..77990c7bb 100644 --- a/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/FilterBatchSites.json.tmpl +++ b/inputs/templates/terra_workspaces/cohort_mode/workflow_configurations/FilterBatchSites.json.tmpl @@ -8,5 +8,6 @@ "FilterBatchSites.melt_vcf" : "${this.clustered_melt_vcf}", "FilterBatchSites.scramble_vcf" : "${this.clustered_scramble_vcf}", "FilterBatchSites.evidence_metrics": "${this.metrics}", - "FilterBatchSites.evidence_metrics_common": "${this.metrics_common}" + "FilterBatchSites.evidence_metrics_common": "${this.metrics_common}", + "FilterBatchSites.N_IQR_cutoff_plotting": "6" } diff --git a/inputs/templates/test/ClusterBatch/ClusterBatch.json.tmpl b/inputs/templates/test/ClusterBatch/ClusterBatch.json.tmpl index 8c9acb25b..00225f1f7 100644 --- a/inputs/templates/test/ClusterBatch/ClusterBatch.json.tmpl +++ b/inputs/templates/test/ClusterBatch/ClusterBatch.json.tmpl @@ -29,5 +29,6 @@ "ClusterBatch.wham_vcf_tar": {{ test_batch.std_wham_vcf_tar | tojson }}, "ClusterBatch.manta_vcf_tar": {{ test_batch.std_manta_vcf_tar | tojson }}, "ClusterBatch.melt_vcf_tar": {{ test_batch.std_melt_vcf_tar | tojson }}, - "ClusterBatch.ped_file": {{ test_batch.ped_file | tojson }} + "ClusterBatch.ped_file": {{ test_batch.ped_file | tojson }}, + "ClusterBatch.N_IQR_cutoff_plotting": "6" } diff --git a/inputs/templates/test/FilterBatch/FilterBatchSites.json.tmpl b/inputs/templates/test/FilterBatch/FilterBatchSites.json.tmpl index cb936e511..a5b3a423b 100644 --- a/inputs/templates/test/FilterBatch/FilterBatchSites.json.tmpl +++ b/inputs/templates/test/FilterBatch/FilterBatchSites.json.tmpl @@ -7,5 +7,6 @@ "FilterBatchSites.wham_vcf" : {{ test_batch.merged_wham_vcf | tojson }}, "FilterBatchSites.melt_vcf" : {{ test_batch.merged_melt_vcf | tojson }}, "FilterBatchSites.evidence_metrics": {{ test_batch.evidence_metrics | tojson }}, - 
"FilterBatchSites.evidence_metrics_common": {{ test_batch.evidence_metrics_common | tojson }} + "FilterBatchSites.evidence_metrics_common": {{ test_batch.evidence_metrics_common | tojson }}, + "FilterBatchSites.N_IQR_cutoff_plotting": "6" } diff --git a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl index 6b61270be..4e925a3de 100644 --- a/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl +++ b/inputs/templates/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json.tmpl @@ -94,7 +94,7 @@ "GATKSVPipelineBatch.GATKSVPipelinePhase1.depth_exclude_overlap_fraction": "0.5", "GATKSVPipelineBatch.GATKSVPipelinePhase1.depth_interval_overlap": "0.8", "GATKSVPipelineBatch.GATKSVPipelinePhase1.depth_clustering_algorithm": "SINGLE_LINKAGE", - + "GATKSVPipelineBatch.N_IQR_cutoff_plotting": "6", "GATKSVPipelineBatch.GATKSVPipelinePhase1.BAF_split_size": "10000", "GATKSVPipelineBatch.GATKSVPipelinePhase1.RD_split_size": "10000", "GATKSVPipelineBatch.GATKSVPipelinePhase1.PE_split_size": "10000", @@ -105,7 +105,6 @@ "GATKSVPipelineBatch.outlier_cutoff_table" : {{ test_batch.outlier_cutoff_table | tojson }}, "GATKSVPipelineBatch.GATKSVPipelinePhase1.outlier_cutoff_nIQR": "999999", - "GATKSVPipelineBatch.GenotypeBatch.n_RD_genotype_bins": "100000", "GATKSVPipelineBatch.GenotypeBatch.n_per_split": "5000", "GATKSVPipelineBatch.GenotypeBatch.pesr_exclude_list": {{ reference_resources.pesr_exclude_list | tojson }}, diff --git a/inputs/templates/test/GATKSVPipelinePhase1/GATKSVPipelinePhase1.json.tmpl b/inputs/templates/test/GATKSVPipelinePhase1/GATKSVPipelinePhase1.json.tmpl index c5ec7ce56..471fe461a 100644 --- a/inputs/templates/test/GATKSVPipelinePhase1/GATKSVPipelinePhase1.json.tmpl +++ b/inputs/templates/test/GATKSVPipelinePhase1/GATKSVPipelinePhase1.json.tmpl @@ -45,6 +45,7 @@ "GATKSVPipelinePhase1.outlier_cutoff_table" : {{ test_batch.outlier_cutoff_table | tojson }}, 
"GATKSVPipelinePhase1.outlier_cutoff_nIQR": "6", + "GATKSVPipelinePhase1.N_IQR_cutoff_plotting": "6", "GATKSVPipelinePhase1.ploidy_sample_psi_scale": "0.001", "GATKSVPipelinePhase1.contig_ploidy_model_tar" : {{ test_batch.contig_ploidy_model_tar | tojson }}, diff --git a/wdl/ClusterBatch.wdl b/wdl/ClusterBatch.wdl index f1ce387c9..e6c55d5ae 100644 --- a/wdl/ClusterBatch.wdl +++ b/wdl/ClusterBatch.wdl @@ -5,6 +5,7 @@ import "DepthClustering.wdl" as depth import "ClusterBatchMetrics.wdl" as metrics import "TasksClusterBatch.wdl" as tasks import "Utils.wdl" as util +import "PlotSVCountsPerSample.wdl" as sv_counts workflow ClusterBatch { input { @@ -48,6 +49,9 @@ workflow ClusterBatch { Int pesr_breakend_window String? pesr_clustering_algorithm + # PlotSVCountsPerSample + Int? N_IQR_cutoff_plotting + # Module metrics parameters # Run module metrics workflow at the end - on by default Boolean? run_module_metrics @@ -81,6 +85,9 @@ workflow ClusterBatch { RuntimeAttr? runtime_attr_gatk_to_svtk_vcf_depth RuntimeAttr? runtime_override_concat_vcfs_depth RuntimeAttr? runtime_attr_exclude_intervals_pesr + RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_plot_svcounts + RuntimeAttr? 
runtime_attr_cat_outliers_preview } call util.GetSampleIdsFromVcfTar { @@ -282,6 +289,19 @@ workflow ClusterBatch { } } + if (defined(N_IQR_cutoff_plotting)){ + call sv_counts.PlotSVCountsPerSample { + input: + prefix = batch, + vcfs = [ClusterDepth.clustered_vcf, ClusterPESR_manta.clustered_vcf, ClusterPESR_wham.clustered_vcf, ClusterPESR_melt.clustered_vcf, ClusterPESR_scramble.clustered_vcf], + N_IQR_cutoff = select_first([N_IQR_cutoff_plotting]), + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_count_svs = runtime_attr_count_svs, + runtime_attr_plot_svcounts = runtime_attr_plot_svcounts, + runtime_attr_cat_outliers_preview = runtime_attr_cat_outliers_preview + } + } + output { File clustered_depth_vcf = ClusterDepth.clustered_vcf File clustered_depth_vcf_index = ClusterDepth.clustered_vcf_index @@ -293,7 +313,13 @@ workflow ClusterBatch { File? clustered_melt_vcf_index = ClusterPESR_melt.clustered_vcf_index File? clustered_scramble_vcf = ClusterPESR_scramble.clustered_vcf File? clustered_scramble_vcf_index = ClusterPESR_scramble.clustered_vcf_index - + Array[File]? clustered_sv_counts = PlotSVCountsPerSample.sv_counts + Array[File]? clustered_sv_count_plots = PlotSVCountsPerSample.sv_count_plots + File? clustered_outlier_samples_preview = PlotSVCountsPerSample.outlier_samples_preview + File? clustered_outlier_samples_with_reason = PlotSVCountsPerSample.outlier_samples_with_reason + Int? clustered_num_outlier_samples = PlotSVCountsPerSample.num_outlier_samples File? metrics_file_clusterbatch = ClusterBatchMetrics.metrics_file } -} + + +} \ No newline at end of file diff --git a/wdl/FilterBatchSites.wdl b/wdl/FilterBatchSites.wdl index 633486bcb..031204e6c 100644 --- a/wdl/FilterBatchSites.wdl +++ b/wdl/FilterBatchSites.wdl @@ -1,6 +1,7 @@ version 1.0 import "Structs.wdl" +import "PlotSVCountsPerSample.wdl" as sv_counts workflow FilterBatchSites { input { @@ -12,13 +13,19 @@ workflow FilterBatchSites { File? 
depth_vcf File evidence_metrics File evidence_metrics_common - String sv_pipeline_docker + + # PlotSVCountsPerSample metrics + Int N_IQR_cutoff_plotting = 6 + RuntimeAttr? runtime_attr_adjudicate RuntimeAttr? runtime_attr_rewrite_scores RuntimeAttr? runtime_attr_filter_annotate_vcf RuntimeAttr? runtime_attr_merge_pesr_vcfs - + RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_plot_svcounts + RuntimeAttr? runtime_attr_cat_outliers_preview + } Array[String] algorithms = ["manta", "wham", "melt", "scramble", "depth"] @@ -58,6 +65,17 @@ workflow FilterBatchSites { } } + call sv_counts.PlotSVCountsPerSample { + input: + prefix = batch, + vcfs=[FilterAnnotateVcf.annotated_vcf[0], FilterAnnotateVcf.annotated_vcf[1], FilterAnnotateVcf.annotated_vcf[2], FilterAnnotateVcf.annotated_vcf[3], FilterAnnotateVcf.annotated_vcf[4]], + N_IQR_cutoff = N_IQR_cutoff_plotting, + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_count_svs = runtime_attr_count_svs, + runtime_attr_plot_svcounts = runtime_attr_plot_svcounts, + runtime_attr_cat_outliers_preview = runtime_attr_cat_outliers_preview + } + output { File? sites_filtered_manta_vcf = FilterAnnotateVcf.annotated_vcf[0] File? 
sites_filtered_wham_vcf = FilterAnnotateVcf.annotated_vcf[1] @@ -67,7 +85,13 @@ workflow FilterBatchSites { File cutoffs = AdjudicateSV.cutoffs File scores = RewriteScores.updated_scores File RF_intermediate_files = AdjudicateSV.RF_intermediate_files + Array[File] sites_filtered_sv_counts = PlotSVCountsPerSample.sv_counts + Array[File] sites_filtered_sv_count_plots = PlotSVCountsPerSample.sv_count_plots + File sites_filtered_outlier_samples_preview = PlotSVCountsPerSample.outlier_samples_preview + File sites_filtered_outlier_samples_with_reason = PlotSVCountsPerSample.outlier_samples_with_reason + Int sites_filtered_num_outlier_samples = PlotSVCountsPerSample.num_outlier_samples } + } task AdjudicateSV { @@ -79,7 +103,7 @@ task AdjudicateSV { } RuntimeAttr default_attr = object { - cpu_cores: 1, + cpu_cores: 1, mem_gb: 3.75, disk_gb: 10, boot_disk_gb: 10, @@ -101,7 +125,7 @@ task AdjudicateSV { mv *_trainable.txt ~{batch}.RF_intermediate_files/ mv *_testable.txt ~{batch}.RF_intermediate_files/ tar -czvf ~{batch}.RF_intermediate_files.tar.gz ~{batch}.RF_intermediate_files - + >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) @@ -125,7 +149,7 @@ task RewriteScores { } RuntimeAttr default_attr = object { - cpu_cores: 1, + cpu_cores: 1, mem_gb: 3.75, disk_gb: 10, boot_disk_gb: 10, @@ -145,7 +169,7 @@ task RewriteScores { -m ~{metrics} \ -s ~{scores} \ -o ~{batch}.updated_scores - + >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) @@ -170,7 +194,7 @@ task FilterAnnotateVcf { } RuntimeAttr default_attr = object { - cpu_cores: 1, + cpu_cores: 1, mem_gb: 3.75, disk_gb: 10, boot_disk_gb: 10, @@ -200,7 +224,7 @@ task FilterAnnotateVcf { /opt/sv-pipeline/03_variant_filtering/scripts/annotate_RF_evidence.py filtered.corrected_coords.vcf.gz ~{scores} ~{prefix}.with_evidence.vcf bgzip ~{prefix}.with_evidence.vcf - + >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) @@ -212,5 +236,4 
@@ task FilterAnnotateVcf { maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) } -} - +} \ No newline at end of file diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl index 9b4d20678..077b13100 100644 --- a/wdl/GATKSVPipelineBatch.wdl +++ b/wdl/GATKSVPipelineBatch.wdl @@ -63,6 +63,9 @@ workflow GATKSVPipelineBatch { File contig_ploidy_model_tar Array[File] gcnv_model_tars + # PlotSVCountsPerSample metrics from ClusterBatch in GATKSVPipelinePhase1 + Int? N_IQR_cutoff_plotting + File? outlier_cutoff_table File qc_definitions @@ -204,6 +207,7 @@ workflow GATKSVPipelineBatch { counts=counts_files_, bincov_matrix=EvidenceQC.bincov_matrix, bincov_matrix_index=EvidenceQC.bincov_matrix_index, + N_IQR_cutoff_plotting = N_IQR_cutoff_plotting, PE_files=pe_files_, SR_files=sr_files_, SD_files=sd_files_, @@ -211,6 +215,7 @@ workflow GATKSVPipelineBatch { melt_vcfs=melt_vcfs_, scramble_vcfs=scramble_vcfs_, wham_vcfs=wham_vcfs_, + cnmops_chrom_file=autosome_file, cnmops_allo_file=allosome_file, allosome_contigs=allosome_file, @@ -420,6 +425,11 @@ workflow GATKSVPipelineBatch { File? merged_melt_vcf_index = GATKSVPipelinePhase1.melt_vcf_index File? merged_wham_vcf = GATKSVPipelinePhase1.wham_vcf File? merged_wham_vcf_index = GATKSVPipelinePhase1.wham_vcf_index + Array[File] ?clustered_sv_counts = GATKSVPipelinePhase1.clustered_sv_counts + Array[File]? clustered_sv_count_plots = GATKSVPipelinePhase1.clustered_sv_count_plots + File? clustered_outlier_samples_preview = GATKSVPipelinePhase1.clustered_outlier_samples_preview + File? clustered_outlier_samples_with_reason = GATKSVPipelinePhase1.clustered_outlier_samples_with_reason + Int? clustered_num_outlier_samples = GATKSVPipelinePhase1.clustered_num_outlier_samples File evidence_metrics = GATKSVPipelinePhase1.evidence_metrics File evidence_metrics_common = GATKSVPipelinePhase1.evidence_metrics_common @@ -432,7 +442,6 @@ workflow GATKSVPipelineBatch { File? 
sites_filtered_wham_vcf = GATKSVPipelinePhase1.sites_filtered_wham_vcf File? sites_filtered_melt_vcf = GATKSVPipelinePhase1.sites_filtered_melt_vcf File? sites_filtered_depth_vcf = GATKSVPipelinePhase1.sites_filtered_depth_vcf - File cutoffs = GATKSVPipelinePhase1.cutoffs File genotyped_pesr_vcf = GenotypeBatch.genotyped_pesr_vcf File genotyped_depth_vcf = GenotypeBatch.genotyped_depth_vcf diff --git a/wdl/GATKSVPipelinePhase1.wdl b/wdl/GATKSVPipelinePhase1.wdl index ab20065b1..3aa03f739 100644 --- a/wdl/GATKSVPipelinePhase1.wdl +++ b/wdl/GATKSVPipelinePhase1.wdl @@ -160,6 +160,8 @@ workflow GATKSVPipelinePhase1 { Int pesr_breakend_window String? pesr_clustering_algorithm + Int? N_IQR_cutoff_plotting + File? baseline_depth_vcf_cluster_batch File? baseline_manta_vcf_cluster_batch File? baseline_wham_vcf_cluster_batch @@ -183,6 +185,9 @@ workflow GATKSVPipelinePhase1 { RuntimeAttr? runtime_attr_gatk_to_svtk_vcf_depth_cluster_batch RuntimeAttr? runtime_override_concat_vcfs_depth_cluster_batch RuntimeAttr? runtime_attr_exclude_intervals_pesr_cluster_batch + RuntimeAttr? runtime_attr_count_svs + RuntimeAttr? runtime_attr_plot_svcounts + RuntimeAttr? 
runtime_attr_cat_outliers_preview ############################################################ ## GenerateBatchMetrics @@ -358,6 +363,7 @@ workflow GATKSVPipelinePhase1 { pesr_interval_overlap=pesr_interval_overlap, pesr_breakend_window=pesr_breakend_window, pesr_clustering_algorithm=pesr_clustering_algorithm, + N_IQR_cutoff_plotting = N_IQR_cutoff_plotting, run_module_metrics=run_clusterbatch_metrics, linux_docker=linux_docker, sv_pipeline_base_docker=sv_pipeline_base_docker, @@ -384,7 +390,10 @@ workflow GATKSVPipelinePhase1 { runtime_attr_svcluster_depth=runtime_attr_svcluster_depth_cluster_batch, runtime_attr_gatk_to_svtk_vcf_depth=runtime_attr_gatk_to_svtk_vcf_depth_cluster_batch, runtime_override_concat_vcfs_depth=runtime_override_concat_vcfs_depth_cluster_batch, - runtime_attr_exclude_intervals_pesr=runtime_attr_exclude_intervals_pesr_cluster_batch + runtime_attr_exclude_intervals_pesr=runtime_attr_exclude_intervals_pesr_cluster_batch, + runtime_attr_count_svs = runtime_attr_count_svs, + runtime_attr_plot_svcounts = runtime_attr_plot_svcounts, + runtime_attr_cat_outliers_preview = runtime_attr_cat_outliers_preview } call batchmetrics.GenerateBatchMetrics as GenerateBatchMetrics { @@ -500,6 +509,11 @@ workflow GATKSVPipelinePhase1 { File? melt_vcf_index = ClusterBatch.clustered_melt_vcf_index File? scramble_vcf = ClusterBatch.clustered_scramble_vcf File? scramble_vcf_index = ClusterBatch.clustered_scramble_vcf_index + Array[File]? clustered_sv_counts = ClusterBatch.clustered_sv_counts + Array[File]? clustered_sv_count_plots = ClusterBatch.clustered_sv_count_plots + File? clustered_outlier_samples_preview = ClusterBatch.clustered_outlier_samples_preview + File? clustered_outlier_samples_with_reason = ClusterBatch.clustered_outlier_samples_with_reason + Int? clustered_num_outlier_samples = ClusterBatch.clustered_num_outlier_samples File? metrics_file_clusterbatch = ClusterBatch.metrics_file_clusterbatch