From c3e9818453375a583a2bec5a808caa3744528bde Mon Sep 17 00:00:00 2001 From: samuelklee Date: Thu, 10 Jan 2019 10:03:55 -0500 Subject: [PATCH] Added separate allele-count thresholds for the normal and tumor in ModelSegments. (#5556) --- .../somatic/cnv_somatic_pair_workflow.wdl | 25 +++++++---- .../tools/copynumber/ModelSegments.java | 42 ++++++++++++------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl index ba5d1eb1af2..8b3754cd3b3 100644 --- a/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl +++ b/scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl @@ -90,6 +90,7 @@ workflow CNVSomaticPairWorkflow { ############################################## Int? max_num_segments_per_chromosome Int? min_total_allele_count + Int? min_total_allele_count_normal Float? genotyping_homozygous_log_ratio_threshold Float? genotyping_base_error_rate Float? kernel_variance_copy_ratio @@ -218,6 +219,7 @@ workflow CNVSomaticPairWorkflow { normal_allelic_counts = CollectAllelicCountsNormal.allelic_counts, max_num_segments_per_chromosome = max_num_segments_per_chromosome, min_total_allele_count = min_total_allele_count, + min_total_allele_count_normal = min_total_allele_count_normal, genotyping_homozygous_log_ratio_threshold = genotyping_homozygous_log_ratio_threshold, genotyping_base_error_rate = genotyping_base_error_rate, kernel_variance_copy_ratio = kernel_variance_copy_ratio, @@ -345,7 +347,7 @@ workflow CNVSomaticPairWorkflow { denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios, allelic_counts = CollectAllelicCountsNormal.allelic_counts, max_num_segments_per_chromosome = max_num_segments_per_chromosome, - min_total_allele_count = min_total_allele_count, + min_total_allele_count = min_total_allele_count_normal, genotyping_homozygous_log_ratio_threshold = genotyping_homozygous_log_ratio_threshold, genotyping_base_error_rate = genotyping_base_error_rate, kernel_variance_copy_ratio = kernel_variance_copy_ratio, @@ -542,6 +544,7 @@ task ModelSegments { File? normal_allelic_counts Int? max_num_segments_per_chromosome Int? min_total_allele_count + Int? min_total_allele_count_normal Float? genotyping_homozygous_log_ratio_threshold Float? genotyping_base_error_rate Float? kernel_variance_copy_ratio @@ -577,6 +580,11 @@ task ModelSegments { # If optional output_dir not specified, use "out" String output_dir_ = select_first([output_dir, "out"]) + # default values are min_total_allele_count_ = 0 in matched-normal mode + # = 30 in case-only mode + Int default_min_total_allele_count = if defined(normal_allelic_counts) then 0 else 30 + Int min_total_allele_count_ = select_first([min_total_allele_count, default_min_total_allele_count]) + command <<< set -e mkdir ${output_dir_} @@ -586,7 +594,8 @@ task ModelSegments { --denoised-copy-ratios ${denoised_copy_ratios} \ --allelic-counts ${allelic_counts} \ ${"--normal-allelic-counts " + normal_allelic_counts} \ - --minimum-total-allele-count ${default="30" min_total_allele_count} \ + --minimum-total-allele-count-case ${min_total_allele_count_} \ + --minimum-total-allele-count-normal ${default="30" min_total_allele_count_normal} \ --genotyping-homozygous-log-ratio-threshold ${default="-10.0" genotyping_homozygous_log_ratio_threshold} \ --genotyping-base-error-rate ${default="0.05" genotyping_base_error_rate} \ --maximum-number-of-segments-per-chromosome ${default="1000" max_num_segments_per_chromosome} \ @@ -597,14 +606,14 @@ task ModelSegments { --window-size ${sep=" --window-size " window_sizes} \ --number-of-changepoints-penalty-factor ${default="1.0" num_changepoints_penalty_factor} \ --minor-allele-fraction-prior-alpha ${default="25.0" minor_allele_fraction_prior_alpha} \ - --number-of-samples-copy-ratio ${default=100 num_samples_copy_ratio} \ - --number-of-burn-in-samples-copy-ratio ${default=50 num_burn_in_copy_ratio} \ - --number-of-samples-allele-fraction ${default=100 num_samples_allele_fraction} \ - --number-of-burn-in-samples-allele-fraction ${default=50 num_burn_in_allele_fraction} \ + --number-of-samples-copy-ratio ${default="100" num_samples_copy_ratio} \ + --number-of-burn-in-samples-copy-ratio ${default="50" num_burn_in_copy_ratio} \ + --number-of-samples-allele-fraction ${default="100" num_samples_allele_fraction} \ + --number-of-burn-in-samples-allele-fraction ${default="50" num_burn_in_allele_fraction} \ --smoothing-credible-interval-threshold-copy-ratio ${default="2.0" smoothing_threshold_copy_ratio} \ --smoothing-credible-interval-threshold-allele-fraction ${default="2.0" smoothing_threshold_allele_fraction} \ - --maximum-number-of-smoothing-iterations ${default=10 max_num_smoothing_iterations} \ - --number-of-smoothing-iterations-per-fit ${default=0 num_smoothing_iterations_per_fit} \ + --maximum-number-of-smoothing-iterations ${default="10" max_num_smoothing_iterations} \ + --number-of-smoothing-iterations-per-fit ${default="0" num_smoothing_iterations_per_fit} \ --output ${output_dir_} \ --output-prefix ${entity_id} diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java index b61a2f132cf..c4f9b50f726 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/ModelSegments.java @@ -231,7 +231,8 @@ public final class ModelSegments extends CommandLineProgram { public static final String ALLELE_FRACTION_LEGACY_SEGMENTS_FILE_SUFFIX = ".af.igv" + SEGMENTS_FILE_SUFFIX; //het genotyping argument names - public static final String MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME = "minimum-total-allele-count"; + public static final String MINIMUM_TOTAL_ALLELE_COUNT_CASE_LONG_NAME = "minimum-total-allele-count-case"; + public static final String MINIMUM_TOTAL_ALLELE_COUNT_NORMAL_LONG_NAME = "minimum-total-allele-count-normal"; public static final String GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_LONG_NAME = "genotyping-homozygous-log-ratio-threshold"; public static final String GENOTYPING_BASE_ERROR_RATE_LONG_NAME = "genotyping-base-error-rate"; @@ -248,8 +249,8 @@ public final class ModelSegments extends CommandLineProgram { public static final String MINOR_ALLELE_FRACTION_PRIOR_ALPHA_LONG_NAME = "minor-allele-fraction-prior-alpha"; public static final String NUMBER_OF_SAMPLES_COPY_RATIO_LONG_NAME = "number-of-samples-copy-ratio"; public static final String NUMBER_OF_BURN_IN_SAMPLES_COPY_RATIO_LONG_NAME = "number-of-burn-in-samples-copy-ratio"; - public static final String NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME = "number-of-samples-allele-fraction"; - public static final String NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME = "number-of-burn-in-samples-allele-fraction"; + public static final String NUMBER_OF_SAMPLES_ALLELE_FRACTION_LONG_NAME = "number-of-samples-allele-fraction"; + public static final String NUMBER_OF_BURN_IN_SAMPLES_ALLELE_FRACTION_LONG_NAME = "number-of-burn-in-samples-allele-fraction"; //smoothing argument names public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_LONG_NAME = "smoothing-credible-interval-threshold-copy-ratio"; @@ -292,12 +293,22 @@ public final class ModelSegments extends CommandLineProgram { private String outputDir; @Argument( - doc = "Minimum total count for filtering allelic counts, if available.", - fullName = MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME, + doc = "Minimum total count for filtering allelic counts in the case sample, if available. " + + "The default value of zero is appropriate for matched-normal mode; " + + "increase to an appropriate value for case-only mode.", + fullName = MINIMUM_TOTAL_ALLELE_COUNT_CASE_LONG_NAME, minValue = 0, optional = true ) - private int minTotalAlleleCount = 30; + private int minTotalAlleleCountCase = 0; + + @Argument( + doc = "Minimum total count for filtering allelic counts in the matched-normal sample, if available.", + fullName = MINIMUM_TOTAL_ALLELE_COUNT_NORMAL_LONG_NAME, + minValue = 0, + optional = true + ) + private int minTotalAlleleCountNormal = 30; @Argument( doc = "Log-ratio threshold for genotyping and filtering homozygous allelic counts, if available. " + @@ -414,7 +425,7 @@ public final class ModelSegments extends CommandLineProgram { @Argument( doc = "Total number of MCMC samples for allele-fraction model.", - fullName = NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME, + fullName = NUMBER_OF_SAMPLES_ALLELE_FRACTION_LONG_NAME, optional = true, minValue = 1 ) @@ -422,7 +433,7 @@ public final class ModelSegments extends CommandLineProgram { @Argument( doc = "Number of burn-in samples to discard for allele-fraction model.", - fullName = NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME, + fullName = NUMBER_OF_BURN_IN_SAMPLES_ALLELE_FRACTION_LONG_NAME, optional = true, minValue = 0 ) @@ -619,12 +630,14 @@ private AllelicCountCollection genotypeHets(final SampleLocatableMetadata metada logger.info("Genotyping heterozygous sites from available allelic counts..."); + AllelicCountCollection filteredAllelicCounts = allelicCounts; + //filter on total count in case sample - logger.info(String.format("Filtering allelic counts with total count less than %d...", minTotalAlleleCount)); - AllelicCountCollection filteredAllelicCounts = new AllelicCountCollection( + logger.info(String.format("Filtering allelic counts with total count less than %d...", minTotalAlleleCountCase)); + filteredAllelicCounts = new AllelicCountCollection( metadata, - allelicCounts.getRecords().stream() - .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount) + filteredAllelicCounts.getRecords().stream() + .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCountCase) .collect(Collectors.toList())); logger.info(String.format("Retained %d / %d sites after filtering on total count...", filteredAllelicCounts.size(), allelicCounts.size())); @@ -645,6 +658,7 @@ private AllelicCountCollection genotypeHets(final SampleLocatableMetadata metada if (normalAllelicCounts == null) { //filter on homozygosity in case sample logger.info("No matched normal was provided, not running in matched-normal mode..."); + logger.info("Performing binomial testing and filtering homozygous allelic counts..."); hetAllelicCounts = new AllelicCountCollection( metadata, @@ -672,11 +686,11 @@ private AllelicCountCollection genotypeHets(final SampleLocatableMetadata metada } //filter on total count in matched normal - logger.info(String.format("Filtering allelic counts in matched normal with total count less than %d...", minTotalAlleleCount)); + logger.info(String.format("Filtering allelic counts in matched normal with total count less than %d...", minTotalAlleleCountNormal)); AllelicCountCollection filteredNormalAllelicCounts = new AllelicCountCollection( normalMetadata, normalAllelicCounts.getRecords().stream() - .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount) + .filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCountNormal) .collect(Collectors.toList())); logger.info(String.format("Retained %d / %d sites in matched normal after filtering on total count...", filteredNormalAllelicCounts.size(), normalAllelicCounts.size()));