Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added separate allele-count thresholds for the normal and tumor in ModelSegments. #5556

Merged
merged 1 commit into from
Jan 10, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 17 additions & 8 deletions scripts/cnv_wdl/somatic/cnv_somatic_pair_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ workflow CNVSomaticPairWorkflow {
##############################################
Int? max_num_segments_per_chromosome
Int? min_total_allele_count
Int? min_total_allele_count_normal
Float? genotyping_homozygous_log_ratio_threshold
Float? genotyping_base_error_rate
Float? kernel_variance_copy_ratio
Expand Down Expand Up @@ -218,6 +219,7 @@ workflow CNVSomaticPairWorkflow {
normal_allelic_counts = CollectAllelicCountsNormal.allelic_counts,
max_num_segments_per_chromosome = max_num_segments_per_chromosome,
min_total_allele_count = min_total_allele_count,
min_total_allele_count_normal = min_total_allele_count_normal,
genotyping_homozygous_log_ratio_threshold = genotyping_homozygous_log_ratio_threshold,
genotyping_base_error_rate = genotyping_base_error_rate,
kernel_variance_copy_ratio = kernel_variance_copy_ratio,
Expand Down Expand Up @@ -345,7 +347,7 @@ workflow CNVSomaticPairWorkflow {
denoised_copy_ratios = DenoiseReadCountsNormal.denoised_copy_ratios,
allelic_counts = CollectAllelicCountsNormal.allelic_counts,
max_num_segments_per_chromosome = max_num_segments_per_chromosome,
min_total_allele_count = min_total_allele_count,
min_total_allele_count = min_total_allele_count_normal,
genotyping_homozygous_log_ratio_threshold = genotyping_homozygous_log_ratio_threshold,
genotyping_base_error_rate = genotyping_base_error_rate,
kernel_variance_copy_ratio = kernel_variance_copy_ratio,
Expand Down Expand Up @@ -542,6 +544,7 @@ task ModelSegments {
File? normal_allelic_counts
Int? max_num_segments_per_chromosome
Int? min_total_allele_count
Int? min_total_allele_count_normal
Float? genotyping_homozygous_log_ratio_threshold
Float? genotyping_base_error_rate
Float? kernel_variance_copy_ratio
Expand Down Expand Up @@ -577,6 +580,11 @@ task ModelSegments {
# If optional output_dir not specified, use "out"
String output_dir_ = select_first([output_dir, "out"])

# default values are min_total_allele_count_ = 0 in matched-normal mode
# = 30 in case-only mode
Int default_min_total_allele_count = if defined(normal_allelic_counts) then 0 else 30
Int min_total_allele_count_ = select_first([min_total_allele_count, default_min_total_allele_count])

command <<<
set -e
mkdir ${output_dir_}
Expand All @@ -586,7 +594,8 @@ task ModelSegments {
--denoised-copy-ratios ${denoised_copy_ratios} \
--allelic-counts ${allelic_counts} \
${"--normal-allelic-counts " + normal_allelic_counts} \
--minimum-total-allele-count ${default="30" min_total_allele_count} \
--minimum-total-allele-count-case ${min_total_allele_count_} \
--minimum-total-allele-count-normal ${default="30" min_total_allele_count_normal} \
--genotyping-homozygous-log-ratio-threshold ${default="-10.0" genotyping_homozygous_log_ratio_threshold} \
--genotyping-base-error-rate ${default="0.05" genotyping_base_error_rate} \
--maximum-number-of-segments-per-chromosome ${default="1000" max_num_segments_per_chromosome} \
Expand All @@ -597,14 +606,14 @@ task ModelSegments {
--window-size ${sep=" --window-size " window_sizes} \
--number-of-changepoints-penalty-factor ${default="1.0" num_changepoints_penalty_factor} \
--minor-allele-fraction-prior-alpha ${default="25.0" minor_allele_fraction_prior_alpha} \
--number-of-samples-copy-ratio ${default=100 num_samples_copy_ratio} \
--number-of-burn-in-samples-copy-ratio ${default=50 num_burn_in_copy_ratio} \
--number-of-samples-allele-fraction ${default=100 num_samples_allele_fraction} \
--number-of-burn-in-samples-allele-fraction ${default=50 num_burn_in_allele_fraction} \
--number-of-samples-copy-ratio ${default="100" num_samples_copy_ratio} \
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not that I mind, but curiosity: Were the quotes necessary?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just another matter of style, I think these were the only numbers missing quotes in the CNV WDLs.

--number-of-burn-in-samples-copy-ratio ${default="50" num_burn_in_copy_ratio} \
--number-of-samples-allele-fraction ${default="100" num_samples_allele_fraction} \
--number-of-burn-in-samples-allele-fraction ${default="50" num_burn_in_allele_fraction} \
--smoothing-credible-interval-threshold-copy-ratio ${default="2.0" smoothing_threshold_copy_ratio} \
--smoothing-credible-interval-threshold-allele-fraction ${default="2.0" smoothing_threshold_allele_fraction} \
--maximum-number-of-smoothing-iterations ${default=10 max_num_smoothing_iterations} \
--number-of-smoothing-iterations-per-fit ${default=0 num_smoothing_iterations_per_fit} \
--maximum-number-of-smoothing-iterations ${default="10" max_num_smoothing_iterations} \
--number-of-smoothing-iterations-per-fit ${default="0" num_smoothing_iterations_per_fit} \
--output ${output_dir_} \
--output-prefix ${entity_id}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ public final class ModelSegments extends CommandLineProgram {
public static final String ALLELE_FRACTION_LEGACY_SEGMENTS_FILE_SUFFIX = ".af.igv" + SEGMENTS_FILE_SUFFIX;

//het genotyping argument names
public static final String MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME = "minimum-total-allele-count";
public static final String MINIMUM_TOTAL_ALLELE_COUNT_CASE_LONG_NAME = "minimum-total-allele-count-case";
public static final String MINIMUM_TOTAL_ALLELE_COUNT_NORMAL_LONG_NAME = "minimum-total-allele-count-normal";
public static final String GENOTYPING_HOMOZYGOUS_LOG_RATIO_THRESHOLD_LONG_NAME = "genotyping-homozygous-log-ratio-threshold";
public static final String GENOTYPING_BASE_ERROR_RATE_LONG_NAME = "genotyping-base-error-rate";

Expand All @@ -248,8 +249,8 @@ public final class ModelSegments extends CommandLineProgram {
public static final String MINOR_ALLELE_FRACTION_PRIOR_ALPHA_LONG_NAME = "minor-allele-fraction-prior-alpha";
public static final String NUMBER_OF_SAMPLES_COPY_RATIO_LONG_NAME = "number-of-samples-copy-ratio";
public static final String NUMBER_OF_BURN_IN_SAMPLES_COPY_RATIO_LONG_NAME = "number-of-burn-in-samples-copy-ratio";
public static final String NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME = "number-of-samples-allele-fraction";
public static final String NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME = "number-of-burn-in-samples-allele-fraction";
public static final String NUMBER_OF_SAMPLES_ALLELE_FRACTION_LONG_NAME = "number-of-samples-allele-fraction";
public static final String NUMBER_OF_BURN_IN_SAMPLES_ALLELE_FRACTION_LONG_NAME = "number-of-burn-in-samples-allele-fraction";

//smoothing argument names
public static final String SMOOTHING_CREDIBLE_INTERVAL_THRESHOLD_COPY_RATIO_LONG_NAME = "smoothing-credible-interval-threshold-copy-ratio";
Expand Down Expand Up @@ -292,12 +293,22 @@ public final class ModelSegments extends CommandLineProgram {
private String outputDir;

@Argument(
doc = "Minimum total count for filtering allelic counts, if available.",
fullName = MINIMUM_TOTAL_ALLELE_COUNT_LONG_NAME,
doc = "Minimum total count for filtering allelic counts in the case sample, if available. " +
"The default value of zero is appropriate for matched-normal mode; " +
"increase to an appropriate value for case-only mode.",
fullName = MINIMUM_TOTAL_ALLELE_COUNT_CASE_LONG_NAME,
minValue = 0,
optional = true
)
private int minTotalAlleleCount = 30;
private int minTotalAlleleCountCase = 0;

@Argument(
doc = "Minimum total count for filtering allelic counts in the matched-normal sample, if available.",
fullName = MINIMUM_TOTAL_ALLELE_COUNT_NORMAL_LONG_NAME,
minValue = 0,
optional = true
)
private int minTotalAlleleCountNormal = 30;

@Argument(
doc = "Log-ratio threshold for genotyping and filtering homozygous allelic counts, if available. " +
Expand Down Expand Up @@ -414,15 +425,15 @@ public final class ModelSegments extends CommandLineProgram {

@Argument(
doc = "Total number of MCMC samples for allele-fraction model.",
fullName = NUM_SAMPLES_ALLELE_FRACTION_LONG_NAME,
fullName = NUMBER_OF_SAMPLES_ALLELE_FRACTION_LONG_NAME,
optional = true,
minValue = 1
)
private int numSamplesAlleleFraction = 100;

@Argument(
doc = "Number of burn-in samples to discard for allele-fraction model.",
fullName = NUM_BURN_IN_ALLELE_FRACTION_LONG_NAME,
fullName = NUMBER_OF_BURN_IN_SAMPLES_ALLELE_FRACTION_LONG_NAME,
optional = true,
minValue = 0
)
Expand Down Expand Up @@ -619,12 +630,14 @@ private AllelicCountCollection genotypeHets(final SampleLocatableMetadata metada

logger.info("Genotyping heterozygous sites from available allelic counts...");

AllelicCountCollection filteredAllelicCounts = allelicCounts;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any reason you moved the declaration/definition up here? You end up setting it a couple lines later, so it doesn't seem to make a difference.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just a very minor matter of style. Now all subsequent transformations after the initial declaration operate on the new variable, so any transformations added later will have an identical format. (Really what happened is that I experimented with adding filtering steps and changing their order, but got bit by not noticing I had inadvertently reverted to the original counts in a later step due to a careless copy and paste...)


//filter on total count in case sample
logger.info(String.format("Filtering allelic counts with total count less than %d...", minTotalAlleleCount));
AllelicCountCollection filteredAllelicCounts = new AllelicCountCollection(
logger.info(String.format("Filtering allelic counts with total count less than %d...", minTotalAlleleCountCase));
filteredAllelicCounts = new AllelicCountCollection(
metadata,
allelicCounts.getRecords().stream()
.filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount)
filteredAllelicCounts.getRecords().stream()
.filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCountCase)
.collect(Collectors.toList()));
logger.info(String.format("Retained %d / %d sites after filtering on total count...",
filteredAllelicCounts.size(), allelicCounts.size()));
Expand All @@ -645,6 +658,7 @@ private AllelicCountCollection genotypeHets(final SampleLocatableMetadata metada
if (normalAllelicCounts == null) {
//filter on homozygosity in case sample
logger.info("No matched normal was provided, not running in matched-normal mode...");

logger.info("Performing binomial testing and filtering homozygous allelic counts...");
hetAllelicCounts = new AllelicCountCollection(
metadata,
Expand Down Expand Up @@ -672,11 +686,11 @@ private AllelicCountCollection genotypeHets(final SampleLocatableMetadata metada
}

//filter on total count in matched normal
logger.info(String.format("Filtering allelic counts in matched normal with total count less than %d...", minTotalAlleleCount));
logger.info(String.format("Filtering allelic counts in matched normal with total count less than %d...", minTotalAlleleCountNormal));
AllelicCountCollection filteredNormalAllelicCounts = new AllelicCountCollection(
normalMetadata,
normalAllelicCounts.getRecords().stream()
.filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCount)
.filter(ac -> ac.getTotalReadCount() >= minTotalAlleleCountNormal)
.collect(Collectors.toList()));
logger.info(String.format("Retained %d / %d sites in matched normal after filtering on total count...",
filteredNormalAllelicCounts.size(), normalAllelicCounts.size()));
Expand Down