diff --git a/scripts/mutect2_wdl/mutect2.wdl b/scripts/mutect2_wdl/mutect2.wdl index a5b674cd5f3..032863a08d5 100755 --- a/scripts/mutect2_wdl/mutect2.wdl +++ b/scripts/mutect2_wdl/mutect2.wdl @@ -194,6 +194,38 @@ workflow Mutect2 { } Int m2_output_size = tumor_bam_size / scatter_count + + if (run_ob_mm_filter) { + scatter (subintervals in SplitIntervals.interval_files ) { + call CollectF1R2Counts { + input: + gatk_docker = gatk_docker, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + preemptible_attempts = preemptible_attempts, + tumor_bam = tumor_bam, + tumor_bai = tumor_bai, + gatk_override = gatk_override, + disk_space = tumor_bam_size + ref_size + disk_pad, + intervals = subintervals, + extra_intervals = ob_mm_filter_training_intervals, + max_retries = max_retries + } + } + + call LearnReadOrientationModel { + input: + alt_tables = CollectF1R2Counts.alt_table, + ref_histograms = CollectF1R2Counts.ref_histogram, + alt_histograms = CollectF1R2Counts.alt_histograms, + gatk_override = gatk_override, + gatk_docker = gatk_docker, + preemptible_attempts = preemptible_attempts, + max_retries = max_retries + } + } + scatter (subintervals in SplitIntervals.interval_files ) { call M2 { input: @@ -214,6 +246,8 @@ workflow Mutect2 { m2_extra_args = m2_extra_args, make_bamout = make_bamout_or_default, artifact_prior_table = LearnReadOrientationModel.artifact_prior_table, + variants_for_contamination = variants_for_contamination, + variants_for_contamination_index = variants_for_contamination_index, compress = compress, gga_vcf = gga_vcf, gga_vcf_idx = gga_vcf_idx, @@ -222,45 +256,6 @@ workflow Mutect2 { disk_space = tumor_bam_size + normal_bam_size + ref_size + gnomad_vcf_size + m2_output_size + disk_pad } - if (defined(variants_for_contamination)) { - call GetPileupSummaries as TumorPileups { - input: - gatk_override = gatk_override, - intervals = subintervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - gatk_docker = gatk_docker, - bam = tumor_bam, - bai = tumor_bai, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - disk_space = tumor_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad - } - - if (defined(normal_bam)){ - call GetPileupSummaries as NormalPileups { - input: - gatk_override = gatk_override, - intervals = subintervals, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries, - gatk_docker = gatk_docker, - bam = normal_bam, - bai = normal_bai, - variants_for_contamination = variants_for_contamination, - variants_for_contamination_index = variants_for_contamination_index, - disk_space = normal_bam_size + ceil(size(variants_for_contamination, "GB") * small_input_to_output_multiplier) + disk_pad - - } - } - } - Float sub_vcf_size = size(M2.unfiltered_vcf, "GB") Float sub_bamout_size = size(M2.output_bamOut, "GB") } @@ -322,39 +317,10 @@ workflow Mutect2 { } } - if (run_ob_mm_filter) { - call CollectF1R2Counts { - input: - gatk_docker = gatk_docker, - ref_fasta = ref_fasta, - ref_fai = ref_fai, - ref_dict = ref_dict, - preemptible_attempts = preemptible_attempts, - tumor_bam = tumor_bam, - tumor_bai = tumor_bai, - gatk_override = gatk_override, - disk_space = tumor_bam_size + ref_size + disk_pad, - intervals = if defined(ob_mm_filter_training_intervals) then ob_mm_filter_training_intervals else intervals, - max_retries = max_retries - } - - call LearnReadOrientationModel { - input: - alt_table = CollectF1R2Counts.alt_table, - ref_histogram = CollectF1R2Counts.ref_histogram, - alt_histograms = CollectF1R2Counts.alt_histograms, - tumor_sample = CollectF1R2Counts.tumor_sample, - gatk_override = gatk_override, - gatk_docker = gatk_docker, - preemptible_attempts = preemptible_attempts, - max_retries = max_retries - } - } - if (defined(variants_for_contamination)) { call MergePileupSummaries as MergeTumorPileups { input: - input_tables = TumorPileups.pileups, + input_tables = M2.tumor_pileups, output_name = output_basename, ref_dict = ref_dict, gatk_override = gatk_override, @@ -367,7 +333,7 @@ workflow Mutect2 { if (defined(normal_bam)){ call MergePileupSummaries as MergeNormalPileups { input: - input_tables = NormalPileups.pileups, + input_tables = M2.normal_pileups, output_name = output_basename, ref_dict = ref_dict, gatk_override = gatk_override, @@ -381,7 +347,6 @@ workflow Mutect2 { call CalculateContamination { input: gatk_override = gatk_override, - intervals = intervals, ref_fasta = ref_fasta, ref_fai = ref_fai, ref_dict = ref_dict, @@ -589,7 +554,9 @@ task M2 { Boolean compress File? gga_vcf File? gga_vcf_idx - File? artifact_prior_table + File? artifact_prior_table + File? variants_for_contamination + File? variants_for_contamination_index String output_vcf = "output" + if compress then ".vcf.gz" else ".vcf" String output_vcf_index = output_vcf + if compress then ".tbi" else ".idx" @@ -639,6 +606,21 @@ task M2 { ${true='--bam-output bamout.bam' false='' make_bamout} \ ${"--orientation-bias-artifact-priors " + artifact_prior_table} \ ${m2_extra_args} + + ### GetPileupSummaries + # These must be created, even if they remain empty, as cromwell doesn't support optional output + touch tumor-pileups.table + touch normal-pileups.table + + if [[ -f "${variants_for_contamination}" ]]; then + gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${tumor_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ${variants_for_contamination} -L ${variants_for_contamination} -O tumor-pileups.table + + if [[ -f ${normal_bam} ]]; then + gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${normal_bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ${variants_for_contamination} -L ${variants_for_contamination} -O normal-pileups.table + fi + fi >>> runtime { @@ -657,6 +639,8 @@ task M2 { File output_bamOut = "bamout.bam" String tumor_sample = read_string("tumor_name.txt") String normal_sample = read_string("normal_name.txt") + File tumor_pileups = "tumor-pileups.table" + File normal_pileups = "normal-pileups.table" } } @@ -764,54 +748,6 @@ task MergeBamOuts { } } -task GetPileupSummaries { - # inputs - File? intervals - File ref_fasta - File ref_fai - File ref_dict - File bam - File bai - File? variants_for_contamination - File? variants_for_contamination_index - - File? gatk_override - - # runtime - Int? preemptible_attempts - Int? max_retries - String gatk_docker - Int? disk_space - Int? mem - - # Mem is in units of GB but our command and memory runtime values are in MB - Int machine_mem = if defined(mem) then mem * 1000 else 3000 - Int command_mem = machine_mem - 500 - - command { - set -e - - export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} - - gatk --java-options "-Xmx${command_mem}m" GetPileupSummaries -R ${ref_fasta} -I ${bam} ${"--interval-set-rule INTERSECTION -L " + intervals} \ - -V ${variants_for_contamination} -L ${variants_for_contamination} -O pileups.table - } - - runtime { - docker: gatk_docker - bootDiskSizeGb: 12 - memory: command_mem + " MB" - disks: "local-disk " + select_first([disk_space, 100]) + " HDD" - preemptible: select_first([preemptible_attempts, 10]) - maxRetries: select_first([max_retries, 3]) - } - - output { - File pileups = "pileups.table" - } - -} - task MergePileupSummaries { # input_tables needs to be optional because GetPileupSummaries is in an if-block Array[File?] input_tables @@ -912,6 +848,7 @@ task CollectF1R2Counts { File? gatk_override File? intervals + File? extra_intervals # runtime Int? max_retries @@ -938,9 +875,10 @@ task CollectF1R2Counts { gatk --java-options "-Xmx${command_mem}m" CollectF1R2Counts \ -I ${tumor_bam} -R ${ref_fasta} \ ${"-L " + intervals} \ - -alt-table "$tumor_name-alt.tsv" \ - -ref-hist "$tumor_name-ref.metrics" \ - -alt-hist "$tumor_name-alt-depth1.metrics" + ${"-isr INTERSECTION -L " + extra_intervals} \ + --alt-table "$tumor_name-alt.tsv" \ + --ref-hist "$tumor_name-ref.metrics" \ + --alt-hist "$tumor_name-alt-depth1.metrics" } runtime { @@ -963,13 +901,12 @@ task CollectF1R2Counts { # Learning step of the orientation bias mixture model, which is the recommended orientation bias filter as of September 2018 task LearnReadOrientationModel { - File alt_table - File ref_histogram - File? alt_histograms + Array[File] alt_tables + Array[File] ref_histograms + Array[File?] alt_histograms + Int alt_hist_length = length(alt_histograms) File? gatk_override - File? intervals - String tumor_sample # runtime Int? max_retries @@ -988,11 +925,15 @@ task LearnReadOrientationModel { set -e export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} + if [[ "${alt_hist_length}" > 0 ]]; then + ah_flag="-ah" + fi + gatk --java-options "-Xmx${command_mem}m" LearnReadOrientationModel \ - -alt-table ${alt_table} \ - -ref-hist ${ref_histogram} \ - -alt-hist ${alt_histograms} \ - -O "${tumor_sample}-artifact-prior-table.tsv" + -at ${sep=" -at " alt_tables} \ + -rh ${sep=" -rh " ref_histograms} \ + $ah_flag ${sep=" -ah " alt_histograms} \ + -O "artifact-prior-table.tsv" } runtime { @@ -1006,14 +947,13 @@ task LearnReadOrientationModel { } output { - File artifact_prior_table = "${tumor_sample}-artifact-prior-table.tsv" + File artifact_prior_table = "artifact-prior-table.tsv" } } task CalculateContamination { # inputs - File? intervals File ref_fasta File ref_fai File ref_dict @@ -1093,10 +1033,11 @@ task Filter { export GATK_LOCAL_JAR=${default="/root/gatk.jar" gatk_override} gatk --java-options "-Xmx${command_mem}m" FilterMutectCalls -V ${unfiltered_vcf} \ - -O ${output_vcf} \ - ${"--contamination-table " + contamination_table} \ - ${"--tumor-segmentation " + maf_segments} \ - ${m2_extra_filtering_args} + -O ${output_vcf} \ + ${"--contamination-table " + contamination_table} \ + ${"--tumor-segmentation " + maf_segments} \ + ${"-L" + intervals} \ + ${m2_extra_filtering_args} } runtime { diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2Counts.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2Counts.java index 276927047b1..c4270f7ca34 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2Counts.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/CollectF1R2Counts.java @@ -5,7 +5,7 @@ import htsjdk.samtools.util.Histogram; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; -import org.broadinstitute.hellbender.cmdline.programgroups.CoverageAnalysisProgramGroup; +import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.exceptions.UserException; @@ -44,13 +44,16 @@ @CommandLineProgramProperties( summary = "Collect F1R2 read counts for the Mutect2 orientation bias mixture model filter", oneLineSummary = "Collect F1R2 read counts for the Mutect2 orientation bias mixture model filter", - programGroup = CoverageAnalysisProgramGroup.class + programGroup = ShortVariantDiscoveryProgramGroup.class ) public class CollectF1R2Counts extends LocusWalker { public static final String ALT_DATA_TABLE_LONG_NAME = "alt-table"; + public static final String ALT_DATA_TABLE_SHORT_NAME = "at"; public static final String ALT_DEPTH1_HISTOGRAM_LONG_NAME = "alt-hist"; + public static final String ALT_DEPTH1_HISTOGRAM_SHORT_NAME = "ah"; public static final String REF_SITE_METRICS_LONG_NAME = "ref-hist"; + public static final String REF_SITE_METRICS_SHORT_NAME = "rh"; public static final String MIN_MEDIAN_MQ_LONG_NAME = "median-mq"; public static final String MIN_BASE_QUALITY_LONG_NAME = "min-bq"; public static final String MAX_DEPTH_LONG_NAME = "max-depth"; @@ -61,13 +64,13 @@ public class CollectF1R2Counts extends LocusWalker { @Argument(fullName = MIN_BASE_QUALITY_LONG_NAME, doc = "exclude bases below this quality from pileup", optional = true) private int MINIMUM_BASE_QUALITY = 20; - @Argument(fullName = ALT_DATA_TABLE_LONG_NAME, doc = "a tab-separated output table of pileup data over alt sites") + @Argument(shortName = ALT_DATA_TABLE_SHORT_NAME, fullName = ALT_DATA_TABLE_LONG_NAME, doc = "a tab-separated output table of pileup data over alt sites") private File altDataTable = null; - @Argument(fullName = REF_SITE_METRICS_LONG_NAME, doc = "a metrics file with overall summary metrics and reference context-specific depth histograms") + @Argument(shortName = REF_SITE_METRICS_SHORT_NAME, fullName = REF_SITE_METRICS_LONG_NAME, doc = "a metrics file with overall summary metrics and reference context-specific depth histograms") private File refMetricsOutput = null; - @Argument(fullName = ALT_DEPTH1_HISTOGRAM_LONG_NAME, doc = "a histogram of alt sites with alt depth = 1") + @Argument(shortName = ALT_DEPTH1_HISTOGRAM_SHORT_NAME, fullName = ALT_DEPTH1_HISTOGRAM_LONG_NAME, doc = "a histogram of alt sites with alt depth = 1") private File altMetricsOutput = null; @Argument(fullName = MAX_DEPTH_LONG_NAME, doc = "sites with depth higher than this value will be grouped", optional = true) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModel.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModel.java index 9617b2ac409..6c65866900e 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModel.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModel.java @@ -6,12 +6,14 @@ import htsjdk.samtools.util.Histogram; import htsjdk.samtools.util.IOUtil; import htsjdk.samtools.util.SequenceUtil; +import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; import org.broadinstitute.barclay.argparser.Argument; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup; +import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.utils.Nucleotide; import org.broadinstitute.hellbender.utils.Utils; @@ -45,18 +47,24 @@ public class LearnReadOrientationModel extends CommandLineProgram { public static final String MAX_EM_ITERATIONS_LONG_NAME = "num-em-iterations"; public static final String MAX_DEPTH_LONG_NAME = "max-depth"; - @Argument(fullName = CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, doc = "histograms of depths over ref sites for each reference context") - private File refHistogramTable; + @Argument(fullName = CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, + shortName = CollectF1R2Counts.REF_SITE_METRICS_SHORT_NAME, + doc = "histograms of depths over ref sites for each reference context") + private List refHistogramFiles; - @Argument(fullName = CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, doc = "a table of F1R2 and depth counts") - private File altDataTable; + @Argument(fullName = CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, + shortName = CollectF1R2Counts.ALT_DATA_TABLE_SHORT_NAME, + doc = "a table of F1R2 and depth counts") + private List altDataTables; + + @Argument(fullName = CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, + shortName = CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_SHORT_NAME, + doc = "histograms of depth 1 alt sites", optional = true) + private List altHistogramFiles = null; @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "table of artifact priors") private File output; - @Argument(fullName = CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, doc = "histograms of depth 1 alt sites", optional = true) - private File altHistogramTable = null; - @Argument(fullName = EM_CONVERGENCE_THRESHOLD_LONG_NAME, doc = "Stop the EM when the distance between parameters between iterations falls below this value", optional = true) private double converagenceThreshold = DEFAULT_CONVERGENCE_THRESHOLD; @@ -72,12 +80,10 @@ public class LearnReadOrientationModel extends CommandLineProgram { @Override protected void onStartup(){ - final MetricsFile referenceSiteMetrics = readMetricsFile(refHistogramTable); - refHistograms = referenceSiteMetrics.getAllHistograms(); + refHistograms = sumHistogramsFromFiles(refHistogramFiles); - if (altHistogramTable != null) { - final MetricsFile altSiteMetrics = readMetricsFile(altHistogramTable); - altHistograms = altSiteMetrics.getAllHistograms(); + if (altHistogramFiles != null) { + altHistograms = sumHistogramsFromFiles(altHistogramFiles); } else { altHistograms = Collections.emptyList(); } @@ -85,12 +91,11 @@ protected void onStartup(){ @Override public Object doWork(){ - final int defaultInitialListSize = 1_000_000; + final Pair> recordsSamplePair = gatherAltSiteRecords(altDataTables); + final String sample = recordsSamplePair.getLeft(); + final List records = recordsSamplePair.getRight(); - final Pair> sampleAndRecords = AltSiteRecord.readAltSiteRecords(altDataTable, defaultInitialListSize); - - final String sample = sampleAndRecords.getLeft(); - final Map> altDesignMatrixByContext = sampleAndRecords.getRight().stream() + final Map> altDesignMatrixByContext = records.stream() .collect(Collectors.groupingBy(AltSiteRecord::getReferenceContext)); final ArtifactPriorCollection artifactPriorCollection = new ArtifactPriorCollection(sample); @@ -220,7 +225,7 @@ public static List> combineAltDepthOneHistogramWithRC(final L /** * Contract: this method must be called after grouping the design matrices by context. * That is, {@param altDesignMatrix} must be a list of {@link AltSiteRecord} of a single reference context - * (which is in {@link F1R2FilterConstants.CANONICAL_KMERS}) and {@param altDesignRevComp} contains only + * (which is in F1R2FilterConstants.CANONICAL_KMERS) and {@param altDesignRevComp} contains only * {@link AltSiteRecord} of its reverse complement. */ @VisibleForTesting @@ -247,7 +252,7 @@ public static void mergeDesignMatrices(final List altDesignMatrix altDesignMatrix.addAll(altDesignMatrixRevComp.stream().map(AltSiteRecord::getReverseComplementOfRecord).collect(Collectors.toList())); } - private MetricsFile readMetricsFile(File file){ + public static MetricsFile readMetricsFile(File file){ final MetricsFile metricsFile = new MetricsFile<>(); final Reader reader = IOUtil.openFileForBufferedReading(file); metricsFile.read(reader); @@ -255,4 +260,43 @@ private MetricsFile readMetricsFile(File file){ return metricsFile; } + public static List> sumHistogramsFromFiles(final List files){ + Utils.nonNull(files, "files may not be null"); + + final List> histogramList = readMetricsFile(files.get(0)).getAllHistograms(); + for (int i = 1; i < files.size(); i++){ + final List> ithHistograms = readMetricsFile(files.get(i)).getAllHistograms(); + for (final Histogram jthHistogram : ithHistograms){ + final String refContext = jthHistogram.getValueLabel(); + final Optional> hist = histogramList.stream().filter(h -> h.getValueLabel().equals(refContext)).findAny(); + if (! hist.isPresent()){ + throw new IllegalStateException("Reference histogram is empty, which violates the invariant enforced by CollectF1R2Counts"); + } + + hist.get().addHistogram(jthHistogram); + } + } + return histogramList; + } + + public static Pair> gatherAltSiteRecords(final List tables){ + final int defaultInitialListSize = 1_000_000; + + final Pair> sampleAndRecords = AltSiteRecord.readAltSiteRecords(tables.get(0), defaultInitialListSize); + final String sample = sampleAndRecords.getLeft(); + + final List records = sampleAndRecords.getRight(); + for (int i = 1; i < tables.size(); i++){ + final Pair> ithSampleAndRecords = AltSiteRecord.readAltSiteRecords(tables.get(i), defaultInitialListSize); + if (! sample.equals(ithSampleAndRecords.getLeft())){ + throw new UserException("altDataTables contains records for two samples: " + sample + " and " + ithSampleAndRecords.getLeft()); + } + + records.addAll(ithSampleAndRecords.getRight()); + } + + return new ImmutablePair<>(sample, records); + + } + } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModelUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModelUnitTest.java new file mode 100644 index 00000000000..ac5f158390d --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/LearnReadOrientationModelUnitTest.java @@ -0,0 +1,81 @@ +package org.broadinstitute.hellbender.tools.walkers.readorientation; + +import htsjdk.samtools.util.Histogram; +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.tools.walkers.SplitIntervals; +import org.testng.Assert; +import org.testng.annotations.Test; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +public class LearnReadOrientationModelUnitTest extends CommandLineProgramTest { + @Test + public void testCombineHistograms(){ + final File refHistTruthFile = createTempFile("ref", "metrics"); + final File altHistTruthFile = createTempFile("alt", "metrics"); + final File altTableTruthFile = createTempFile("alt","tsv"); + + final File refMetricsDir = createTempDir("rh"); + final File altMetricsDir = createTempDir("ah"); + final File altTableDir = createTempDir("at"); + + // Step 0: Run CollectF1R2Counts without scatter-gather + runCommandLine(Arrays.asList( + "-R", b37_reference_20_21, + "-I", ReadOrientationModelIntegrationTest.hapmapBamSnippet, + "-L", ReadOrientationModelIntegrationTest.intervalList, + "--" + CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, altTableTruthFile.getAbsolutePath(), + "--" + CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, refHistTruthFile.getAbsolutePath(), + "--" + CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, altHistTruthFile.getAbsolutePath()), + CollectF1R2Counts.class.getSimpleName()); + + + // Step 1: SplitIntervals + final File intervalDir = createTempDir("intervals"); + final int scatterCount = 5; + runCommandLine(Arrays.asList( + "-R", b37_reference_20_21, + "-L", ReadOrientationModelIntegrationTest.intervalList, + "-O", intervalDir.getAbsolutePath(), + "-" + SplitIntervals.SCATTER_COUNT_SHORT_NAME, Integer.toString(scatterCount)), + SplitIntervals.class.getSimpleName()); + + // Step 2: CollectF1R2Counts + final File[] intervals = intervalDir.listFiles(); + for (int i = 0; i < intervals.length; i++){ + runCommandLine(Arrays.asList( + "-R", b37_reference_20_21, + "-I", ReadOrientationModelIntegrationTest.hapmapBamSnippet, + "-L", intervals[i].getAbsolutePath(), + "--" + CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, altTableDir.getAbsolutePath() + "/" + i + ".tsv", + "--" + CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, refMetricsDir.getAbsolutePath() + "/" + i + ".metrics", + "--" + CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, altMetricsDir.getAbsolutePath() + "/" + i + ".metrics"), + CollectF1R2Counts.class.getSimpleName()); + } + + final List> ref = LearnReadOrientationModel.sumHistogramsFromFiles(Arrays.asList(refMetricsDir.listFiles())); + final List> alt = LearnReadOrientationModel.sumHistogramsFromFiles(Arrays.asList(altMetricsDir.listFiles())); + final List altSites = LearnReadOrientationModel.gatherAltSiteRecords(Arrays.asList(altTableDir.listFiles())).getRight(); + + final List> refTruth = LearnReadOrientationModel.readMetricsFile(refHistTruthFile).getAllHistograms(); + final List> altTruth = LearnReadOrientationModel.readMetricsFile(altHistTruthFile).getAllHistograms(); + final List altSitesTruth = AltSiteRecord.readAltSiteRecords(altTableTruthFile).getRight(); + + + for (Histogram truth : refTruth){ + final Histogram eval = ref.stream().filter(h -> h.getValueLabel().equals(truth.getValueLabel())).findAny().get(); + Assert.assertEquals(eval.getSumOfValues(), truth.getSumOfValues()); + } + + for (Histogram truth : altTruth){ + final Histogram eval = alt.stream().filter(h -> h.getValueLabel().equals(truth.getValueLabel())).findAny().get(); + Assert.assertEquals(eval.getSum(), truth.getSum()); + Assert.assertEquals(eval.getSumOfValues(), truth.getSumOfValues()); + } + + Assert.assertEquals(altSites.size(), altSitesTruth.size()); + + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/ReadOrientationModelIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/ReadOrientationModelIntegrationTest.java index 31cdbbc8b52..356b29e4040 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/ReadOrientationModelIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/readorientation/ReadOrientationModelIntegrationTest.java @@ -1,5 +1,6 @@ package org.broadinstitute.hellbender.tools.walkers.readorientation; +import htsjdk.samtools.util.Histogram; import htsjdk.variant.variantcontext.VariantContext; import org.apache.commons.lang3.tuple.ImmutableTriple; import org.apache.commons.lang3.tuple.Triple; @@ -8,7 +9,11 @@ import org.broadinstitute.hellbender.Main; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.FeatureDataSource; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.tools.walkers.SplitIntervals; import org.broadinstitute.hellbender.tools.walkers.annotator.ReferenceBases; +import org.broadinstitute.hellbender.tools.walkers.contamination.GatherPileupSummaries; +import org.broadinstitute.hellbender.tools.walkers.contamination.GetPileupSummaries; import org.broadinstitute.hellbender.tools.walkers.mutect.FilterMutectCalls; import org.broadinstitute.hellbender.tools.walkers.mutect.M2ArgumentCollection; import org.broadinstitute.hellbender.tools.walkers.mutect.M2FiltersArgumentCollection; @@ -16,6 +21,8 @@ import org.broadinstitute.hellbender.utils.GATKProtectedVariantContextUtils; import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants; import org.testng.Assert; +import org.testng.annotations.BeforeClass; +import org.testng.annotations.DataProvider; import org.testng.annotations.Test; import java.io.File; @@ -23,49 +30,73 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.util.*; +import java.util.stream.IntStream; import java.util.stream.StreamSupport; public class ReadOrientationModelIntegrationTest extends CommandLineProgramTest { + public static final String testDir = toolsTestDir + "read_orientation_filter/"; + public static final String hapmapBamSnippet = testDir + "hapmap-20-plex-chr-20-21-read-orientation.bam"; + public static final String intervalList = testDir + "hapmap-20-plex-chr-20-21-read-orientation.interval_list"; + + @DataProvider(name = "scatterCounts") + public Object[][] getScatterCounts(){ + return new Object[][]{{1}, {5}}; + } /** - * Test the tool on a real bam to make sure that it does not crash + * Test that the tool sites of orientation bias that are manually picked out. + * Also tests scattering CollectF1R2Counts */ - @Test - public void testOnRealBam() throws IOException { - final File refMetrics = createTempFile("ref", ".table"); - final File altMetrics = createTempFile("alt", ".table"); - final File altTable = createTempFile("alt", ".table"); - - final String testDir = toolsTestDir + "read_orientation_filter/"; - // final String hapmapBamSnippet = testDir + "hapmap-20-plex-chr20-ROF.bam"; - final String hapmapBamSnippet = testDir + "hapmap-20-plex-chr-20-21-read-orientation.bam"; - - new Main().instanceMain(makeCommandLineArgs( - Arrays.asList( - "-R", b37_reference_20_21, - "-I", hapmapBamSnippet, - "--" + CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, altTable.getAbsolutePath(), - "--" + CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, refMetrics.getAbsolutePath(), - "--" + CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, altMetrics.getAbsolutePath()), - CollectF1R2Counts.class.getSimpleName())); - - int lineCount = (int) Files.lines(Paths.get(refMetrics.getAbsolutePath())).filter(l -> l.matches("^[0-9].+")).count(); - - // Ensure that we print every bin, even when the count is 0 - Assert.assertEquals(lineCount, F1R2FilterConstants.DEFAULT_MAX_DEPTH); + @Test(dataProvider = "scatterCounts") + public void testOnRealBam(final int scatterCount) throws IOException { + final File refMetricsDir = createTempDir("rh"); + final File altMetricsDir = createTempDir("ah"); + final File altTableDir = createTempDir("at"); + + // Step 1: SplitIntervals + final File intervalDir = createTempDir("intervals"); + runCommandLine( + Arrays.asList( + "-R", b37_reference_20_21, + "-L", intervalList, + "-O", intervalDir.getAbsolutePath(), + "-" + SplitIntervals.SCATTER_COUNT_SHORT_NAME, Integer.toString(scatterCount)), + SplitIntervals.class.getSimpleName()); + + // Step 2: CollectF1R2Counts + final File[] intervals = intervalDir.listFiles(); + for (int i = 0; i < intervals.length; i++){ + runCommandLine(Arrays.asList( + "-R", b37_reference_20_21, + "-I", hapmapBamSnippet, + "-L", intervals[i].getAbsolutePath(), + "--" + CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, altTableDir.getAbsolutePath() + "/" + i + ".tsv", + "--" + CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, refMetricsDir.getAbsolutePath() + "/" + i + ".metrics", + "--" + CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, altMetricsDir.getAbsolutePath() + "/" + i + ".metrics"), + CollectF1R2Counts.class.getSimpleName()); + + // Ensure that we print every bin, even when the count is 0 + final int lineCount = (int) Files.lines(Paths.get(refMetricsDir.getAbsolutePath() + "/" + i + ".metrics")).filter(l -> l.matches("^[0-9].+")).count(); + Assert.assertEquals(lineCount, F1R2FilterConstants.DEFAULT_MAX_DEPTH); + } - // Run the prior probabilities of artifact + // Step 3: LearnReadOrientationModel final File priorTable = createTempFile("prior", ".tsv"); - new Main().instanceMain(makeCommandLineArgs( - Arrays.asList( - "--" + CollectF1R2Counts.ALT_DATA_TABLE_LONG_NAME, altTable.getAbsolutePath(), - "--" + CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_LONG_NAME, altMetrics.getAbsolutePath(), - "--" + CollectF1R2Counts.REF_SITE_METRICS_LONG_NAME, refMetrics.getAbsolutePath(), - "--" + StandardArgumentDefinitions.OUTPUT_LONG_NAME, priorTable.getAbsolutePath()), - LearnReadOrientationModel.class.getSimpleName())); + final ArgumentsBuilder args = new ArgumentsBuilder(); + args.addArgument(StandardArgumentDefinitions.OUTPUT_LONG_NAME, priorTable.getAbsolutePath()); + final File[] refMetricsFiles = refMetricsDir.listFiles(); + Arrays.stream(refMetricsFiles).forEach(f -> + args.addArgument(CollectF1R2Counts.REF_SITE_METRICS_SHORT_NAME, f.getAbsolutePath())); + final File[] altMetricsFiles = altMetricsDir.listFiles(); + Arrays.stream(altMetricsFiles).forEach(f -> + args.addArgument(CollectF1R2Counts.ALT_DEPTH1_HISTOGRAM_SHORT_NAME, f.getAbsolutePath())); + final File[] altTableFiles = altTableDir.listFiles(); + Arrays.stream(altTableFiles).forEach(f -> + args.addArgument(CollectF1R2Counts.ALT_DATA_TABLE_SHORT_NAME, f.getAbsolutePath())); + runCommandLine(args.getArgsList(), LearnReadOrientationModel.class.getSimpleName()); final ArtifactPriorCollection artifactPriorCollection = ArtifactPriorCollection.readArtifactPriors(priorTable); - // Run Mutect 2 + // Step 4: Mutect 2 final File unfilteredVcf = GATKBaseTest.createTempFile("unfiltered", ".vcf"); final File filteredVcf = GATKBaseTest.createTempFile("filtered", ".vcf"); final File bamout = GATKBaseTest.createTempFile("SM-CEMAH", ".bam"); diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/read_orientation_filter/hapmap-20-plex-chr-20-21-read-orientation.interval_list b/src/test/resources/org/broadinstitute/hellbender/tools/read_orientation_filter/hapmap-20-plex-chr-20-21-read-orientation.interval_list new file mode 100644 index 00000000000..355b793eae3 --- /dev/null +++ b/src/test/resources/org/broadinstitute/hellbender/tools/read_orientation_filter/hapmap-20-plex-chr-20-21-read-orientation.interval_list @@ -0,0 +1,8 @@ +@HD VN:1.6 +@SQ SN:20 LN:63025520 M5:0dec9660ec1efaaf33281c0d5ea2560f UR:file:/Users/droazen/src/hellbender/src/test/resources/large/human_g1k_v37.20.21.fasta +@SQ SN:21 LN:48129895 M5:2979a6085bfe28e3ad6f552f361ed74d UR:file:/Users/droazen/src/hellbender/src/test/resources/large/human_g1k_v37.20.21.fasta +20 5296760 5297096 + . +20 23420905 23421244 + . +20 25755641 25755987 + . +20 34144590 34144918 + . +20 62165386 62165688 + .