diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java index 7a232e1cb01..4bd2971d872 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java @@ -2,6 +2,7 @@ import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; +import org.apache.commons.collections4.ListUtils; import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.apache.commons.math3.linear.RealMatrix; import org.apache.commons.math3.stat.descriptive.rank.Percentile; @@ -14,7 +15,7 @@ import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection; -import org.broadinstitute.hellbender.cmdline.argumentcollections.OptionalIntervalArgumentCollection; +import org.broadinstitute.hellbender.cmdline.argumentcollections.RequiredIntervalArgumentCollection; import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; @@ -22,6 +23,7 @@ import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata; import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey; import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations; @@ -36,31 +38,33 @@ import java.util.stream.IntStream; /** - * Given annotated intervals output by {@link AnnotateIntervals} and/or counts collected on those intervals output - * by {@link CollectReadCounts}, outputs a filtered Picard interval list. Parameters for filtering based on the - * annotations and counts can be adjusted. Annotation-based filters will be applied first, followed by count-based - * filters. The result may be passed via -L to other tools (e.g., {@link DetermineGermlineContigPloidy} and - * {@link GermlineCNVCaller}) to mask intervals from analysis. + * Given specified intervals, annotated intervals output by {@link AnnotateIntervals}, and/or counts output by + * {@link CollectReadCounts}, outputs a filtered Picard interval list. The set intersection of intervals from the + * specified intervals, the annotated intervals, and the first count file will be taken as the initial set of intervals + * on which to perform filtering. Parameters for filtering based on the annotations and counts can be adjusted. + * Annotation-based filters will be applied first, followed by count-based filters. The result may be passed via -L to + * other tools (e.g., {@link DetermineGermlineContigPloidy} and {@link GermlineCNVCaller}) to mask intervals from + * analysis. * *
* gatk FilterIntervals \ - * -L intervals.interval_list \ + * -L preprocessed_intervals.interval_list \ * -XL blacklist_intervals.interval_list \ * -I sample_1.counts.hdf5 \ * -I sample_2.counts.hdf5 \ @@ -87,14 +91,14 @@ * ** gatk FilterIntervals \ - * -L intervals.interval_list \ + * -L preprocessed_intervals.interval_list \ * --annotated-intervals annotated_intervals.tsv \ * -O filtered_intervals.interval_list ** ** gatk FilterIntervals \ - * -L intervals.interval_list \ + * -L preprocessed_intervals.interval_list \ * -I sample_1.counts.hdf5 \ * -I sample_2.counts.hdf5 \ * ... \ @@ -125,7 +129,6 @@ public final class FilterIntervals extends CommandLineProgram { @Argument( doc = "Input file containing annotations for genomic intervals (output of AnnotateIntervals). " + - "All intervals specified via -L must be contained. " + "Must be provided if no counts files are provided.", fullName = CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME, optional = true @@ -134,7 +137,6 @@ public final class FilterIntervals extends CommandLineProgram { @Argument( doc = "Input TSV or HDF5 files containing integer read counts in genomic intervals (output of CollectReadCounts). " + - "All intervals specified via -L must be contained. " + "Must be provided if no annotated-intervals file is provided.", fullName = StandardArgumentDefinitions.INPUT_LONG_NAME, shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME, @@ -151,7 +153,7 @@ public final class FilterIntervals extends CommandLineProgram { @ArgumentCollection protected IntervalArgumentCollection intervalArgumentCollection - = new OptionalIntervalArgumentCollection(); + = new RequiredIntervalArgumentCollection(); @Argument( doc = "Minimum allowed value for GC-content annotation (inclusive).", @@ -254,7 +256,7 @@ public final class FilterIntervals extends CommandLineProgram { ) private double extremeCountFilterPercentageOfSamples = 90.; - private SimpleIntervalCollection specifiedIntervals; + private SimpleIntervalCollection intersectedIntervals; private AnnotatedIntervalCollection annotatedIntervals; @Override @@ -281,33 +283,53 @@ private void validateFilesAndResolveIntervals() { inputReadCountFiles.forEach(IOUtils::canReadFile); Utils.validateArg(inputReadCountFiles.size() == new HashSet<>(inputReadCountFiles).size(), "List of input read-count files cannot contain duplicates."); - - //parse inputs to resolve intervals and validate annotated intervals, if provided - if (inputReadCountFiles.isEmpty()) { - //only annotated intervals provided (no counts) + + //parse inputs to resolve and intersect intervals + final LocatableMetadata metadata; + final Listresolved; + final List intersected; + if (inputAnnotatedIntervalsFile != null && inputReadCountFiles.isEmpty()) { + //only annotated intervals provided annotatedIntervals = new AnnotatedIntervalCollection(inputAnnotatedIntervalsFile); - specifiedIntervals = new SimpleIntervalCollection( - annotatedIntervals.getMetadata(), - intervalArgumentCollection.getIntervals(annotatedIntervals.getMetadata().getSequenceDictionary())); - Utils.validateArg(specifiedIntervals.size() != 0, "At least one interval must be specified."); - Utils.validateArg(new HashSet<>(annotatedIntervals.getIntervals()).containsAll(specifiedIntervals.getIntervals()), - "Annotated intervals do not contain all specified intervals."); + metadata = annotatedIntervals.getMetadata(); + resolved = intervalArgumentCollection.getIntervals(metadata.getSequenceDictionary()); + intersected = ListUtils.intersection( + resolved, + annotatedIntervals.getIntervals()); + } else if (inputAnnotatedIntervalsFile == null && !inputReadCountFiles.isEmpty()) { + //only counts provided + final File firstReadCountFile = inputReadCountFiles.get(0); + final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile); + metadata = firstReadCounts.getMetadata(); + resolved = intervalArgumentCollection.getIntervals(metadata.getSequenceDictionary()); + intersected = ListUtils.intersection( + resolved, + firstReadCounts.getIntervals()); } else { - //counts provided + //both annotated intervals and counts provided + annotatedIntervals = new AnnotatedIntervalCollection(inputAnnotatedIntervalsFile); final File firstReadCountFile = inputReadCountFiles.get(0); - specifiedIntervals = CopyNumberArgumentValidationUtils.resolveIntervals( - firstReadCountFile, intervalArgumentCollection, logger); - if (inputAnnotatedIntervalsFile != null) { - //both annotated intervals and counts provided - annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervalsSubset( - inputAnnotatedIntervalsFile, specifiedIntervals, logger); - } + final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile); + CopyNumberArgumentValidationUtils.isSameDictionary( + annotatedIntervals.getMetadata().getSequenceDictionary(), + firstReadCounts.getMetadata().getSequenceDictionary()); + metadata = annotatedIntervals.getMetadata(); + resolved = intervalArgumentCollection.getIntervals(metadata.getSequenceDictionary()); + intersected = ListUtils.intersection( + ListUtils.intersection( + resolved, + annotatedIntervals.getIntervals()), + firstReadCounts.getIntervals()); } + Utils.validateArg(!intersected.isEmpty(), "At least one interval must remain after intersection."); + logger.info(String.format("After interval resolution, %d intervals remain...", resolved.size())); + logger.info(String.format("After interval intersection, %d intervals remain...", intersected.size())); + intersectedIntervals = new SimpleIntervalCollection(metadata, intersected); } private SimpleIntervalCollection filterIntervals() { - final int numOriginalIntervals = specifiedIntervals.size(); - final boolean[] mask = new boolean[numOriginalIntervals]; //if true, filter out; each filter modifies this mask + final int numIntersectedIntervals = intersectedIntervals.size(); + final boolean[] mask = new boolean[numIntersectedIntervals]; //if true, filter out; each filter modifies this mask //apply annotation-based filters if (annotatedIntervals != null) { @@ -315,17 +337,17 @@ private SimpleIntervalCollection filterIntervals() { //for present annotations, apply corresponding filters final List > annotationKeys = annotatedIntervals.getRecords().get(0).getAnnotationMap().getKeys(); if (annotationKeys.contains(CopyNumberAnnotations.GC_CONTENT)) { //this should always be true, but we check it anyway - updateMaskByAnnotationFilter(logger, annotatedIntervals, mask, + updateMaskByAnnotationFilter(logger, intersectedIntervals, annotatedIntervals, mask, CopyNumberAnnotations.GC_CONTENT, "GC-content", minimumGCContent, maximumGCContent); } if (annotationKeys.contains(CopyNumberAnnotations.MAPPABILITY)) { - updateMaskByAnnotationFilter(logger, annotatedIntervals, mask, + updateMaskByAnnotationFilter(logger, intersectedIntervals, annotatedIntervals, mask, CopyNumberAnnotations.MAPPABILITY, "mappability", minimumMappability, maximumMappability); } if (annotationKeys.contains(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT)) { - updateMaskByAnnotationFilter(logger, annotatedIntervals, mask, + updateMaskByAnnotationFilter(logger, intersectedIntervals, annotatedIntervals, mask, CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, "segmental-duplication-content", minimumSegmentalDuplicationContent, maximumSegmentalDuplicationContent); } @@ -334,13 +356,13 @@ private SimpleIntervalCollection filterIntervals() { //apply count-based filters if (!inputReadCountFiles.isEmpty()) { //get the read-count matrix (samples x specified intervals) and validate intervals and sequence dictionaries - final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, specifiedIntervals); + final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, intersectedIntervals); final int numSamples = readCountMatrix.getRowDimension(); logger.info("Applying count-based filters..."); //low-count filter: filter out intervals with a count strictly less than lowCountFilterCountThreshold //for strictly greater than lowCountFilterPercentageOfSamples - IntStream.range(0, numOriginalIntervals) + IntStream.range(0, numIntersectedIntervals) .filter(i -> !mask[i]) .forEach(i -> { if (Arrays.stream(readCountMatrix.getColumn(i)) @@ -353,14 +375,14 @@ private SimpleIntervalCollection filterIntervals() { "(intervals with a count < %d in > %s%% of samples fail), " + "%d / %d intervals remain...", lowCountFilterCountThreshold, lowCountFilterPercentageOfSamples, - countNumberPassing(mask), numOriginalIntervals)); + countNumberPassing(mask), numIntersectedIntervals)); //extreme-count filter: filter out remaining intervals with counts that fall outside of the per-sample percentiles //[extremeCountMinimumPercentile, extremeCountMaximumPercentile] for strictly greater than extremeCountFilterPercentageOfSamples - final boolean[][] percentileMask = new boolean[numSamples][numOriginalIntervals]; + final boolean[][] percentileMask = new boolean[numSamples][numIntersectedIntervals]; for (int sampleIndex = 0; sampleIndex < numSamples; sampleIndex++) { final double[] counts = readCountMatrix.getRow(sampleIndex); - final double[] filteredCounts = IntStream.range(0, numOriginalIntervals) + final double[] filteredCounts = IntStream.range(0, numIntersectedIntervals) .filter(i -> !mask[i]) .mapToDouble(i -> counts[i]) .toArray(); @@ -370,14 +392,14 @@ private SimpleIntervalCollection filterIntervals() { final double extremeCountMaximumPercentileThreshold = extremeCountFilterMaximumPercentile == 0. ? 0. : new Percentile(extremeCountFilterMaximumPercentile).evaluate(filteredCounts); - for (int intervalIndex = 0; intervalIndex < numOriginalIntervals; intervalIndex++) { + for (int intervalIndex = 0; intervalIndex < numIntersectedIntervals; intervalIndex++) { final double count = readCountMatrix.getEntry(sampleIndex, intervalIndex); if (!(extremeCountMinimumPercentileThreshold <= count && count <= extremeCountMaximumPercentileThreshold)) { percentileMask[sampleIndex][intervalIndex] = true; } } } - IntStream.range(0, numOriginalIntervals) + IntStream.range(0, numIntersectedIntervals) .filter(i -> !mask[i]) .forEach(i -> { if (IntStream.range(0, numSamples) @@ -390,28 +412,29 @@ private SimpleIntervalCollection filterIntervals() { "(intervals with a count percentile outside of [%s, %s] in > %s%% of samples fail), " + "%d / %d intervals remain...", extremeCountFilterMinimumPercentile, extremeCountFilterMaximumPercentile, extremeCountFilterPercentageOfSamples, - countNumberPassing(mask), numOriginalIntervals)); + countNumberPassing(mask), numIntersectedIntervals)); } - logger.info(String.format("%d / %d intervals passed all filters...", countNumberPassing(mask), numOriginalIntervals)); + logger.info(String.format("%d / %d intervals passed all filters...", countNumberPassing(mask), numIntersectedIntervals)); //return the filtered intervals as a SimpleIntervalCollection return new SimpleIntervalCollection( - specifiedIntervals.getMetadata(), - IntStream.range(0, numOriginalIntervals) + intersectedIntervals.getMetadata(), + IntStream.range(0, numIntersectedIntervals) .filter(i -> !mask[i]) - .mapToObj(i -> specifiedIntervals.getRecords().get(i)) + .mapToObj(i -> intersectedIntervals.getRecords().get(i)) .collect(Collectors.toList())); } private static void updateMaskByAnnotationFilter(final Logger logger, + final SimpleIntervalCollection intersectedIntervals, final AnnotatedIntervalCollection annotatedIntervals, final boolean[] mask, final AnnotationKey annotationKey, final String filterName, final double minValue, final double maxValue) { - IntStream.range(0, annotatedIntervals.size()) + IntStream.range(0, intersectedIntervals.size()) .filter(i -> !mask[i]) .forEach(i -> { final double value = annotatedIntervals.getRecords().get(i).getAnnotationMap().getValue(annotationKey); @@ -419,17 +442,17 @@ private static void updateMaskByAnnotationFilter(final Logger logger, mask[i] = true; }}); logger.info(String.format("After applying %s filter (intervals with values outside of [%s, %s] fail), %d / %d intervals remain...", - filterName, minValue, maxValue, countNumberPassing(mask), annotatedIntervals.size())); + filterName, minValue, maxValue, countNumberPassing(mask), intersectedIntervals.size())); } private static RealMatrix constructReadCountMatrix(final Logger logger, final List inputReadCountFiles, - final SimpleIntervalCollection specifiedIntervals) { + final SimpleIntervalCollection intersectedIntervals) { logger.info("Validating and aggregating input read-counts files..."); final int numSamples = inputReadCountFiles.size(); - final int numIntervals = specifiedIntervals.size(); + final int numIntervals = intersectedIntervals.size(); //construct the interval subset to pull out from the read-count files - final Set intervalSubset = new HashSet<>(specifiedIntervals.getRecords()); + final Set intervalSubset = new HashSet<>(intersectedIntervals.getRecords()); final RealMatrix readCountMatrix = new Array2DRowRealMatrix(numSamples, numIntervals); final ListIterator inputReadCountFilesIterator = inputReadCountFiles.listIterator(); while (inputReadCountFilesIterator.hasNext()) { @@ -439,7 +462,7 @@ private static RealMatrix constructReadCountMatrix(final Logger logger, final SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile); if (!CopyNumberArgumentValidationUtils.isSameDictionary( readCounts.getMetadata().getSequenceDictionary(), - specifiedIntervals.getMetadata().getSequenceDictionary())) { + intersectedIntervals.getMetadata().getSequenceDictionary())) { logger.warn(String.format("Sequence dictionary for read-counts file %s is inconsistent with those for other inputs.", inputReadCountFile)); } final double[] subsetReadCounts = readCounts.getRecords().stream() diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java index 43b07733925..e34e52ae85d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java @@ -47,27 +47,27 @@ public final class FilterIntervalsIntegrationTest extends CommandLineProgramTest private static final AnnotatedIntervalCollection ANNOTATED_INTERVALS = new AnnotatedIntervalCollection( LOCATABLE_METADATA, Arrays.asList( - new AnnotatedInterval(new SimpleInterval("20", 1, 1), + new AnnotatedInterval(new SimpleInterval("20", 1, 10), new AnnotationMap(Arrays.asList( Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5), Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.05), Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.05)))), - new AnnotatedInterval(new SimpleInterval("20", 2, 2), + new AnnotatedInterval(new SimpleInterval("20", 11, 20), new AnnotationMap(Arrays.asList( Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5), Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5), Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.95)))), - new AnnotatedInterval(new SimpleInterval("20", 3, 3), + new AnnotatedInterval(new SimpleInterval("20", 21, 30), new AnnotationMap(Arrays.asList( Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5), Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5), Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))), - new AnnotatedInterval(new SimpleInterval("20", 4, 4), + new AnnotatedInterval(new SimpleInterval("20", 31, 40), new AnnotationMap(Arrays.asList( Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.05), Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5), Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))), - new AnnotatedInterval(new SimpleInterval("20", 5, 5), + new AnnotatedInterval(new SimpleInterval("20", 41, 50), new AnnotationMap(Arrays.asList( Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.95), Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.95), @@ -82,21 +82,43 @@ public Object[][] dataAnnotationBasedFilters() { ANNOTATED_INTERVALS.getIntervals().forEach(i -> intervals.add(new Interval(i))); intervals.write(intervalsFile); + //test that intersection gets rid of extra intervals in the interval list + final File intervalsWithExtraIntervalFile = createTempFile("intervals-with-extra-interval", ".interval_list"); + final IntervalList intervalsWithExtraInterval = new IntervalList(ANNOTATED_INTERVALS.getMetadata().getSequenceDictionary()); + ANNOTATED_INTERVALS.getIntervals().forEach(i -> intervalsWithExtraInterval.add(new Interval(i))); + intervalsWithExtraInterval.add(new Interval("20", 100000, 200000)); + intervalsWithExtraInterval.write(intervalsWithExtraIntervalFile); + return new Object[][]{ - //intervals file, annotated-intervals file, + //intervals file, array of strings for excluded intervals, annotated-intervals file, //min/max GC content, mix/max mappability, min/max seg-dupe content, expected array of indices of retained intervals - {intervalsFile, annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)}, - {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)}, - {intervalsFile, annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)}, - {intervalsFile, annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)}, - {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)}, - {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)}, - {intervalsFile, annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)}, - {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)}}; + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)}, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)}, + {intervalsFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)}, + {intervalsFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)}, + {intervalsFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)}, + {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)}, + {intervalsWithExtraIntervalFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)}, + {intervalsWithExtraIntervalFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)}, + {intervalsWithExtraIntervalFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)}}; } @Test(dataProvider = "dataAnnotationBasedFilters") public void testAnnotationBasedFilters(final File intervalsFile, + final List excludedIntervals, final File annotatedIntervalsFile, final double minimumGCContent, final double maximumGCContent, @@ -117,6 +139,7 @@ public void testAnnotationBasedFilters(final File intervalsFile, .addArgument(FilterIntervals.MAXIMUM_SEGMENTAL_DUPLICATION_CONTENT_LONG_NAME, Double.toString(maximumSegmentalDuplicationContent)) .addArgument(IntervalArgumentCollection.INTERVAL_MERGING_RULE_LONG_NAME, IntervalMergingRule.OVERLAPPING_ONLY.toString()) .addOutput(outputFile); + excludedIntervals.forEach(i -> argsBuilder.addArgument(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, i)); runCommandLine(argsBuilder); final IntervalList result = IntervalList.fromFile(outputFile); final IntervalList all = IntervalList.fromFile(intervalsFile); @@ -138,13 +161,14 @@ public Object[][] dataCountBasedFilters() { final double percentageOfSamples = 100. * (numSamples - numUncorruptedSamples) / numSamples - epsilon; final File intervalsFile = createTempFile("intervals", ".interval_list"); + final File intervalsWithExtraIntervalFile = createTempFile("intervals-with-extra-interval", ".interval_list"); final List countFiles = new ArrayList<>(numSamples); for (int sampleIndex = 0; sampleIndex < numSamples; sampleIndex++) { final String sampleName = String.format("sample-%d", sampleIndex); final SampleLocatableMetadata sampleMetadata = new SimpleSampleLocatableMetadata(sampleName, SEQUENCE_DICTIONARY); final List counts = new ArrayList<>(numIntervals); for (int intervalIndex = 0; intervalIndex < numIntervals; intervalIndex++) { - final SimpleInterval interval = new SimpleInterval("20", intervalIndex + 1, intervalIndex + 1); + final SimpleInterval interval = new SimpleInterval("20", 10 * intervalIndex + 1, 10 * (intervalIndex + 1)); if (intervalIndex < numIntervalsBelowCountThreshold && sampleIndex >= numUncorruptedSamples) { //corrupt first numIntervalsBelowCountThreshold intervals in last (numSamples - numUncorruptedSamples) samples with low counts counts.add(new SimpleCount(interval, lowCountFilterCountThreshold - 1)); @@ -161,6 +185,12 @@ public Object[][] dataCountBasedFilters() { final IntervalList intervals = new IntervalList(sampleCounts.getMetadata().getSequenceDictionary()); sampleCounts.getIntervals().forEach(i -> intervals.add(new Interval(i))); intervals.write(intervalsFile); + + //test that intersection gets rid of extra intervals in the interval list + final IntervalList intervalsWithExtraInterval = new IntervalList(sampleCounts.getMetadata().getSequenceDictionary()); + sampleCounts.getIntervals().forEach(i -> intervalsWithExtraInterval.add(new Interval(i))); + intervalsWithExtraInterval.add(new Interval("20", 100000, 200000)); + intervalsWithExtraInterval.write(intervalsWithExtraIntervalFile); } } @@ -171,6 +201,10 @@ public Object[][] dataCountBasedFilters() { {intervalsFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 99., percentageOfSamples, IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 99).boxed().collect(Collectors.toList())}, {intervalsFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples, + IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())}, + {intervalsWithExtraIntervalFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 99., percentageOfSamples, + IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 99).boxed().collect(Collectors.toList())}, + {intervalsWithExtraIntervalFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples, IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())}}; } @@ -223,7 +257,7 @@ public Object[][] dataAllFilters() { final SampleLocatableMetadata sampleMetadata = new SimpleSampleLocatableMetadata(sampleName, SEQUENCE_DICTIONARY); final List counts = new ArrayList<>(numIntervals); for (int intervalIndex = 0; intervalIndex < numIntervals; intervalIndex++) { - final SimpleInterval interval = new SimpleInterval("20", intervalIndex + 1, intervalIndex + 1); + final SimpleInterval interval = new SimpleInterval("20", 10 * intervalIndex + 1, 10 * (intervalIndex + 1)); if (intervalIndex < numIntervalsBelowCountThreshold && sampleIndex >= numUncorruptedSamples) { //corrupt first numIntervalsBelowCountThreshold intervals in last (numSamples - numUncorruptedSamples) samples with low counts counts.add(new SimpleCount(interval, lowCountFilterCountThreshold - 1)); @@ -262,20 +296,25 @@ public Object[][] dataAllFilters() { } return new Object[][]{ - //intervals file, annotated-intervals file, min/max GC content, mix/max mappability, min/max seg-dupe content, + //intervals file, array of strings for excluded intervals, annotated-intervals file, min/max GC content, mix/max mappability, min/max seg-dupe content, //count files, lowCountFilterCountThreshold, lowCountFilterPercentageOfSamples, //extremeCountFilterMinimumPercentile, extremeCountFilterMaximumPercentile, extremeCountFilterPercentageOfSamples, //expected array of indices of retained intervals - {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 99., percentageOfSamples, IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 99).boxed().collect(Collectors.toList())}, - {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, + {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, + countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples, + IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())}, + //allow last numIntervalsFailingAnnotationFilters to pass annotation-based filters but exclude using -XL + {intervalsFile, Collections.singletonList(String.format("20:%d-%d", (numIntervals - numIntervalsFailingAnnotationFilters) * 10 + 5, numIntervals * 10)), annotatedIntervalsFile, 0, 1, 0, 1, 0, 1, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples, IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())}}; } @Test(dataProvider = "dataAllFilters") public void testAllFilters(final File intervalsFile, + final List excludedIntervals, final File annotatedIntervalsFile, final double minimumGCContent, final double maximumGCContent, @@ -307,6 +346,7 @@ public void testAllFilters(final File intervalsFile, .addArgument(FilterIntervals.EXTREME_COUNT_FILTER_PERCENTAGE_OF_SAMPLES_LONG_NAME, Double.toString(extremeCountFilterPercentageOfSamples)) .addArgument(IntervalArgumentCollection.INTERVAL_MERGING_RULE_LONG_NAME, IntervalMergingRule.OVERLAPPING_ONLY.toString()) .addOutput(outputFile); + excludedIntervals.forEach(i -> argsBuilder.addArgument(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, i)); countFiles.forEach(argsBuilder::addInput); runCommandLine(argsBuilder); final IntervalList result = IntervalList.fromFile(outputFile);