diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java index 7a232e1cb01..4bd2971d872 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervals.java @@ -2,6 +2,7 @@ import htsjdk.samtools.util.Interval; import htsjdk.samtools.util.IntervalList; +import org.apache.commons.collections4.ListUtils; import org.apache.commons.math3.linear.Array2DRowRealMatrix; import org.apache.commons.math3.linear.RealMatrix; import org.apache.commons.math3.stat.descriptive.rank.Percentile; @@ -14,7 +15,7 @@ import org.broadinstitute.hellbender.cmdline.CommandLineProgram; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection; -import org.broadinstitute.hellbender.cmdline.argumentcollections.OptionalIntervalArgumentCollection; +import org.broadinstitute.hellbender.cmdline.argumentcollections.RequiredIntervalArgumentCollection; import org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup; import org.broadinstitute.hellbender.exceptions.UserException; import org.broadinstitute.hellbender.tools.copynumber.arguments.CopyNumberArgumentValidationUtils; @@ -22,6 +23,7 @@ import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection; import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleIntervalCollection; +import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata; import org.broadinstitute.hellbender.tools.copynumber.formats.records.SimpleCount; import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey; import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations; @@ -36,31 +38,33 @@ import java.util.stream.IntStream; /** - * Given annotated intervals output by {@link AnnotateIntervals} and/or counts collected on those intervals output - * by {@link CollectReadCounts}, outputs a filtered Picard interval list. Parameters for filtering based on the - * annotations and counts can be adjusted. Annotation-based filters will be applied first, followed by count-based - * filters. The result may be passed via -L to other tools (e.g., {@link DetermineGermlineContigPloidy} and - * {@link GermlineCNVCaller}) to mask intervals from analysis. + * Given specified intervals, annotated intervals output by {@link AnnotateIntervals}, and/or counts output by + * {@link CollectReadCounts}, outputs a filtered Picard interval list. The set intersection of intervals from the + * specified intervals, the annotated intervals, and the first count file will be taken as the initial set of intervals + * on which to perform filtering. Parameters for filtering based on the annotations and counts can be adjusted. + * Annotation-based filters will be applied first, followed by count-based filters. The result may be passed via -L to + * other tools (e.g., {@link DetermineGermlineContigPloidy} and {@link GermlineCNVCaller}) to mask intervals from + * analysis. * *

Inputs

* * * @@ -76,7 +80,7 @@ * *
  *     gatk FilterIntervals \
- *          -L intervals.interval_list \
+ *          -L preprocessed_intervals.interval_list \
  *          -XL blacklist_intervals.interval_list \
  *          -I sample_1.counts.hdf5 \
  *          -I sample_2.counts.hdf5 \
@@ -87,14 +91,14 @@
  *
  * 
  *     gatk FilterIntervals \
- *          -L intervals.interval_list \
+ *          -L preprocessed_intervals.interval_list \
  *          --annotated-intervals annotated_intervals.tsv \
  *          -O filtered_intervals.interval_list
  * 
* *
  *     gatk FilterIntervals \
- *          -L intervals.interval_list \
+ *          -L preprocessed_intervals.interval_list \
  *          -I sample_1.counts.hdf5 \
  *          -I sample_2.counts.hdf5 \
  *          ... \
@@ -125,7 +129,6 @@ public final class FilterIntervals extends CommandLineProgram {
 
     @Argument(
             doc = "Input file containing annotations for genomic intervals (output of AnnotateIntervals).  " +
-                    "All intervals specified via -L must be contained.  " +
                     "Must be provided if no counts files are provided.",
             fullName = CopyNumberStandardArgument.ANNOTATED_INTERVALS_FILE_LONG_NAME,
             optional = true
@@ -134,7 +137,6 @@ public final class FilterIntervals extends CommandLineProgram {
 
     @Argument(
             doc = "Input TSV or HDF5 files containing integer read counts in genomic intervals (output of CollectReadCounts).  " +
-                    "All intervals specified via -L must be contained.  " +
                     "Must be provided if no annotated-intervals file is provided.",
             fullName = StandardArgumentDefinitions.INPUT_LONG_NAME,
             shortName = StandardArgumentDefinitions.INPUT_SHORT_NAME,
@@ -151,7 +153,7 @@ public final class FilterIntervals extends CommandLineProgram {
 
     @ArgumentCollection
     protected IntervalArgumentCollection intervalArgumentCollection
-            = new OptionalIntervalArgumentCollection();
+            = new RequiredIntervalArgumentCollection();
 
     @Argument(
             doc = "Minimum allowed value for GC-content annotation (inclusive).",
@@ -254,7 +256,7 @@ public final class FilterIntervals extends CommandLineProgram {
     )
     private double extremeCountFilterPercentageOfSamples = 90.;
 
-    private SimpleIntervalCollection specifiedIntervals;
+    private SimpleIntervalCollection intersectedIntervals;
     private AnnotatedIntervalCollection annotatedIntervals;
 
     @Override
@@ -281,33 +283,53 @@ private void validateFilesAndResolveIntervals() {
         inputReadCountFiles.forEach(IOUtils::canReadFile);
         Utils.validateArg(inputReadCountFiles.size() == new HashSet<>(inputReadCountFiles).size(),
                 "List of input read-count files cannot contain duplicates.");
-
-        //parse inputs to resolve intervals and validate annotated intervals, if provided
-        if (inputReadCountFiles.isEmpty()) {
-            //only annotated intervals provided (no counts)
+        
+        //parse inputs to resolve and intersect intervals
+        final LocatableMetadata metadata;
+        final List resolved;
+        final List intersected;
+        if (inputAnnotatedIntervalsFile != null && inputReadCountFiles.isEmpty()) {
+            //only annotated intervals provided
             annotatedIntervals = new AnnotatedIntervalCollection(inputAnnotatedIntervalsFile);
-            specifiedIntervals = new SimpleIntervalCollection(
-                    annotatedIntervals.getMetadata(),
-                    intervalArgumentCollection.getIntervals(annotatedIntervals.getMetadata().getSequenceDictionary()));
-            Utils.validateArg(specifiedIntervals.size() != 0, "At least one interval must be specified.");
-            Utils.validateArg(new HashSet<>(annotatedIntervals.getIntervals()).containsAll(specifiedIntervals.getIntervals()),
-                    "Annotated intervals do not contain all specified intervals.");
+            metadata = annotatedIntervals.getMetadata();
+            resolved = intervalArgumentCollection.getIntervals(metadata.getSequenceDictionary());
+            intersected = ListUtils.intersection(
+                    resolved,
+                    annotatedIntervals.getIntervals());
+        } else if (inputAnnotatedIntervalsFile == null && !inputReadCountFiles.isEmpty()) {
+            //only counts provided
+            final File firstReadCountFile = inputReadCountFiles.get(0);
+            final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile);
+            metadata = firstReadCounts.getMetadata();
+            resolved = intervalArgumentCollection.getIntervals(metadata.getSequenceDictionary());
+            intersected = ListUtils.intersection(
+                    resolved,
+                    firstReadCounts.getIntervals());
         } else {
-            //counts provided
+            //both annotated intervals and counts provided
+            annotatedIntervals = new AnnotatedIntervalCollection(inputAnnotatedIntervalsFile);
             final File firstReadCountFile = inputReadCountFiles.get(0);
-            specifiedIntervals = CopyNumberArgumentValidationUtils.resolveIntervals(
-                    firstReadCountFile, intervalArgumentCollection, logger);
-            if (inputAnnotatedIntervalsFile != null) {
-                //both annotated intervals and counts provided
-                annotatedIntervals = CopyNumberArgumentValidationUtils.validateAnnotatedIntervalsSubset(
-                        inputAnnotatedIntervalsFile, specifiedIntervals, logger);
-            }
+            final SimpleCountCollection firstReadCounts = SimpleCountCollection.read(firstReadCountFile);
+            CopyNumberArgumentValidationUtils.isSameDictionary(
+                    annotatedIntervals.getMetadata().getSequenceDictionary(),
+                    firstReadCounts.getMetadata().getSequenceDictionary());
+            metadata = annotatedIntervals.getMetadata();
+            resolved = intervalArgumentCollection.getIntervals(metadata.getSequenceDictionary());
+            intersected = ListUtils.intersection(
+                    ListUtils.intersection(
+                            resolved,
+                            annotatedIntervals.getIntervals()),
+                    firstReadCounts.getIntervals());
         }
+        Utils.validateArg(!intersected.isEmpty(), "At least one interval must remain after intersection.");
+        logger.info(String.format("After interval resolution, %d intervals remain...", resolved.size()));
+        logger.info(String.format("After interval intersection, %d intervals remain...", intersected.size()));
+        intersectedIntervals = new SimpleIntervalCollection(metadata, intersected);
     }
 
     private SimpleIntervalCollection filterIntervals() {
-        final int numOriginalIntervals = specifiedIntervals.size();
-        final boolean[] mask = new boolean[numOriginalIntervals];     //if true, filter out; each filter modifies this mask
+        final int numIntersectedIntervals = intersectedIntervals.size();
+        final boolean[] mask = new boolean[numIntersectedIntervals];     //if true, filter out; each filter modifies this mask
 
         //apply annotation-based filters
         if (annotatedIntervals != null) {
@@ -315,17 +337,17 @@ private SimpleIntervalCollection filterIntervals() {
             //for present annotations, apply corresponding filters
             final List> annotationKeys = annotatedIntervals.getRecords().get(0).getAnnotationMap().getKeys();
             if (annotationKeys.contains(CopyNumberAnnotations.GC_CONTENT)) {    //this should always be true, but we check it anyway
-                updateMaskByAnnotationFilter(logger, annotatedIntervals, mask,
+                updateMaskByAnnotationFilter(logger, intersectedIntervals, annotatedIntervals, mask,
                         CopyNumberAnnotations.GC_CONTENT, "GC-content",
                         minimumGCContent, maximumGCContent);
             }
             if (annotationKeys.contains(CopyNumberAnnotations.MAPPABILITY)) {
-                updateMaskByAnnotationFilter(logger, annotatedIntervals, mask,
+                updateMaskByAnnotationFilter(logger, intersectedIntervals, annotatedIntervals, mask,
                         CopyNumberAnnotations.MAPPABILITY, "mappability",
                         minimumMappability, maximumMappability);
             }
             if (annotationKeys.contains(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT)) {
-                updateMaskByAnnotationFilter(logger, annotatedIntervals, mask,
+                updateMaskByAnnotationFilter(logger, intersectedIntervals, annotatedIntervals, mask,
                         CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, "segmental-duplication-content",
                         minimumSegmentalDuplicationContent, maximumSegmentalDuplicationContent);
             }
@@ -334,13 +356,13 @@ private SimpleIntervalCollection filterIntervals() {
         //apply count-based filters
         if (!inputReadCountFiles.isEmpty()) {
             //get the read-count matrix (samples x specified intervals) and validate intervals and sequence dictionaries
-            final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, specifiedIntervals);
+            final RealMatrix readCountMatrix = constructReadCountMatrix(logger, inputReadCountFiles, intersectedIntervals);
             final int numSamples = readCountMatrix.getRowDimension();
             logger.info("Applying count-based filters...");
 
             //low-count filter: filter out intervals with a count strictly less than lowCountFilterCountThreshold
             //for strictly greater than lowCountFilterPercentageOfSamples
-            IntStream.range(0, numOriginalIntervals)
+            IntStream.range(0, numIntersectedIntervals)
                     .filter(i -> !mask[i])
                     .forEach(i -> {
                         if (Arrays.stream(readCountMatrix.getColumn(i))
@@ -353,14 +375,14 @@ private SimpleIntervalCollection filterIntervals() {
                             "(intervals with a count < %d in > %s%% of samples fail), " +
                             "%d / %d intervals remain...",
                     lowCountFilterCountThreshold, lowCountFilterPercentageOfSamples,
-                    countNumberPassing(mask), numOriginalIntervals));
+                    countNumberPassing(mask), numIntersectedIntervals));
 
             //extreme-count filter: filter out remaining intervals with counts that fall outside of the per-sample percentiles
             //[extremeCountMinimumPercentile, extremeCountMaximumPercentile] for strictly greater than extremeCountFilterPercentageOfSamples
-            final boolean[][] percentileMask = new boolean[numSamples][numOriginalIntervals];
+            final boolean[][] percentileMask = new boolean[numSamples][numIntersectedIntervals];
             for (int sampleIndex = 0; sampleIndex < numSamples; sampleIndex++) {
                 final double[] counts = readCountMatrix.getRow(sampleIndex);
-                final double[] filteredCounts = IntStream.range(0, numOriginalIntervals)
+                final double[] filteredCounts = IntStream.range(0, numIntersectedIntervals)
                         .filter(i -> !mask[i])
                         .mapToDouble(i -> counts[i])
                         .toArray();
@@ -370,14 +392,14 @@ private SimpleIntervalCollection filterIntervals() {
                 final double extremeCountMaximumPercentileThreshold = extremeCountFilterMaximumPercentile == 0.
                         ? 0.
                         : new Percentile(extremeCountFilterMaximumPercentile).evaluate(filteredCounts);
-                for (int intervalIndex = 0; intervalIndex < numOriginalIntervals; intervalIndex++) {
+                for (int intervalIndex = 0; intervalIndex < numIntersectedIntervals; intervalIndex++) {
                     final double count = readCountMatrix.getEntry(sampleIndex, intervalIndex);
                     if (!(extremeCountMinimumPercentileThreshold <= count && count <= extremeCountMaximumPercentileThreshold)) {
                         percentileMask[sampleIndex][intervalIndex] = true;
                     }
                 }
             }
-            IntStream.range(0, numOriginalIntervals)
+            IntStream.range(0, numIntersectedIntervals)
                     .filter(i -> !mask[i])
                     .forEach(i -> {
                         if (IntStream.range(0, numSamples)
@@ -390,28 +412,29 @@ private SimpleIntervalCollection filterIntervals() {
                             "(intervals with a count percentile outside of [%s, %s] in > %s%% of samples fail), " +
                             "%d / %d intervals remain...",
                     extremeCountFilterMinimumPercentile, extremeCountFilterMaximumPercentile, extremeCountFilterPercentageOfSamples,
-                    countNumberPassing(mask), numOriginalIntervals));
+                    countNumberPassing(mask), numIntersectedIntervals));
         }
 
-        logger.info(String.format("%d / %d intervals passed all filters...", countNumberPassing(mask), numOriginalIntervals));
+        logger.info(String.format("%d / %d intervals passed all filters...", countNumberPassing(mask), numIntersectedIntervals));
 
         //return the filtered intervals as a SimpleIntervalCollection
         return new SimpleIntervalCollection(
-                specifiedIntervals.getMetadata(),
-                IntStream.range(0, numOriginalIntervals)
+                intersectedIntervals.getMetadata(),
+                IntStream.range(0, numIntersectedIntervals)
                         .filter(i -> !mask[i])
-                        .mapToObj(i -> specifiedIntervals.getRecords().get(i))
+                        .mapToObj(i -> intersectedIntervals.getRecords().get(i))
                         .collect(Collectors.toList()));
     }
 
     private static void updateMaskByAnnotationFilter(final Logger logger,
+                                                     final SimpleIntervalCollection intersectedIntervals,
                                                      final AnnotatedIntervalCollection annotatedIntervals,
                                                      final boolean[] mask,
                                                      final AnnotationKey annotationKey,
                                                      final String filterName,
                                                      final double minValue,
                                                      final double maxValue) {
-        IntStream.range(0, annotatedIntervals.size())
+        IntStream.range(0, intersectedIntervals.size())
                 .filter(i -> !mask[i])
                 .forEach(i -> {
                     final double value = annotatedIntervals.getRecords().get(i).getAnnotationMap().getValue(annotationKey);
@@ -419,17 +442,17 @@ private static void updateMaskByAnnotationFilter(final Logger logger,
                         mask[i] = true;
                     }});
         logger.info(String.format("After applying %s filter (intervals with values outside of [%s, %s] fail), %d / %d intervals remain...",
-                filterName, minValue, maxValue, countNumberPassing(mask), annotatedIntervals.size()));
+                filterName, minValue, maxValue, countNumberPassing(mask), intersectedIntervals.size()));
     }
 
     private static RealMatrix constructReadCountMatrix(final Logger logger,
                                                        final List inputReadCountFiles,
-                                                       final SimpleIntervalCollection specifiedIntervals) {
+                                                       final SimpleIntervalCollection intersectedIntervals) {
         logger.info("Validating and aggregating input read-counts files...");
         final int numSamples = inputReadCountFiles.size();
-        final int numIntervals = specifiedIntervals.size();
+        final int numIntervals = intersectedIntervals.size();
         //construct the interval subset to pull out from the read-count files
-        final Set intervalSubset = new HashSet<>(specifiedIntervals.getRecords());
+        final Set intervalSubset = new HashSet<>(intersectedIntervals.getRecords());
         final RealMatrix readCountMatrix = new Array2DRowRealMatrix(numSamples, numIntervals);
         final ListIterator inputReadCountFilesIterator = inputReadCountFiles.listIterator();
         while (inputReadCountFilesIterator.hasNext()) {
@@ -439,7 +462,7 @@ private static RealMatrix constructReadCountMatrix(final Logger logger,
             final SimpleCountCollection readCounts = SimpleCountCollection.read(inputReadCountFile);
             if (!CopyNumberArgumentValidationUtils.isSameDictionary(
                     readCounts.getMetadata().getSequenceDictionary(),
-                    specifiedIntervals.getMetadata().getSequenceDictionary())) {
+                    intersectedIntervals.getMetadata().getSequenceDictionary())) {
                 logger.warn(String.format("Sequence dictionary for read-counts file %s is inconsistent with those for other inputs.", inputReadCountFile));
             }
             final double[] subsetReadCounts = readCounts.getRecords().stream()
diff --git a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java
index 43b07733925..e34e52ae85d 100644
--- a/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java
+++ b/src/test/java/org/broadinstitute/hellbender/tools/copynumber/FilterIntervalsIntegrationTest.java
@@ -47,27 +47,27 @@ public final class FilterIntervalsIntegrationTest extends CommandLineProgramTest
     private static final AnnotatedIntervalCollection ANNOTATED_INTERVALS = new AnnotatedIntervalCollection(
             LOCATABLE_METADATA,
             Arrays.asList(
-                    new AnnotatedInterval(new SimpleInterval("20", 1,	1),
+                    new AnnotatedInterval(new SimpleInterval("20", 1,	10),
                             new AnnotationMap(Arrays.asList(
                                     Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
                                     Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.05),
                                     Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.05)))),
-                    new AnnotatedInterval(new SimpleInterval("20", 2,	2),
+                    new AnnotatedInterval(new SimpleInterval("20", 11,	20),
                             new AnnotationMap(Arrays.asList(
                                     Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
                                     Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
                                     Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.95)))),
-                    new AnnotatedInterval(new SimpleInterval("20", 3,	3),
+                    new AnnotatedInterval(new SimpleInterval("20", 21,	30),
                             new AnnotationMap(Arrays.asList(
                                     Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.5),
                                     Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
                                     Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
-                    new AnnotatedInterval(new SimpleInterval("20", 4,	4),
+                    new AnnotatedInterval(new SimpleInterval("20", 31,	40),
                             new AnnotationMap(Arrays.asList(
                                     Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.05),
                                     Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.5),
                                     Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.5)))),
-                    new AnnotatedInterval(new SimpleInterval("20", 5,	5),
+                    new AnnotatedInterval(new SimpleInterval("20", 41,	50),
                             new AnnotationMap(Arrays.asList(
                                     Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.95),
                                     Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.95),
@@ -82,21 +82,43 @@ public Object[][] dataAnnotationBasedFilters() {
         ANNOTATED_INTERVALS.getIntervals().forEach(i -> intervals.add(new Interval(i)));
         intervals.write(intervalsFile);
 
+        //test that intersection gets rid of extra intervals in the interval list
+        final File intervalsWithExtraIntervalFile = createTempFile("intervals-with-extra-interval", ".interval_list");
+        final IntervalList intervalsWithExtraInterval = new IntervalList(ANNOTATED_INTERVALS.getMetadata().getSequenceDictionary());
+        ANNOTATED_INTERVALS.getIntervals().forEach(i -> intervalsWithExtraInterval.add(new Interval(i)));
+        intervalsWithExtraInterval.add(new Interval("20", 100000, 200000));
+        intervalsWithExtraInterval.write(intervalsWithExtraIntervalFile);
+
         return new Object[][]{
-                //intervals file, annotated-intervals file,
+                //intervals file, array of strings for excluded intervals, annotated-intervals file,
                 //min/max GC content, mix/max mappability, min/max seg-dupe content, expected array of indices of retained intervals
-                {intervalsFile, annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
-                {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
-                {intervalsFile, annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
-                {intervalsFile, annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
-                {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
-                {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
-                {intervalsFile, annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
-                {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)}};
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)},
+                {intervalsFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)},
+                {intervalsFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)},
+                {intervalsFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(0, 1, 2, 3, 4)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1, 2)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0., 1., Arrays.asList(1, 2, 3)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0., 1., 0.1, 0.9, Arrays.asList(2, 3, 4)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0., 1., Arrays.asList(1, 2)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0.1, 0.9, Collections.singletonList(2)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0., 1., 0.1, 0.9, 0.1, 0.9, Arrays.asList(2, 3)},
+                {intervalsWithExtraIntervalFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9, Collections.singletonList(2)},
+                {intervalsWithExtraIntervalFile, Collections.singletonList("20:1-10"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Arrays.asList(1, 2, 3, 4)},
+                {intervalsWithExtraIntervalFile, Arrays.asList("20:1-15", "20:35-45"), annotatedIntervalsFile, 0., 1., 0., 1., 0., 1., Collections.singletonList(2)},
+                {intervalsWithExtraIntervalFile, Collections.singletonList("20:25-50"), annotatedIntervalsFile, 0.1, 0.9, 0., 1., 0., 1., Arrays.asList(0, 1)}};
     }
 
     @Test(dataProvider = "dataAnnotationBasedFilters")
     public void testAnnotationBasedFilters(final File intervalsFile,
+                                           final List excludedIntervals,
                                            final File annotatedIntervalsFile,
                                            final double minimumGCContent,
                                            final double maximumGCContent,
@@ -117,6 +139,7 @@ public void testAnnotationBasedFilters(final File intervalsFile,
                 .addArgument(FilterIntervals.MAXIMUM_SEGMENTAL_DUPLICATION_CONTENT_LONG_NAME, Double.toString(maximumSegmentalDuplicationContent))
                 .addArgument(IntervalArgumentCollection.INTERVAL_MERGING_RULE_LONG_NAME, IntervalMergingRule.OVERLAPPING_ONLY.toString())
                 .addOutput(outputFile);
+        excludedIntervals.forEach(i -> argsBuilder.addArgument(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, i));
         runCommandLine(argsBuilder);
         final IntervalList result = IntervalList.fromFile(outputFile);
         final IntervalList all = IntervalList.fromFile(intervalsFile);
@@ -138,13 +161,14 @@ public Object[][] dataCountBasedFilters() {
         final double percentageOfSamples = 100. * (numSamples - numUncorruptedSamples) / numSamples - epsilon;
 
         final File intervalsFile = createTempFile("intervals", ".interval_list");
+        final File intervalsWithExtraIntervalFile = createTempFile("intervals-with-extra-interval", ".interval_list");
         final List countFiles = new ArrayList<>(numSamples);
         for (int sampleIndex = 0; sampleIndex < numSamples; sampleIndex++) {
             final String sampleName = String.format("sample-%d", sampleIndex);
             final SampleLocatableMetadata sampleMetadata = new SimpleSampleLocatableMetadata(sampleName, SEQUENCE_DICTIONARY);
             final List counts = new ArrayList<>(numIntervals);
             for (int intervalIndex = 0; intervalIndex < numIntervals; intervalIndex++) {
-                final SimpleInterval interval = new SimpleInterval("20", intervalIndex + 1, intervalIndex + 1);
+                final SimpleInterval interval = new SimpleInterval("20", 10 * intervalIndex + 1, 10 * (intervalIndex + 1));
                 if (intervalIndex < numIntervalsBelowCountThreshold && sampleIndex >= numUncorruptedSamples) {
                     //corrupt first numIntervalsBelowCountThreshold intervals in last (numSamples - numUncorruptedSamples) samples with low counts
                     counts.add(new SimpleCount(interval, lowCountFilterCountThreshold - 1));
@@ -161,6 +185,12 @@ public Object[][] dataCountBasedFilters() {
                 final IntervalList intervals = new IntervalList(sampleCounts.getMetadata().getSequenceDictionary());
                 sampleCounts.getIntervals().forEach(i -> intervals.add(new Interval(i)));
                 intervals.write(intervalsFile);
+
+                //test that intersection gets rid of extra intervals in the interval list
+                final IntervalList intervalsWithExtraInterval = new IntervalList(sampleCounts.getMetadata().getSequenceDictionary());
+                sampleCounts.getIntervals().forEach(i -> intervalsWithExtraInterval.add(new Interval(i)));
+                intervalsWithExtraInterval.add(new Interval("20", 100000, 200000));
+                intervalsWithExtraInterval.write(intervalsWithExtraIntervalFile);
             }
         }
 
@@ -171,6 +201,10 @@ public Object[][] dataCountBasedFilters() {
                 {intervalsFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 99., percentageOfSamples,
                         IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 99).boxed().collect(Collectors.toList())},
                 {intervalsFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples,
+                        IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())},
+                {intervalsWithExtraIntervalFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 99., percentageOfSamples,
+                        IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 99).boxed().collect(Collectors.toList())},
+                {intervalsWithExtraIntervalFile, countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples,
                         IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())}};
     }
 
@@ -223,7 +257,7 @@ public Object[][] dataAllFilters() {
             final SampleLocatableMetadata sampleMetadata = new SimpleSampleLocatableMetadata(sampleName, SEQUENCE_DICTIONARY);
             final List counts = new ArrayList<>(numIntervals);
             for (int intervalIndex = 0; intervalIndex < numIntervals; intervalIndex++) {
-                final SimpleInterval interval = new SimpleInterval("20", intervalIndex + 1, intervalIndex + 1);
+                final SimpleInterval interval = new SimpleInterval("20", 10 * intervalIndex + 1, 10 * (intervalIndex + 1));
                 if (intervalIndex < numIntervalsBelowCountThreshold && sampleIndex >= numUncorruptedSamples) {
                     //corrupt first numIntervalsBelowCountThreshold intervals in last (numSamples - numUncorruptedSamples) samples with low counts
                     counts.add(new SimpleCount(interval, lowCountFilterCountThreshold - 1));
@@ -262,20 +296,25 @@ public Object[][] dataAllFilters() {
         }
 
         return new Object[][]{
-                //intervals file, annotated-intervals file, min/max GC content, mix/max mappability, min/max seg-dupe content,
+                //intervals file, array of strings for excluded intervals, annotated-intervals file, min/max GC content, mix/max mappability, min/max seg-dupe content,
                 //count files, lowCountFilterCountThreshold, lowCountFilterPercentageOfSamples,
                 //extremeCountFilterMinimumPercentile, extremeCountFilterMaximumPercentile, extremeCountFilterPercentageOfSamples,
                 //expected array of indices of retained intervals
-                {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9,
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9,
                         countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 99., percentageOfSamples,
                         IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 99).boxed().collect(Collectors.toList())},
-                {intervalsFile, annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9,
+                {intervalsFile, Collections.emptyList(), annotatedIntervalsFile, 0.1, 0.9, 0.1, 0.9, 0.1, 0.9,
+                        countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples,
+                        IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())},
+                //allow last numIntervalsFailingAnnotationFilters to pass annotation-based filters but exclude using -XL
+                {intervalsFile, Collections.singletonList(String.format("20:%d-%d", (numIntervals - numIntervalsFailingAnnotationFilters) * 10 + 5, numIntervals * 10)), annotatedIntervalsFile, 0, 1, 0, 1, 0, 1,
                         countFiles, lowCountFilterCountThreshold, percentageOfSamples, 1., 90., percentageOfSamples,
                         IntStream.range(numIntervalsBelowCountThreshold + 1, numIntervalsBelowCountThreshold + 90).boxed().collect(Collectors.toList())}};
     }
 
     @Test(dataProvider = "dataAllFilters")
     public void testAllFilters(final File intervalsFile,
+                               final List excludedIntervals,
                                final File annotatedIntervalsFile,
                                final double minimumGCContent,
                                final double maximumGCContent,
@@ -307,6 +346,7 @@ public void testAllFilters(final File intervalsFile,
                 .addArgument(FilterIntervals.EXTREME_COUNT_FILTER_PERCENTAGE_OF_SAMPLES_LONG_NAME, Double.toString(extremeCountFilterPercentageOfSamples))
                 .addArgument(IntervalArgumentCollection.INTERVAL_MERGING_RULE_LONG_NAME, IntervalMergingRule.OVERLAPPING_ONLY.toString())
                 .addOutput(outputFile);
+        excludedIntervals.forEach(i -> argsBuilder.addArgument(IntervalArgumentCollection.EXCLUDE_INTERVALS_LONG_NAME, i));
         countFiles.forEach(argsBuilder::addInput);
         runCommandLine(argsBuilder);
         final IntervalList result = IntervalList.fromFile(outputFile);