Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added optional mappability and segmental-duplication annotation to AnnotateIntervals. #5162

Merged
merged 4 commits into from
Sep 13, 2018
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import htsjdk.samtools.util.CoordMath;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.bed.BEDFeature;
import org.apache.commons.collections4.IteratorUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.BetaFeature;
Expand Down Expand Up @@ -49,15 +50,15 @@
* This is a BED file in .bed or .bed.gz format that identifies uniquely mappable regions of the genome.
* The track should correspond to the appropriate read length and overlapping intervals must be merged.
* See <a href ="https://bismap.hoffmanlab.org/">https://bismap.hoffmanlab.org/</a>. If scores are provided,
* intervals will be annotated with the length-weighted average; scores may not be NaN. Otherwise, scores
* for covered and uncovered intervals will be taken as unity and zero, respectively.
* intervals will be annotated with the length-weighted average; note that NaN scores will be taken as unity.
* Otherwise, scores for covered and uncovered intervals will be taken as unity and zero, respectively.
* </li>
* <li>
* (Optional) Segmental-duplication track.
* This is a BED file in .bed or .bed.gz format that identifies segmental-duplication regions of the genome.
* Overlapping intervals must be merged. If scores are provided, intervals will be annotated with the
* length-weighted average; scores may not be NaN. Otherwise, scores for covered and uncovered intervals
* will be taken as unity and zero, respectively.
* length-weighted average; note that NaN scores will be taken as unity. Otherwise, scores for covered and
* uncovered intervals will be taken as unity and zero, respectively.
* </li>
* </ul>
*
Expand Down Expand Up @@ -124,7 +125,7 @@ public final class AnnotateIntervals extends GATKTool {
private FeatureInput<BEDFeature> mappabilityTrackPath;

@Argument(
doc = "Path to segmental-duplication track in .bed or .bed.gz format (see https://bismap.hoffmanlab.org/). " +
doc = "Path to segmental-duplication track in .bed or .bed.gz format. " +
"Overlapping intervals must be merged.",
fullName = SEGMENTAL_DUPLICATION_TRACK_PATH_LONG_NAME,
optional = true
Expand Down Expand Up @@ -178,21 +179,39 @@ public void onTraversalStart() {

// add optional annotators
if (mappabilityTrackPath != null) {
checkForOverlaps(features, mappabilityTrackPath, sequenceDictionary);
logger.info("Adding mappability annotator...");
annotators.add(new MappabilityAnnotator(mappabilityTrackPath));
}
if (segmentalDuplicationTrackPath != null) {
checkForOverlaps(features, segmentalDuplicationTrackPath, sequenceDictionary);
logger.info("Adding segmental-duplication-content annotator...");
annotators.add(new SegmentalDuplicationContentAnnotator(segmentalDuplicationTrackPath));
}

logger.info("Annotating intervals...");
}

private static void checkForOverlaps(final FeatureManager featureManager,
final FeatureInput<BEDFeature> featureTrackPath,
final SAMSequenceDictionary sequenceDictionary) {
final List<BEDFeature> features = IteratorUtils.toList(featureManager.getFeatureIterator(featureTrackPath));
final GenomeLocParser parser = new GenomeLocParser(sequenceDictionary);
final List<GenomeLoc> genomeLocs = IntervalUtils.genomeLocsFromLocatables(parser, features);
final List<GenomeLoc> mergedGenomeLocs = IntervalUtils.mergeIntervalLocations(
genomeLocs, IntervalMergingRule.OVERLAPPING_ONLY);
if (genomeLocs.size() != mergedGenomeLocs.size()) {
throw new UserException.BadInput(String.format("Feature track %s contains overlapping intervals; " +
"these should be merged.", featureTrackPath));
}
}
@Override
public void traverse() {
final List<AnnotatedInterval> annotatedIntervalList = new ArrayList<>(intervals.size());
for (final SimpleInterval interval : intervals) {
if (interval.getLengthOnReference() == 0) {
throw new UserException.BadInput(String.format("Interval cannot have zero length: %s", interval));
}
final ReferenceContext referenceContext = new ReferenceContext(reference, interval);
final FeatureContext featureContext = new FeatureContext(features, interval);
final AnnotationMap annotations = new AnnotationMap(annotators.stream()
Expand Down Expand Up @@ -224,28 +243,32 @@ public Object onTraversalSuccess() {
abstract static class IntervalAnnotator<T> {
public abstract AnnotationKey<T> getAnnotationKey();

/**
* @param interval assumed to have non-zero length
*/
abstract T apply(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext);

/**
* @param interval assumed to have non-zero length
*/
T applyAndValidate(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
try {
return getAnnotationKey().validate(apply(interval, referenceContext, featureContext));
} catch (final IllegalArgumentException e) {
throw new UserException.BadInput(String.format("%s " +
"Feature track may contain overlapping intervals; these should be merged.", e.getMessage()));
}
return getAnnotationKey().validate(apply(interval, referenceContext, featureContext));
}
}

public static class GCContentAnnotator extends IntervalAnnotator<Double> {
private static class GCContentAnnotator extends IntervalAnnotator<Double> {
@Override
public AnnotationKey<Double> getAnnotationKey() {
return CopyNumberAnnotations.GC_CONTENT;
}

/**
* @param interval assumed to have non-zero length
*/
@Override
Double apply(final Locatable interval,
final ReferenceContext referenceContext,
Expand All @@ -260,8 +283,9 @@ Double apply(final Locatable interval,
}

/**
* If scores are provided, intervals will be annotated with the length-weighted average; scores may not be NaN.
* Otherwise, scores for covered and uncovered intervals will be taken as unity and zero, respectively.
* If scores are provided, intervals will be annotated with the length-weighted average; note that NaN scores will
* be taken as unity. Otherwise, scores for covered and uncovered intervals will be taken as unity and zero,
* respectively.
*/
abstract static class BEDLengthWeightedAnnotator extends IntervalAnnotator<Double> {
private final FeatureInput<BEDFeature> trackPath;
Expand All @@ -270,19 +294,18 @@ abstract static class BEDLengthWeightedAnnotator extends IntervalAnnotator<Doubl
this.trackPath = trackPath;
}

/**
* @param interval assumed to have non-zero length
*/
@Override
Double apply(final Locatable interval,
final ReferenceContext referenceContext,
final FeatureContext featureContext) {
final int intervalLength = interval.getLengthOnReference();
if (intervalLength == 0) {
return Double.NaN;
}
double lengthWeightedSum = 0.;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want to assign a score of 0 if a region is not in the mappability track?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is what we indicate in the documentation.

final List<BEDFeature> features = featureContext.getValues(trackPath);
for (final BEDFeature feature : features) {
final double scoreOrNaN = (double) feature.getScore();
final double score = Double.isNaN(scoreOrNaN) ? 1. : scoreOrNaN; // missing score -> score = 1
final double score = Double.isNaN(scoreOrNaN) ? 1. : scoreOrNaN; // missing or NaN score -> score = 1
lengthWeightedSum += score *
CoordMath.getOverlap(
feature.getStart(), feature.getEnd() - 1, // zero-based
Expand All @@ -292,7 +315,7 @@ Double apply(final Locatable interval,
}
}

public static class MappabilityAnnotator extends BEDLengthWeightedAnnotator {
private static class MappabilityAnnotator extends BEDLengthWeightedAnnotator {
MappabilityAnnotator(final FeatureInput<BEDFeature> mappabilityTrackPath) {
super(mappabilityTrackPath);
}
Expand All @@ -303,7 +326,7 @@ public AnnotationKey<Double> getAnnotationKey() {
}
}

public static class SegmentalDuplicationContentAnnotator extends BEDLengthWeightedAnnotator {
private static class SegmentalDuplicationContentAnnotator extends BEDLengthWeightedAnnotator {
SegmentalDuplicationContentAnnotator(final FeatureInput<BEDFeature> segmentalDuplicationTrackPath) {
super(segmentalDuplicationTrackPath);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollectionUnitTest;
Expand Down Expand Up @@ -37,6 +38,8 @@ public final class AnnotateIntervalsIntegrationTest extends CommandLineProgramTe
"annotate-intervals-hg19-umap-k100-single-read-mappability-merged-20-21.bed.gz");
private static final File SEGMENTAL_DUPLICATION_TRACK_FILE = new File(TEST_SUB_DIR,
"annotate-intervals-hg19-segmental-duplication-20-21.bed.gz");
private static final File SEGMENTAL_DUPLICATION_WITH_OVERLAPS_TRACK_FILE = new File(TEST_SUB_DIR,
"annotate-intervals-hg19-segmental-duplication-20-21-with-overlaps.bed.gz");

private static final SAMSequenceDictionary SEQUENCE_DICTIONARY = ReferenceDataSource.of(REFERENCE_FILE.toPath()).getSequenceDictionary();
private static final LocatableMetadata LOCATABLE_METADATA = new SimpleLocatableMetadata(SEQUENCE_DICTIONARY);
Expand All @@ -45,7 +48,7 @@ public final class AnnotateIntervalsIntegrationTest extends CommandLineProgramTe
* Test case checks that intervals are sorted according to {@link #SEQUENCE_DICTIONARY} and
* adjacent intervals are not merged. This test case is also used in {@link AnnotatedIntervalCollectionUnitTest}.
*/
private static final AnnotatedIntervalCollection EXPECTED_ALL_ANNOTATIONS = new AnnotatedIntervalCollection(
public static final AnnotatedIntervalCollection EXPECTED_ALL_ANNOTATIONS = new AnnotatedIntervalCollection(
LOCATABLE_METADATA,
Arrays.asList(
new AnnotatedInterval(new SimpleInterval("20", 1000001, 1001000),
Expand All @@ -68,6 +71,11 @@ public final class AnnotateIntervalsIntegrationTest extends CommandLineProgramTe
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.448),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add another interval which partially overlaps with intervals in mappability track?

Pair.of(CopyNumberAnnotations.MAPPABILITY, 1.),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.)))),
new AnnotatedInterval(new SimpleInterval("20", 1238001, 1239000),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.3),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 1.),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.218)))),
new AnnotatedInterval(new SimpleInterval("21", 1, 100),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, Double.NaN),
Expand Down Expand Up @@ -153,6 +161,18 @@ public void testAllAnnotations() {
Assert.assertNotSame(result, expected);
}

@Test(expectedExceptions = UserException.BadInput.class)
public void testSegmentalDuplicationContentWithOverlaps() {
final File outputFile = createTempFile("annotate-intervals-test", ".tsv");
final ArgumentsBuilder argsBuilder = new ArgumentsBuilder()
.addReference(REFERENCE_FILE)
.addFileArgument(AnnotateIntervals.SEGMENTAL_DUPLICATION_TRACK_PATH_LONG_NAME, SEGMENTAL_DUPLICATION_WITH_OVERLAPS_TRACK_FILE)
.addArgument(StandardArgumentDefinitions.INTERVALS_LONG_NAME, INTERVALS_FILE.getAbsolutePath())
.addArgument(IntervalArgumentCollection.INTERVAL_MERGING_RULE_LONG_NAME, IntervalMergingRule.OVERLAPPING_ONLY.toString())
.addOutput(outputFile);
runCommandLine(argsBuilder);
}

@Test(expectedExceptions = IllegalArgumentException.class)
public void testIntervalSetRule() {
final File resultOutputFile = createTempFile("annotate-intervals-test", ".tsv");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,25 +1,20 @@
package org.broadinstitute.hellbender.tools.copynumber.formats.collections;

import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.engine.ReferenceDataSource;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.SimpleLocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.AnnotateIntervalsIntegrationTest;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
Expand All @@ -34,44 +29,8 @@ public final class AnnotatedIntervalCollectionUnitTest extends GATKBaseTest {
"annotated-intervals-repeated-annotation.tsv");
private static final File ANNOTATED_INTERVALS_GC_CONTENT_ONLY_FILE = new File(TEST_SUB_DIR,
"annotated-intervals-gc-content-only.tsv");
private static final File REFERENCE_FILE = new File(b37_reference_20_21);

private static final SAMSequenceDictionary SEQUENCE_DICTIONARY = ReferenceDataSource.of(REFERENCE_FILE.toPath()).getSequenceDictionary();
private static final LocatableMetadata LOCATABLE_METADATA = new SimpleLocatableMetadata(SEQUENCE_DICTIONARY);

private static final AnnotatedIntervalCollection EXPECTED_ALL_ANNOTATIONS = new AnnotatedIntervalCollection(
LOCATABLE_METADATA,
Arrays.asList(
new AnnotatedInterval(new SimpleInterval("20", 1000001, 1001000),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.49),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 1.),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.)))),
new AnnotatedInterval(new SimpleInterval("20", 1001001, 1002000),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.483),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 1.),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.)))),
new AnnotatedInterval(new SimpleInterval("20", 1002001, 1003000),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.401),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 1.),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.)))),
new AnnotatedInterval(new SimpleInterval("20", 1003001, 1004000),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.448),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 1.),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.)))),
new AnnotatedInterval(new SimpleInterval("21", 1, 100),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, Double.NaN),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.0),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.)))),
new AnnotatedInterval(new SimpleInterval("21", 101, 200),
new AnnotationMap(Arrays.asList(
Pair.of(CopyNumberAnnotations.GC_CONTENT, Double.NaN),
Pair.of(CopyNumberAnnotations.MAPPABILITY, 0.0),
Pair.of(CopyNumberAnnotations.SEGMENTAL_DUPLICATION_CONTENT, 0.))))));
private static final AnnotatedIntervalCollection EXPECTED_ALL_ANNOTATIONS = AnnotateIntervalsIntegrationTest.EXPECTED_ALL_ANNOTATIONS;

@Test
public void testRead() {
Expand All @@ -92,9 +51,7 @@ public void testReadExtraAnnotation() {

@Test(expectedExceptions = UserException.BadInput.class)
public void testReadRepeatedAnnotation() {
final AnnotatedIntervalCollection result = new AnnotatedIntervalCollection(ANNOTATED_INTERVALS_REPEATED_ANNOTATION_FILE);
Assert.assertEquals(result, EXPECTED_ALL_ANNOTATIONS);
Assert.assertNotSame(result, EXPECTED_ALL_ANNOTATIONS);
new AnnotatedIntervalCollection(ANNOTATED_INTERVALS_REPEATED_ANNOTATION_FILE);
}

@Test
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@
20 1001001 1002000 + .
20 1002001 1003000 + .
20 1003001 1004000 + .
20 1238001 1239000 + .
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ CONTIG START END GC_CONTENT MAPPABILITY SEGMENTAL_DUPLICATION_CONTENT
20 1001001 1002000 0.483000 1.000000 0.000000
20 1002001 1003000 0.401000 1.000000 0.000000
20 1003001 1004000 0.448000 1.000000 0.000000
20 1238001 1239000 0.300000 1.000000 0.218000
21 1 100 NaN 0.000000 0.000000
21 101 200 NaN 0.000000 0.000000
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ CONTIG START END GC_CONTENT MAPPABILITY SEGMENTAL_DUPLICATION_CONTENT EXTRA_ANNO
20 1001001 1002000 0.483000 1.000000 0.000000 0.000000
20 1002001 1003000 0.401000 1.000000 0.000000 0.000000
20 1003001 1004000 0.448000 1.000000 0.000000 0.000000
20 1238001 1239000 0.300000 1.000000 0.218000 0.218000
21 1 100 NaN 0.000000 0.000000 0.000000
21 101 200 NaN 0.000000 0.000000 0.000000
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@ CONTIG START END GC_CONTENT
20 1001001 1002000 0.483000
20 1002001 1003000 0.401000
20 1003001 1004000 0.448000
20 1238001 1239000 0.300000
21 1 100 NaN
21 101 200 NaN
Loading