Added optional mappability and segmental-duplication annotation to AnnotateIntervals.
broadinstitute · Sep 6, 2018 · ae30544 · ae30544
1 parent 2bfe742
commit ae30544
Show file tree

Hide file tree

Showing 25 changed files with 874 additions and 168 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java
diff --git a/...in/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/...in/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java
@@ -21,6 +21,7 @@
 import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
 import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.Utils;
@@ -280,7 +281,9 @@ protected void runPipeline(final JavaSparkContext ctx) {
                 inputAnnotatedIntervalsFile, firstReadCounts, logger);
         final double[] intervalGCContent = annotatedIntervals == null
                 ? null
-                : annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
+                : annotatedIntervals.getRecords().stream()
+                    .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
+                    .toArray();
 
         //validate input read-counts files (i.e., check intervals and that only integer counts are contained)
         //and aggregate as a RealMatrix with dimensions numIntervals x numSamples

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java
@@ -17,6 +17,7 @@
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 
 import java.io.File;
@@ -210,7 +211,9 @@ protected Object doWork() {
                     inputAnnotatedIntervalsFile, readCounts, logger);
             final double[] intervalGCContent = annotatedIntervals == null
                     ? null
-                    : annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
+                    : annotatedIntervals.getRecords().stream()
+                    .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
+                    .toArray();
 
             if (intervalGCContent == null) {
                 logger.warn("Neither a panel of normals nor GC-content annotations were provided, so only standardization will be performed...");

diff --git a/...n/java/org/broadinstitute/hellbender/tools/copynumber/formats/CopyNumberFormatsUtils.java b/...n/java/org/broadinstitute/hellbender/tools/copynumber/formats/CopyNumberFormatsUtils.java
@@ -1,11 +1,51 @@
 package org.broadinstitute.hellbender.tools.copynumber.formats;
 
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+import org.broadinstitute.hellbender.utils.text.XReadLines;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
 public final class CopyNumberFormatsUtils {
+    public static final String COMMENT_PREFIX = "@";    //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
     public static final String DOUBLE_FORMAT = "%.6f";
 
     private CopyNumberFormatsUtils() {}
 
     public static String formatDouble(final double value) {
         return String.format(DOUBLE_FORMAT, value);
     }
+
+    /**
+     * Extracts the column names from the column-header line of a TSV file, i.e., the first line
+     * that does not start with {@link #COMMENT_PREFIX}. Lines preceding it are skipped as comments.
+     *
+     * @param inputFile TSV file; passed to {@code IOUtils.canReadFile} before being read
+     * @return a {@link TableColumnCollection} built from the column names of the header line
+     * @throws UserException.CouldNotReadInputFile if an I/O error occurs while reading the file
+     * @throws UserException.BadInput if no line without the comment prefix is found,
+     *                                or if the column names are not all unique
+     */
+    public static TableColumnCollection readColumnsFromHeader(final File inputFile) {
+        IOUtils.canReadFile(inputFile);
+        List<String> columns = null;
+        try (final XReadLines reader = new XReadLines(inputFile)) {
+            while (reader.hasNext()) {
+                final String nextLine = reader.next();
+                if (!nextLine.startsWith(COMMENT_PREFIX)) {
+                    //the first non-comment line is the column header; nothing after it is needed
+                    columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
+                    break;
+                }
+            }
+        } catch (final IOException e) {
+            //pass the underlying exception along so that the reason for the failure is not lost
+            throw new UserException.CouldNotReadInputFile(inputFile, e);
+        }
+        if (columns == null) {
+            //only comment lines (or no lines at all) were found
+            throw new UserException.BadInput(String.format(
+                    "The input file %s does not have a column header (i.e., a line that does not start with the comment character %s).",
+                    inputFile.getAbsolutePath(), COMMENT_PREFIX));
+        }
+        if (columns.stream().distinct().count() != columns.size()) {
+            throw new UserException.BadInput("Column headers must all be unique.");
+        }
+        return new TableColumnCollection(columns);
+    }
 }
diff --git a/...adinstitute/hellbender/tools/copynumber/formats/collections/AbstractRecordCollection.java b/...adinstitute/hellbender/tools/copynumber/formats/collections/AbstractRecordCollection.java
@@ -163,7 +163,7 @@ static String formatDouble(final double value) {
     }
 
     final class RecordCollectionReader extends TableReader<RECORD> {
-        private static final String COMMENT_PREFIX = "@";   //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
+        private static final String COMMENT_PREFIX = CopyNumberFormatsUtils.COMMENT_PREFIX;   //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
         private final File file;
 
         RecordCollectionReader(final File file) throws IOException {

diff --git a/...nstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java b/...nstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java
@@ -1,56 +1,181 @@
 package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
 
+import org.apache.commons.collections4.ListUtils;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
 import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
-import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.tsv.DataLine;
 import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
 
 import java.io.File;
-import java.util.List;
+import java.util.*;
 import java.util.function.BiConsumer;
 import java.util.function.Function;
+import java.util.stream.Collectors;
 
 /**
+ * Represents a collection of intervals annotated with {@link CopyNumberAnnotations}.
+ * Supports {@link AnnotationKey}s of integer, long, double, and string type.
+ * Can be constructed from a TSV file that contains the standard interval column headers,
+ * any subset of the {@link CopyNumberAnnotations}, and additional columns (which are ignored).
+ *
  * @author Samuel Lee &lt;[email protected]&gt;
  */
 public final class AnnotatedIntervalCollection extends AbstractLocatableCollection<LocatableMetadata, AnnotatedInterval> {
     //note to developers: repeat the column headers in Javadoc so that they are viewable when linked
     /**
-     * CONTIG, START, END, GC_CONTENT
+     * CONTIG, START, END; column headers for additional annotations can be specified
      */
     enum AnnotatedIntervalTableColumn {
         CONTIG,
         START,
-        END,
-        GC_CONTENT;
+        END;
 
-        static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
+        static final TableColumnCollection STANDARD_COLUMNS = new TableColumnCollection((Object[]) values());
+    }
+
+    /**
+     * Value types supported for annotations. Each constant name must equal the simple class name
+     * of the corresponding {@link AnnotationKey} type, because the encoder and decoder look the constant up
+     * with {@code AnnotationValueType.valueOf(key.getType().getSimpleName())}.
+     */
+    enum AnnotationValueType {
+        Integer,
+        Long,
+        Double,
+        String
     }
 
-    private static final Function<DataLine, AnnotatedInterval> ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER = dataLine -> {
-        final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
-        final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
-        final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
-        final double gcContent = dataLine.getDouble(AnnotatedIntervalTableColumn.GC_CONTENT);
-        final SimpleInterval interval = new SimpleInterval(contig, start, end);
-        final AnnotationSet annotationSet = new AnnotationSet(gcContent);
-        return new AnnotatedInterval(interval, annotationSet);
+    /**
+     * Writes an {@link AnnotatedInterval} to a {@link DataLine}: contig, start, and end first,
+     * followed by one value per key returned by the record's {@link AnnotationMap}, in the order of those keys.
+     * Double values are written via {@code formatDouble}; values of the other supported types are appended as is.
+     * Throws {@link UserException.BadInput} if a key has a type that is not listed in {@link AnnotationValueType}.
+     */
+    private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) -> {
+        dataLine.append(annotatedInterval.getInterval().getContig())
+                .append(annotatedInterval.getInterval().getStart())
+                .append(annotatedInterval.getInterval().getEnd());
+        final AnnotationMap annotations = annotatedInterval.getAnnotationMap();
+        for (final AnnotationKey<?> key : annotations.getKeys()) {
+            final String typeName = key.getType().getSimpleName();
+            final AnnotationValueType type;
+            try {
+                type = AnnotationValueType.valueOf(typeName);
+            } catch (final IllegalArgumentException e) {
+                //valueOf throws for a name that is not an enum constant, which would otherwise
+                //bypass the "unsupported type" error in the switch below
+                throw new UserException.BadInput(String.format("Unsupported annotation type: %s", typeName));
+            }
+            switch (type) {
+                case Integer:
+                    dataLine.append((Integer) annotations.getValue(key));
+                    break;
+                case Long:
+                    dataLine.append((Long) annotations.getValue(key));
+                    break;
+                case Double:
+                    dataLine.append(formatDouble((Double) annotations.getValue(key)));
+                    break;
+                case String:
+                    dataLine.append((String) annotations.getValue(key));
+                    break;
+                default:
+                    //guards against a constant being added to AnnotationValueType without a case here
+                    throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
+            }
+        }
     };
 
-    private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) ->
-            dataLine.append(annotatedInterval.getInterval().getContig())
-                    .append(annotatedInterval.getInterval().getStart())
-                    .append(annotatedInterval.getInterval().getEnd())
-                    .append(formatDouble(annotatedInterval.getAnnotationSet().getGCContent()));
-
+    /**
+     * Reads the collection from a TSV file. The column header of the file is read first
+     * (via {@code CopyNumberFormatsUtils.readColumnsFromHeader}) to determine which of the
+     * {@link CopyNumberAnnotations} are present as columns; only those annotations are decoded.
+     */
     public AnnotatedIntervalCollection(final File inputFile) {
-        super(inputFile, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
+        this(inputFile, getAnnotationKeys(CopyNumberFormatsUtils.readColumnsFromHeader(inputFile)));
+    }
+
+    /**
+     * Reads the collection from a TSV file, using the standard interval columns plus one column
+     * per given annotation key.
+     */
+    private AnnotatedIntervalCollection(final File inputFile,
+                                        final List<AnnotationKey<?>> annotationKeys) {
+        super(
+                inputFile,
+                getColumns(annotationKeys),
+                getAnnotatedIntervalRecordFromDataLineDecoder(annotationKeys),
+                ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
     }
 
+    /**
+     * Creates the collection from in-memory records. The annotation columns are taken from the keys of
+     * the first record (no annotation columns if the list is empty).
+     * NOTE(review): the keys of the remaining records are not compared against those of the first record here;
+     * confirm whether that consistency is validated elsewhere.
+     */
     public AnnotatedIntervalCollection(final LocatableMetadata metadata,
                                        final List<AnnotatedInterval> annotatedIntervals) {
-        super(metadata, annotatedIntervals, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
+        super(
+                metadata,
+                annotatedIntervals,
+                getColumns(getAnnotationKeys(annotatedIntervals)),
+                getAnnotatedIntervalRecordFromDataLineDecoder(getAnnotationKeys(annotatedIntervals)),
+                ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
     }
+
+    /**
+     * Builds the full column collection: the standard interval columns followed by
+     * the names of the given annotation keys.
+     */
+    private static TableColumnCollection getColumns(final List<AnnotationKey<?>> annotationKeys) {
+        final List<String> annotationColumnNames = annotationKeys.stream()
+                .map(AnnotationKey::getName)
+                .collect(Collectors.toList());
+        final List<String> allColumnNames = ListUtils.union(
+                AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names(),
+                annotationColumnNames);
+        return new TableColumnCollection(allColumnNames);
+    }
+
+    /**
+     * Determines which of the {@link CopyNumberAnnotations} appear among the given columns.
+     * The columns must be non-null, non-empty, and contain all of the standard interval columns.
+     */
+    private static List<AnnotationKey<?>> getAnnotationKeys(final TableColumnCollection columns) {
+        Utils.nonNull(columns);
+        Utils.validateArg(columns.columnCount() != 0, "TableColumnCollection cannot be empty.");
+        final List<String> standardColumnNames = AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names();
+        Utils.validateArg(columns.containsAll(standardColumnNames),
+                String.format("TableColumnCollection must contain standard columns: %s.", standardColumnNames));
+        final List<AnnotationKey<?>> presentKeys = new ArrayList<>();
+        for (final AnnotationKey<?> annotationKey : CopyNumberAnnotations.ANNOTATIONS) {
+            if (columns.contains(annotationKey.getName())) {
+                presentKeys.add(annotationKey);
+            }
+        }
+        return presentKeys;
+    }
+
+    /**
+     * Returns the annotation keys of the first record, or a new empty list if there are no records.
+     */
+    private static List<AnnotationKey<?>> getAnnotationKeys(final List<AnnotatedInterval> annotatedIntervals) {
+        if (annotatedIntervals.isEmpty()) {
+            return new ArrayList<>();
+        }
+        return annotatedIntervals.get(0).getAnnotationMap().getKeys();
+    }
+
+    /**
+     * Builds a decoder that converts a {@link DataLine} into an {@link AnnotatedInterval}: the interval is
+     * read from the CONTIG, START, and END columns, and one value per given annotation key is read from
+     * the column named after that key.
+     *
+     * @param annotationKeys keys of the annotations to read from each line
+     * @return the decoder; when applied, it throws {@link UserException.BadInput} if a key has a type
+     *         that is not listed in {@link AnnotationValueType}
+     */
+    private static Function<DataLine, AnnotatedInterval> getAnnotatedIntervalRecordFromDataLineDecoder(
+            final List<AnnotationKey<?>> annotationKeys) {
+        return dataLine -> {
+            final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
+            final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
+            final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
+            final SimpleInterval interval = new SimpleInterval(contig, start, end);
+            final List<Pair<AnnotationKey<?>, Object>> annotations = new ArrayList<>(annotationKeys.size());
+            for (final AnnotationKey<?> key : annotationKeys) {
+                final String typeName = key.getType().getSimpleName();
+                final AnnotationValueType type;
+                try {
+                    type = AnnotationValueType.valueOf(typeName);
+                } catch (final IllegalArgumentException e) {
+                    //valueOf throws for a name that is not an enum constant, which would otherwise
+                    //bypass the "unsupported type" error in the switch below
+                    throw new UserException.BadInput(String.format("Unsupported annotation type: %s", typeName));
+                }
+                switch (type) {
+                    case Integer:
+                        annotations.add(Pair.of(key, dataLine.getInt(key.getName())));
+                        break;
+                    case Long:
+                        annotations.add(Pair.of(key, dataLine.getLong(key.getName())));
+                        break;
+                    case Double:
+                        annotations.add(Pair.of(key, dataLine.getDouble(key.getName())));
+                        break;
+                    case String:
+                        annotations.add(Pair.of(key, dataLine.get(key.getName())));
+                        break;
+                    default:
+                        //guards against a constant being added to AnnotationValueType without a case here
+                        throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
+                }
+            }
+            final AnnotationMap annotationMap = new AnnotationMap(annotations);
+            return new AnnotatedInterval(interval, annotationMap);
+        };
+    }
+
+    /**
+     * Two collections are equal when they are of the same class and have equal metadata and records;
+     * columns, encoder, and decoder are not used.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (o == this) {
+            return true;
+        }
+        if (o == null || o.getClass() != getClass()) {
+            return false;
+        }
+        final AbstractRecordCollection<?, ?> other = (AbstractRecordCollection<?, ?>) o;
+        if (!getMetadata().equals(other.getMetadata())) {
+            return false;
+        }
+        return getRecords().equals(other.getRecords());
+    }
+
+    /**
+     * Computed from metadata and records only; columns, encoder, and decoder are not used.
+     */
+    @Override
+    public int hashCode() {
+        final int metadataHash = getMetadata().hashCode();
+        final int recordsHash = getRecords().hashCode();
+        return 31 * metadataHash + recordsHash;
     }
-}
+}
diff --git a/...ender/tools/copynumber/formats/collections/CopyNumberPosteriorDistributionCollection.java b/...ender/tools/copynumber/formats/collections/CopyNumberPosteriorDistributionCollection.java
@@ -1,6 +1,7 @@
 package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
 
 import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
 import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyNumberPosteriorDistribution;
 import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVNamingConstants;
 import org.broadinstitute.hellbender.tools.copynumber.gcnv.IntegerCopyNumberState;
@@ -84,13 +85,10 @@ private static class IntegerCopyNumberStateCollection {
         private final List<IntegerCopyNumberState> copyNumberStates;
         private final TableColumnCollection columnCollection;
 
-        private static final String COMMENT_PREFIX = "@";
-
         IntegerCopyNumberStateCollection(final File inputFile) {
-            final List<String> copyNumberStatesColumns = extractCopyNumberColumnsFromHeader(inputFile);
-            this.columnCollection = new TableColumnCollection(copyNumberStatesColumns);
+            this.columnCollection = CopyNumberFormatsUtils.readColumnsFromHeader(inputFile);
             this.copyNumberStates = new ArrayList<>();
-            copyNumberStatesColumns
+            columnCollection.names()
                     .forEach(copyNumberString -> copyNumberStates.add(parseIntegerCopyNumber(copyNumberString)));
         }
 
@@ -137,29 +135,5 @@ private IntegerCopyNumberState parseIntegerCopyNumber(final String copyNumberSta
                         "Could not parse copy-number column string (%s) to an integer copy-number.", copyNumberStateString));
             }
         }
-
-        /**
-         * Extracts column names from a TSV file
-         */
-        private List<String> extractCopyNumberColumnsFromHeader(final File inputFile) {
-            List<String> columns = null;
-            try (final XReadLines reader = new XReadLines(inputFile)) {
-                while (reader.hasNext()) {
-                    String nextLine = reader.next();
-                    if (!nextLine.startsWith(COMMENT_PREFIX)) {
-                        columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
-                        break;
-                    }
-                }
-            } catch (final IOException e) {
-                throw new UserException.CouldNotReadInputFile(inputFile);
-            }
-            if (columns == null) {
-                throw new UserException.BadInput(String.format(
-                        "The input file %s does not have a header (starting with comment character %s).",
-                        inputFile.getAbsolutePath(), COMMENT_PREFIX));
-            }
-            return columns;
-        }
     }
 }