Added optional mappability and segmental-duplication annotation to An…

…notateIntervals. (#5162) * Added optional mappability and segmental-duplication annotation to AnnotateIntervals. * Updated AnnotateIntervals task and calls in WDL (untested).
broadinstitute · Sep 13, 2018 · 63f7b63 · 63f7b63
1 parent 5a09b88
commit 63f7b63
Show file tree

Hide file tree

Showing 32 changed files with 910 additions and 169 deletions.
diff --git a/scripts/cnv_wdl/cnv_common_tasks.wdl b/scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -56,6 +56,9 @@ task AnnotateIntervals {
     File ref_fasta
     File ref_fasta_fai
     File ref_fasta_dict
+    File? mappability_track
+    File? segmental_duplication_track
+    Int? feature_query_lookahead
     File? gatk4_jar_override
 
     # Runtime parameters
@@ -76,6 +79,9 @@ task AnnotateIntervals {
         gatk --java-options "-Xmx${command_mem_mb}m" AnnotateIntervals \
             -L ${intervals} \
             --reference ${ref_fasta} \
+            ${"--mappability-track " + mappability_track} \
+            ${"--segmental-duplication-track " + segmental_duplication_track} \
+            --feature-query-lookahead ${default=1000000 feature_query_lookahead} \
             --interval-merging-rule OVERLAPPING_ONLY \
             --output annotated_intervals.tsv
     >>>

diff --git a/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl b/scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
@@ -54,6 +54,14 @@ workflow CNVGermlineCohortWorkflow {
     Int? padding
     Int? bin_length
 
+    ##################################################
+    #### optional arguments for AnnotateIntervals ####
+    ##################################################
+    File? mappability_track
+    File? segmental_duplication_track
+    Int? feature_query_lookahead
+    Int? mem_gb_for_annotate_intervals
+
     ##############################################
     #### optional arguments for CollectCounts ####
     ##############################################
@@ -146,8 +154,12 @@ workflow CNVGermlineCohortWorkflow {
                 ref_fasta = ref_fasta,
                 ref_fasta_fai = ref_fasta_fai,
                 ref_fasta_dict = ref_fasta_dict,
+                mappability_track = mappability_track,
+                segmental_duplication_track = segmental_duplication_track,
+                feature_query_lookahead = feature_query_lookahead,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
+                mem_gb = mem_gb_for_annotate_intervals,
                 preemptible_attempts = preemptible_attempts
         }
     }

diff --git a/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl b/scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
@@ -58,6 +58,9 @@ workflow CNVSomaticPanelWorkflow {
     ##################################################
     #### optional arguments for AnnotateIntervals ####
     ##################################################
+    File? mappability_track
+    File? segmental_duplication_track
+    Int? feature_query_lookahead
     Int? mem_gb_for_annotate_intervals
 
     ##############################################
@@ -103,6 +106,9 @@ workflow CNVSomaticPanelWorkflow {
                 ref_fasta = ref_fasta,
                 ref_fasta_fai = ref_fasta_fai,
                 ref_fasta_dict = ref_fasta_dict,
+                mappability_track = mappability_track,
+                segmental_duplication_track = segmental_duplication_track,
+                feature_query_lookahead = feature_query_lookahead,
                 gatk4_jar_override = gatk4_jar_override,
                 gatk_docker = gatk_docker,
                 mem_gb = mem_gb_for_annotate_intervals,

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/AnnotateIntervals.java
diff --git a/...in/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java b/...in/java/org/broadinstitute/hellbender/tools/copynumber/CreateReadCountPanelOfNormals.java
@@ -21,6 +21,7 @@
 import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
 import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
 import org.broadinstitute.hellbender.utils.Utils;
@@ -280,7 +281,9 @@ protected void runPipeline(final JavaSparkContext ctx) {
                 inputAnnotatedIntervalsFile, firstReadCounts, logger);
         final double[] intervalGCContent = annotatedIntervals == null
                 ? null
-                : annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
+                : annotatedIntervals.getRecords().stream()
+                    .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
+                    .toArray();
 
         //validate input read-counts files (i.e., check intervals and that only integer counts are contained)
         //and aggregate as a RealMatrix with dimensions numIntervals x numSamples

diff --git a/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java b/src/main/java/org/broadinstitute/hellbender/tools/copynumber/DenoiseReadCounts.java
@@ -17,6 +17,7 @@
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
 import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
 import org.broadinstitute.hellbender.utils.io.IOUtils;
 
 import java.io.File;
@@ -210,7 +211,9 @@ protected Object doWork() {
                     inputAnnotatedIntervalsFile, readCounts, logger);
             final double[] intervalGCContent = annotatedIntervals == null
                     ? null
-                    : annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
+                    : annotatedIntervals.getRecords().stream()
+                    .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
+                    .toArray();
 
             if (intervalGCContent == null) {
                 logger.warn("Neither a panel of normals nor GC-content annotations were provided, so only standardization will be performed...");

diff --git a/...n/java/org/broadinstitute/hellbender/tools/copynumber/formats/CopyNumberFormatsUtils.java b/...n/java/org/broadinstitute/hellbender/tools/copynumber/formats/CopyNumberFormatsUtils.java
@@ -1,11 +1,51 @@
 package org.broadinstitute.hellbender.tools.copynumber.formats;
 
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
+import org.broadinstitute.hellbender.utils.text.XReadLines;
+import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
+import org.broadinstitute.hellbender.utils.tsv.TableUtils;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
 public final class CopyNumberFormatsUtils {
+    public static final String COMMENT_PREFIX = "@";    //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
     public static final String DOUBLE_FORMAT = "%.6f";
 
     private CopyNumberFormatsUtils() {}
 
     public static String formatDouble(final double value) {
         return String.format(DOUBLE_FORMAT, value);
     }
+
+    /**
+     * Extracts column names from a TSV file
+     */
+    public static TableColumnCollection readColumnsFromHeader(final File inputFile) {
+        IOUtils.canReadFile(inputFile);
+        List<String> columns = null;
+        try (final XReadLines reader = new XReadLines(inputFile)) {
+            while (reader.hasNext()) {
+                String nextLine = reader.next();
+                if (!nextLine.startsWith(COMMENT_PREFIX)) {
+                    columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
+                    break;
+                }
+            }
+        } catch (final IOException e) {
+            throw new UserException.CouldNotReadInputFile(inputFile);
+        }
+        if (columns == null) {
+            throw new UserException.BadInput(String.format(
+                    "The input file %s does not have a header (starting with comment character %s).",
+                    inputFile.getAbsolutePath(), COMMENT_PREFIX));
+        }
+        if (columns.stream().distinct().count() != columns.size()) {
+            throw new UserException.BadInput("Column headers must all be unique.");
+        }
+        return new TableColumnCollection(columns);
+    }
 }
diff --git a/...adinstitute/hellbender/tools/copynumber/formats/collections/AbstractRecordCollection.java b/...adinstitute/hellbender/tools/copynumber/formats/collections/AbstractRecordCollection.java
@@ -163,7 +163,7 @@ static String formatDouble(final double value) {
     }
 
     final class RecordCollectionReader extends TableReader<RECORD> {
-        private static final String COMMENT_PREFIX = "@";   //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
+        private static final String COMMENT_PREFIX = CopyNumberFormatsUtils.COMMENT_PREFIX;   //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
         private final File file;
 
         RecordCollectionReader(final File file) throws IOException {

diff --git a/...nstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java b/...nstitute/hellbender/tools/copynumber/formats/collections/AnnotatedIntervalCollection.java
@@ -1,56 +1,181 @@
 package org.broadinstitute.hellbender.tools.copynumber.formats.collections;
 
+import org.apache.commons.collections4.ListUtils;
+import org.apache.commons.lang3.tuple.Pair;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
 import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
 import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
-import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
+import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
 import org.broadinstitute.hellbender.utils.SimpleInterval;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.io.IOUtils;
 import org.broadinstitute.hellbender.utils.tsv.DataLine;
 import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
 
 import java.io.File;
-import java.util.List;
+import java.util.*;
 import java.util.function.BiConsumer;
 import java.util.function.Function;
+import java.util.stream.Collectors;
 
 /**
+ * Represents a collection of intervals annotated with {@link CopyNumberAnnotations}.
+ * Supports {@link AnnotationKey}s of integer, long, double, and string type.
+ * Can be constructed from a TSV file that contains the standard interval column headers,
+ * any subset of the {@link CopyNumberAnnotations}, and additional columns (which are ignored).
+ *
  * @author Samuel Lee &lt;[email protected]&gt;
  */
 public final class AnnotatedIntervalCollection extends AbstractLocatableCollection<LocatableMetadata, AnnotatedInterval> {
     //note to developers: repeat the column headers in Javadoc so that they are viewable when linked
     /**
-     * CONTIG, START, END, GC_CONTENT
+     * CONTIG, START, END; columns headers for additional annotations can be specified
      */
     enum AnnotatedIntervalTableColumn {
         CONTIG,
         START,
-        END,
-        GC_CONTENT;
+        END;
 
-        static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
+        static final TableColumnCollection STANDARD_COLUMNS = new TableColumnCollection((Object[]) values());
+    }
+
+    enum AnnotationValueType {
+        Integer,
+        Long,
+        Double,
+        String
     }
 
-    private static final Function<DataLine, AnnotatedInterval> ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER = dataLine -> {
-        final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
-        final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
-        final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
-        final double gcContent = dataLine.getDouble(AnnotatedIntervalTableColumn.GC_CONTENT);
-        final SimpleInterval interval = new SimpleInterval(contig, start, end);
-        final AnnotationSet annotationSet = new AnnotationSet(gcContent);
-        return new AnnotatedInterval(interval, annotationSet);
+    private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) -> {
+        dataLine.append(annotatedInterval.getInterval().getContig())
+                .append(annotatedInterval.getInterval().getStart())
+                .append(annotatedInterval.getInterval().getEnd());
+        final AnnotationMap annotations = annotatedInterval.getAnnotationMap();
+        for (final AnnotationKey<?> key : annotations.getKeys()) {
+            final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
+            switch (type) {
+                case Integer:
+                    dataLine.append((Integer) annotations.getValue(key));
+                    break;
+                case Long:
+                    dataLine.append((Long) annotations.getValue(key));
+                    break;
+                case Double:
+                    dataLine.append(formatDouble((Double) annotations.getValue(key)));
+                    break;
+                case String:
+                    dataLine.append((String) annotations.getValue(key));
+                    break;
+                default:
+                    throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
+            }
+        }
     };
 
-    private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) ->
-            dataLine.append(annotatedInterval.getInterval().getContig())
-                    .append(annotatedInterval.getInterval().getStart())
-                    .append(annotatedInterval.getInterval().getEnd())
-                    .append(formatDouble(annotatedInterval.getAnnotationSet().getGCContent()));
-
     public AnnotatedIntervalCollection(final File inputFile) {
-        super(inputFile, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
+        this(inputFile, getAnnotationKeys(CopyNumberFormatsUtils.readColumnsFromHeader(inputFile)));
+    }
+
+    private AnnotatedIntervalCollection(final File inputFile,
+                                        final List<AnnotationKey<?>> annotationKeys) {
+        super(
+                inputFile,
+                getColumns(annotationKeys),
+                getAnnotatedIntervalRecordFromDataLineDecoder(annotationKeys),
+                ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
     }
 
     public AnnotatedIntervalCollection(final LocatableMetadata metadata,
                                        final List<AnnotatedInterval> annotatedIntervals) {
-        super(metadata, annotatedIntervals, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
+        super(
+                metadata,
+                annotatedIntervals,
+                getColumns(getAnnotationKeys(annotatedIntervals)),
+                getAnnotatedIntervalRecordFromDataLineDecoder(getAnnotationKeys(annotatedIntervals)),
+                ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
+    }
+
+    private static TableColumnCollection getColumns(final List<AnnotationKey<?>> annotationKeys) {
+        return new TableColumnCollection(
+                ListUtils.union(
+                        AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names(),
+                        annotationKeys.stream().map(AnnotationKey::getName).collect(Collectors.toList())));
+    }
+
+    private static List<AnnotationKey<?>> getAnnotationKeys(final TableColumnCollection columns) {
+        Utils.nonNull(columns);
+        Utils.validateArg(columns.columnCount() != 0, "TableColumnCollection cannot be empty.");
+        Utils.validateArg(columns.containsAll(AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()),
+                String.format("TableColumnCollection must contain standard columns: %s.",
+                        AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()));
+        return CopyNumberAnnotations.ANNOTATIONS.stream()
+                .filter(a -> columns.contains(a.getName()))
+                .collect(Collectors.toList());
+    }
+
+    private static List<AnnotationKey<?>> getAnnotationKeys(final List<AnnotatedInterval> annotatedIntervals) {
+        return annotatedIntervals.isEmpty() ? new ArrayList<>() : annotatedIntervals.get(0).getAnnotationMap().getKeys();
+    }
+
+    private static Function<DataLine, AnnotatedInterval> getAnnotatedIntervalRecordFromDataLineDecoder(
+            final List<AnnotationKey<?>> annotationKeys) {
+        return dataLine -> {
+            final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
+            final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
+            final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
+            final SimpleInterval interval = new SimpleInterval(contig, start, end);
+            final List<Pair<AnnotationKey<?>, Object>> annotations = new ArrayList<>(annotationKeys.size());
+            for (final AnnotationKey<?> key : annotationKeys) {
+                final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
+                switch (type) {
+                    case Integer:
+                        annotations.add(Pair.of(key, dataLine.getInt(key.getName())));
+                        break;
+                    case Long:
+                        annotations.add(Pair.of(key, dataLine.getLong(key.getName())));
+                        break;
+                    case Double:
+                        annotations.add(Pair.of(key, dataLine.getDouble(key.getName())));
+                        break;
+                    case String:
+                        annotations.add(Pair.of(key, dataLine.get(key.getName())));
+                        break;
+                    default:
+                        throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
+                }
+            }
+            final AnnotationMap annotationMap = new AnnotationMap(annotations);
+            return new AnnotatedInterval(interval, annotationMap);
+        };
+    }
+
+    /**
+     * Columns, encoder, and decoder are not used.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) {
+            return true;
+        }
+        if (o == null || getClass() != o.getClass()) {
+            return false;
+        }
+
+        final AbstractRecordCollection<?, ?> that = (AbstractRecordCollection<?, ?>) o;
+        return getMetadata().equals(that.getMetadata()) &&
+                getRecords().equals(that.getRecords());
+    }
+
+    /**
+     * Columns, encoder, and decoder are not used.
+     */
+    @Override
+    public int hashCode() {
+        int result = getMetadata().hashCode();
+        result = 31 * result + getRecords().hashCode();
+        return result;
     }
-}
+}