Skip to content

Commit

Permalink
Added optional mappability and segmental-duplication annotation to An…
Browse files Browse the repository at this point in the history
…notateIntervals. (#5162)

* Added optional mappability and segmental-duplication annotation to AnnotateIntervals.

* Updated AnnotateIntervals task and calls in WDL (untested).
  • Loading branch information
samuelklee authored Sep 13, 2018
1 parent 5a09b88 commit 63f7b63
Show file tree
Hide file tree
Showing 32 changed files with 910 additions and 169 deletions.
6 changes: 6 additions & 0 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ task AnnotateIntervals {
File ref_fasta
File ref_fasta_fai
File ref_fasta_dict
File? mappability_track
File? segmental_duplication_track
Int? feature_query_lookahead
File? gatk4_jar_override

# Runtime parameters
Expand All @@ -76,6 +79,9 @@ task AnnotateIntervals {
gatk --java-options "-Xmx${command_mem_mb}m" AnnotateIntervals \
-L ${intervals} \
--reference ${ref_fasta} \
${"--mappability-track " + mappability_track} \
${"--segmental-duplication-track " + segmental_duplication_track} \
--feature-query-lookahead ${default=1000000 feature_query_lookahead} \
--interval-merging-rule OVERLAPPING_ONLY \
--output annotated_intervals.tsv
>>>
Expand Down
12 changes: 12 additions & 0 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,14 @@ workflow CNVGermlineCohortWorkflow {
Int? padding
Int? bin_length

##################################################
#### optional arguments for AnnotateIntervals ####
##################################################
File? mappability_track
File? segmental_duplication_track
Int? feature_query_lookahead
Int? mem_gb_for_annotate_intervals

##############################################
#### optional arguments for CollectCounts ####
##############################################
Expand Down Expand Up @@ -146,8 +154,12 @@ workflow CNVGermlineCohortWorkflow {
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_dict = ref_fasta_dict,
mappability_track = mappability_track,
segmental_duplication_track = segmental_duplication_track,
feature_query_lookahead = feature_query_lookahead,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_annotate_intervals,
preemptible_attempts = preemptible_attempts
}
}
Expand Down
6 changes: 6 additions & 0 deletions scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,9 @@ workflow CNVSomaticPanelWorkflow {
##################################################
#### optional arguments for AnnotateIntervals ####
##################################################
File? mappability_track
File? segmental_duplication_track
Int? feature_query_lookahead
Int? mem_gb_for_annotate_intervals

##############################################
Expand Down Expand Up @@ -103,6 +106,9 @@ workflow CNVSomaticPanelWorkflow {
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_dict = ref_fasta_dict,
mappability_track = mappability_track,
segmental_duplication_track = segmental_duplication_track,
feature_query_lookahead = feature_query_lookahead,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_annotate_intervals,
Expand Down

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -280,7 +281,9 @@ protected void runPipeline(final JavaSparkContext ctx) {
inputAnnotatedIntervalsFile, firstReadCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();

//validate input read-counts files (i.e., check intervals and that only integer counts are contained)
//and aggregate as a RealMatrix with dimensions numIntervals x numSamples
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.File;
Expand Down Expand Up @@ -210,7 +211,9 @@ protected Object doWork() {
inputAnnotatedIntervalsFile, readCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();

if (intervalGCContent == null) {
logger.warn("Neither a panel of normals nor GC-content annotations were provided, so only standardization will be performed...");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,51 @@
package org.broadinstitute.hellbender.tools.copynumber.formats;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public final class CopyNumberFormatsUtils {
public static final String COMMENT_PREFIX = "@"; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
public static final String DOUBLE_FORMAT = "%.6f";

private CopyNumberFormatsUtils() {}

public static String formatDouble(final double value) {
return String.format(DOUBLE_FORMAT, value);
}

/**
* Extracts column names from a TSV file
*/
public static TableColumnCollection readColumnsFromHeader(final File inputFile) {
IOUtils.canReadFile(inputFile);
List<String> columns = null;
try (final XReadLines reader = new XReadLines(inputFile)) {
while (reader.hasNext()) {
String nextLine = reader.next();
if (!nextLine.startsWith(COMMENT_PREFIX)) {
columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
break;
}
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(inputFile);
}
if (columns == null) {
throw new UserException.BadInput(String.format(
"The input file %s does not have a header (starting with comment character %s).",
inputFile.getAbsolutePath(), COMMENT_PREFIX));
}
if (columns.stream().distinct().count() != columns.size()) {
throw new UserException.BadInput("Column headers must all be unique.");
}
return new TableColumnCollection(columns);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ static String formatDouble(final double value) {
}

final class RecordCollectionReader extends TableReader<RECORD> {
private static final String COMMENT_PREFIX = "@"; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
private static final String COMMENT_PREFIX = CopyNumberFormatsUtils.COMMENT_PREFIX; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
private final File file;

RecordCollectionReader(final File file) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,56 +1,181 @@
package org.broadinstitute.hellbender.tools.copynumber.formats.collections;

import org.apache.commons.collections4.ListUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;

import java.io.File;
import java.util.List;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
* Represents a collection of intervals annotated with {@link CopyNumberAnnotations}.
* Supports {@link AnnotationKey}s of integer, long, double, and string type.
* Can be constructed from a TSV file that contains the standard interval column headers,
* any subset of the {@link CopyNumberAnnotations}, and additional columns (which are ignored).
*
* @author Samuel Lee &lt;[email protected]&gt;
*/
public final class AnnotatedIntervalCollection extends AbstractLocatableCollection<LocatableMetadata, AnnotatedInterval> {
//note to developers: repeat the column headers in Javadoc so that they are viewable when linked
/**
* CONTIG, START, END, GC_CONTENT
* CONTIG, START, END; columns headers for additional annotations can be specified
*/
enum AnnotatedIntervalTableColumn {
CONTIG,
START,
END,
GC_CONTENT;
END;

static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
static final TableColumnCollection STANDARD_COLUMNS = new TableColumnCollection((Object[]) values());
}

enum AnnotationValueType {
Integer,
Long,
Double,
String
}

private static final Function<DataLine, AnnotatedInterval> ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER = dataLine -> {
final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
final double gcContent = dataLine.getDouble(AnnotatedIntervalTableColumn.GC_CONTENT);
final SimpleInterval interval = new SimpleInterval(contig, start, end);
final AnnotationSet annotationSet = new AnnotationSet(gcContent);
return new AnnotatedInterval(interval, annotationSet);
private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) -> {
dataLine.append(annotatedInterval.getInterval().getContig())
.append(annotatedInterval.getInterval().getStart())
.append(annotatedInterval.getInterval().getEnd());
final AnnotationMap annotations = annotatedInterval.getAnnotationMap();
for (final AnnotationKey<?> key : annotations.getKeys()) {
final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
switch (type) {
case Integer:
dataLine.append((Integer) annotations.getValue(key));
break;
case Long:
dataLine.append((Long) annotations.getValue(key));
break;
case Double:
dataLine.append(formatDouble((Double) annotations.getValue(key)));
break;
case String:
dataLine.append((String) annotations.getValue(key));
break;
default:
throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
}
}
};

private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) ->
dataLine.append(annotatedInterval.getInterval().getContig())
.append(annotatedInterval.getInterval().getStart())
.append(annotatedInterval.getInterval().getEnd())
.append(formatDouble(annotatedInterval.getAnnotationSet().getGCContent()));

public AnnotatedIntervalCollection(final File inputFile) {
super(inputFile, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
this(inputFile, getAnnotationKeys(CopyNumberFormatsUtils.readColumnsFromHeader(inputFile)));
}

private AnnotatedIntervalCollection(final File inputFile,
final List<AnnotationKey<?>> annotationKeys) {
super(
inputFile,
getColumns(annotationKeys),
getAnnotatedIntervalRecordFromDataLineDecoder(annotationKeys),
ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
}

public AnnotatedIntervalCollection(final LocatableMetadata metadata,
final List<AnnotatedInterval> annotatedIntervals) {
super(metadata, annotatedIntervals, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
super(
metadata,
annotatedIntervals,
getColumns(getAnnotationKeys(annotatedIntervals)),
getAnnotatedIntervalRecordFromDataLineDecoder(getAnnotationKeys(annotatedIntervals)),
ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
}

private static TableColumnCollection getColumns(final List<AnnotationKey<?>> annotationKeys) {
return new TableColumnCollection(
ListUtils.union(
AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names(),
annotationKeys.stream().map(AnnotationKey::getName).collect(Collectors.toList())));
}

private static List<AnnotationKey<?>> getAnnotationKeys(final TableColumnCollection columns) {
Utils.nonNull(columns);
Utils.validateArg(columns.columnCount() != 0, "TableColumnCollection cannot be empty.");
Utils.validateArg(columns.containsAll(AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()),
String.format("TableColumnCollection must contain standard columns: %s.",
AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()));
return CopyNumberAnnotations.ANNOTATIONS.stream()
.filter(a -> columns.contains(a.getName()))
.collect(Collectors.toList());
}

private static List<AnnotationKey<?>> getAnnotationKeys(final List<AnnotatedInterval> annotatedIntervals) {
return annotatedIntervals.isEmpty() ? new ArrayList<>() : annotatedIntervals.get(0).getAnnotationMap().getKeys();
}

private static Function<DataLine, AnnotatedInterval> getAnnotatedIntervalRecordFromDataLineDecoder(
final List<AnnotationKey<?>> annotationKeys) {
return dataLine -> {
final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
final SimpleInterval interval = new SimpleInterval(contig, start, end);
final List<Pair<AnnotationKey<?>, Object>> annotations = new ArrayList<>(annotationKeys.size());
for (final AnnotationKey<?> key : annotationKeys) {
final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
switch (type) {
case Integer:
annotations.add(Pair.of(key, dataLine.getInt(key.getName())));
break;
case Long:
annotations.add(Pair.of(key, dataLine.getLong(key.getName())));
break;
case Double:
annotations.add(Pair.of(key, dataLine.getDouble(key.getName())));
break;
case String:
annotations.add(Pair.of(key, dataLine.get(key.getName())));
break;
default:
throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
}
}
final AnnotationMap annotationMap = new AnnotationMap(annotations);
return new AnnotatedInterval(interval, annotationMap);
};
}

/**
* Columns, encoder, and decoder are not used.
*/
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}

final AbstractRecordCollection<?, ?> that = (AbstractRecordCollection<?, ?>) o;
return getMetadata().equals(that.getMetadata()) &&
getRecords().equals(that.getRecords());
}

/**
* Columns, encoder, and decoder are not used.
*/
@Override
public int hashCode() {
int result = getMetadata().hashCode();
result = 31 * result + getRecords().hashCode();
return result;
}
}
}
Loading

0 comments on commit 63f7b63

Please sign in to comment.