Skip to content

Commit

Permalink
Added optional mappability and segmental-duplication annotation to An…
Browse files Browse the repository at this point in the history
…notateIntervals.
  • Loading branch information
samuelklee committed Sep 6, 2018
1 parent 2bfe742 commit ae30544
Show file tree
Hide file tree
Showing 25 changed files with 874 additions and 168 deletions.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -280,7 +281,9 @@ protected void runPipeline(final JavaSparkContext ctx) {
inputAnnotatedIntervalsFile, firstReadCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();

//validate input read-counts files (i.e., check intervals and that only integer counts are contained)
//and aggregate as a RealMatrix with dimensions numIntervals x numSamples
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.File;
Expand Down Expand Up @@ -210,7 +211,9 @@ protected Object doWork() {
inputAnnotatedIntervalsFile, readCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();

if (intervalGCContent == null) {
logger.warn("Neither a panel of normals nor GC-content annotations were provided, so only standardization will be performed...");
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,51 @@
package org.broadinstitute.hellbender.tools.copynumber.formats;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public final class CopyNumberFormatsUtils {
public static final String COMMENT_PREFIX = "@"; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
public static final String DOUBLE_FORMAT = "%.6f";

private CopyNumberFormatsUtils() {}

public static String formatDouble(final double value) {
return String.format(DOUBLE_FORMAT, value);
}

/**
* Extracts column names from a TSV file
*/
public static TableColumnCollection readColumnsFromHeader(final File inputFile) {
IOUtils.canReadFile(inputFile);
List<String> columns = null;
try (final XReadLines reader = new XReadLines(inputFile)) {
while (reader.hasNext()) {
String nextLine = reader.next();
if (!nextLine.startsWith(COMMENT_PREFIX)) {
columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
break;
}
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(inputFile);
}
if (columns == null) {
throw new UserException.BadInput(String.format(
"The input file %s does not have a header (starting with comment character %s).",
inputFile.getAbsolutePath(), COMMENT_PREFIX));
}
if (columns.stream().distinct().count() != columns.size()) {
throw new UserException.BadInput("Column headers must all be unique.");
}
return new TableColumnCollection(columns);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ static String formatDouble(final double value) {
}

final class RecordCollectionReader extends TableReader<RECORD> {
private static final String COMMENT_PREFIX = "@"; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
private static final String COMMENT_PREFIX = CopyNumberFormatsUtils.COMMENT_PREFIX; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
private final File file;

RecordCollectionReader(final File file) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,56 +1,181 @@
package org.broadinstitute.hellbender.tools.copynumber.formats.collections;

import org.apache.commons.collections4.ListUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;

import java.io.File;
import java.util.List;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
* Represents a collection of intervals annotated with {@link CopyNumberAnnotations}.
* Supports {@link AnnotationKey}s of integer, long, double, and string type.
* Can be constructed from a TSV file that contains the standard interval column headers,
* any subset of the {@link CopyNumberAnnotations}, and additional columns (which are ignored).
*
* @author Samuel Lee &lt;[email protected]&gt;
*/
public final class AnnotatedIntervalCollection extends AbstractLocatableCollection<LocatableMetadata, AnnotatedInterval> {
//note to developers: repeat the column headers in Javadoc so that they are viewable when linked
/**
* CONTIG, START, END, GC_CONTENT
* CONTIG, START, END; columns headers for additional annotations can be specified
*/
enum AnnotatedIntervalTableColumn {
CONTIG,
START,
END,
GC_CONTENT;
END;

static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
static final TableColumnCollection STANDARD_COLUMNS = new TableColumnCollection((Object[]) values());
}

enum AnnotationValueType {
Integer,
Long,
Double,
String
}

private static final Function<DataLine, AnnotatedInterval> ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER = dataLine -> {
final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
final double gcContent = dataLine.getDouble(AnnotatedIntervalTableColumn.GC_CONTENT);
final SimpleInterval interval = new SimpleInterval(contig, start, end);
final AnnotationSet annotationSet = new AnnotationSet(gcContent);
return new AnnotatedInterval(interval, annotationSet);
private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) -> {
dataLine.append(annotatedInterval.getInterval().getContig())
.append(annotatedInterval.getInterval().getStart())
.append(annotatedInterval.getInterval().getEnd());
final AnnotationMap annotations = annotatedInterval.getAnnotationMap();
for (final AnnotationKey<?> key : annotations.getKeys()) {
final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
switch (type) {
case Integer:
dataLine.append((Integer) annotations.getValue(key));
break;
case Long:
dataLine.append((Long) annotations.getValue(key));
break;
case Double:
dataLine.append(formatDouble((Double) annotations.getValue(key)));
break;
case String:
dataLine.append((String) annotations.getValue(key));
break;
default:
throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
}
}
};

private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) ->
dataLine.append(annotatedInterval.getInterval().getContig())
.append(annotatedInterval.getInterval().getStart())
.append(annotatedInterval.getInterval().getEnd())
.append(formatDouble(annotatedInterval.getAnnotationSet().getGCContent()));

public AnnotatedIntervalCollection(final File inputFile) {
super(inputFile, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
this(inputFile, getAnnotationKeys(CopyNumberFormatsUtils.readColumnsFromHeader(inputFile)));
}

private AnnotatedIntervalCollection(final File inputFile,
final List<AnnotationKey<?>> annotationKeys) {
super(
inputFile,
getColumns(annotationKeys),
getAnnotatedIntervalRecordFromDataLineDecoder(annotationKeys),
ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
}

public AnnotatedIntervalCollection(final LocatableMetadata metadata,
final List<AnnotatedInterval> annotatedIntervals) {
super(metadata, annotatedIntervals, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
super(
metadata,
annotatedIntervals,
getColumns(getAnnotationKeys(annotatedIntervals)),
getAnnotatedIntervalRecordFromDataLineDecoder(getAnnotationKeys(annotatedIntervals)),
ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
}

private static TableColumnCollection getColumns(final List<AnnotationKey<?>> annotationKeys) {
return new TableColumnCollection(
ListUtils.union(
AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names(),
annotationKeys.stream().map(AnnotationKey::getName).collect(Collectors.toList())));
}

private static List<AnnotationKey<?>> getAnnotationKeys(final TableColumnCollection columns) {
Utils.nonNull(columns);
Utils.validateArg(columns.columnCount() != 0, "TableColumnCollection cannot be empty.");
Utils.validateArg(columns.containsAll(AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()),
String.format("TableColumnCollection must contain standard columns: %s.",
AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()));
return CopyNumberAnnotations.ANNOTATIONS.stream()
.filter(a -> columns.contains(a.getName()))
.collect(Collectors.toList());
}

private static List<AnnotationKey<?>> getAnnotationKeys(final List<AnnotatedInterval> annotatedIntervals) {
return annotatedIntervals.isEmpty() ? new ArrayList<>() : annotatedIntervals.get(0).getAnnotationMap().getKeys();
}

private static Function<DataLine, AnnotatedInterval> getAnnotatedIntervalRecordFromDataLineDecoder(
final List<AnnotationKey<?>> annotationKeys) {
return dataLine -> {
final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
final SimpleInterval interval = new SimpleInterval(contig, start, end);
final List<Pair<AnnotationKey<?>, Object>> annotations = new ArrayList<>(annotationKeys.size());
for (final AnnotationKey<?> key : annotationKeys) {
final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
switch (type) {
case Integer:
annotations.add(Pair.of(key, dataLine.getInt(key.getName())));
break;
case Long:
annotations.add(Pair.of(key, dataLine.getLong(key.getName())));
break;
case Double:
annotations.add(Pair.of(key, dataLine.getDouble(key.getName())));
break;
case String:
annotations.add(Pair.of(key, dataLine.get(key.getName())));
break;
default:
throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
}
}
final AnnotationMap annotationMap = new AnnotationMap(annotations);
return new AnnotatedInterval(interval, annotationMap);
};
}

/**
* Columns, encoder, and decoder are not used.
*/
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}

final AbstractRecordCollection<?, ?> that = (AbstractRecordCollection<?, ?>) o;
return getMetadata().equals(that.getMetadata()) &&
getRecords().equals(that.getRecords());
}

/**
* Columns, encoder, and decoder are not used.
*/
@Override
public int hashCode() {
int result = getMetadata().hashCode();
result = 31 * result + getRecords().hashCode();
return result;
}
}
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.broadinstitute.hellbender.tools.copynumber.formats.collections;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.CopyNumberPosteriorDistribution;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.GermlineCNVNamingConstants;
import org.broadinstitute.hellbender.tools.copynumber.gcnv.IntegerCopyNumberState;
Expand Down Expand Up @@ -84,13 +85,10 @@ private static class IntegerCopyNumberStateCollection {
private final List<IntegerCopyNumberState> copyNumberStates;
private final TableColumnCollection columnCollection;

private static final String COMMENT_PREFIX = "@";

IntegerCopyNumberStateCollection(final File inputFile) {
final List<String> copyNumberStatesColumns = extractCopyNumberColumnsFromHeader(inputFile);
this.columnCollection = new TableColumnCollection(copyNumberStatesColumns);
this.columnCollection = CopyNumberFormatsUtils.readColumnsFromHeader(inputFile);
this.copyNumberStates = new ArrayList<>();
copyNumberStatesColumns
columnCollection.names()
.forEach(copyNumberString -> copyNumberStates.add(parseIntegerCopyNumber(copyNumberString)));
}

Expand Down Expand Up @@ -137,29 +135,5 @@ private IntegerCopyNumberState parseIntegerCopyNumber(final String copyNumberSta
"Could not parse copy-number column string (%s) to an integer copy-number.", copyNumberStateString));
}
}

/**
* Extracts column names from a TSV file
*/
private List<String> extractCopyNumberColumnsFromHeader(final File inputFile) {
List<String> columns = null;
try (final XReadLines reader = new XReadLines(inputFile)) {
while (reader.hasNext()) {
String nextLine = reader.next();
if (!nextLine.startsWith(COMMENT_PREFIX)) {
columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
break;
}
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(inputFile);
}
if (columns == null) {
throw new UserException.BadInput(String.format(
"The input file %s does not have a header (starting with comment character %s).",
inputFile.getAbsolutePath(), COMMENT_PREFIX));
}
return columns;
}
}
}
Loading

0 comments on commit ae30544

Please sign in to comment.