Added optional mappability and segmental-duplication annotation to AnnotateIntervals. #5162

Merged: 4 commits, Sep 13, 2018
6 changes: 6 additions & 0 deletions scripts/cnv_wdl/cnv_common_tasks.wdl
@@ -56,6 +56,9 @@ task AnnotateIntervals {
File ref_fasta
File ref_fasta_fai
File ref_fasta_dict
File? mappability_track
File? segmental_duplication_track
Int? feature_query_lookahead
File? gatk4_jar_override

# Runtime parameters
@@ -76,6 +79,9 @@
gatk --java-options "-Xmx${command_mem_mb}m" AnnotateIntervals \
-L ${intervals} \
--reference ${ref_fasta} \
${"--mappability-track " + mappability_track} \
${"--segmental-duplication-track " + segmental_duplication_track} \
--feature-query-lookahead ${default=1000000 feature_query_lookahead} \
--interval-merging-rule OVERLAPPING_ONLY \
--output annotated_intervals.tsv
>>>
12 changes: 12 additions & 0 deletions scripts/cnv_wdl/germline/cnv_germline_cohort_workflow.wdl
@@ -54,6 +54,14 @@ workflow CNVGermlineCohortWorkflow {
Int? padding
Int? bin_length

##################################################
#### optional arguments for AnnotateIntervals ####
##################################################
File? mappability_track
File? segmental_duplication_track
Int? feature_query_lookahead
Int? mem_gb_for_annotate_intervals

##############################################
#### optional arguments for CollectCounts ####
##############################################
@@ -146,8 +154,12 @@
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_dict = ref_fasta_dict,
mappability_track = mappability_track,
segmental_duplication_track = segmental_duplication_track,
feature_query_lookahead = feature_query_lookahead,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_annotate_intervals,
preemptible_attempts = preemptible_attempts
}
}
6 changes: 6 additions & 0 deletions scripts/cnv_wdl/somatic/cnv_somatic_panel_workflow.wdl
@@ -58,6 +58,9 @@ workflow CNVSomaticPanelWorkflow {
##################################################
#### optional arguments for AnnotateIntervals ####
##################################################
File? mappability_track
File? segmental_duplication_track
Int? feature_query_lookahead
Int? mem_gb_for_annotate_intervals

##############################################
@@ -103,6 +106,9 @@
ref_fasta = ref_fasta,
ref_fasta_fai = ref_fasta_fai,
ref_fasta_dict = ref_fasta_dict,
mappability_track = mappability_track,
segmental_duplication_track = segmental_duplication_track,
feature_query_lookahead = feature_query_lookahead,
gatk4_jar_override = gatk4_jar_override,
gatk_docker = gatk_docker,
mem_gb = mem_gb_for_annotate_intervals,

Large diffs are not rendered by default.

@@ -21,6 +21,7 @@
import org.broadinstitute.hellbender.tools.copynumber.denoising.HDF5SVDReadCountPanelOfNormals;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.tools.copynumber.utils.HDF5Utils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
@@ -280,7 +281,9 @@ protected void runPipeline(final JavaSparkContext ctx) {
inputAnnotatedIntervalsFile, firstReadCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();

//validate input read-counts files (i.e., check intervals and that only integer counts are contained)
//and aggregate as a RealMatrix with dimensions numIntervals x numSamples
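
The hunk above (and the matching one in the next file below) switches GC-content extraction from the old fixed-field AnnotationSet to a lookup in each record's AnnotationMap, keyed by CopyNumberAnnotations.GC_CONTENT. As a rough sketch of the resulting call pattern for downstream code (not part of this PR; the class and method names here are made up for illustration):

import java.io.File;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;

final class GCContentExtractionSketch {
    // Reads an annotated-intervals TSV and pulls out the per-interval GC content.
    static double[] readGCContent(final File annotatedIntervalsFile) {
        // The File constructor (added later in this diff) parses the header and picks up
        // whichever CopyNumberAnnotations columns are present.
        final AnnotatedIntervalCollection annotatedIntervals = new AnnotatedIntervalCollection(annotatedIntervalsFile);
        return annotatedIntervals.getRecords().stream()
                .mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
                .toArray();
    }
}

The callers in the two tools above still guard against the case where no annotated-intervals file was provided at all (annotatedIntervals == null).
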
@@ -17,6 +17,7 @@
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.AnnotatedIntervalCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.CopyRatioCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.collections.SimpleCountCollection;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.io.IOUtils;

import java.io.File;
@@ -210,7 +211,9 @@ protected Object doWork() {
inputAnnotatedIntervalsFile, readCounts, logger);
final double[] intervalGCContent = annotatedIntervals == null
? null
: annotatedIntervals.getRecords().stream().mapToDouble(i -> i.getAnnotationSet().getGCContent()).toArray();
: annotatedIntervals.getRecords().stream()
.mapToDouble(i -> i.getAnnotationMap().getValue(CopyNumberAnnotations.GC_CONTENT))
.toArray();

if (intervalGCContent == null) {
logger.warn("Neither a panel of normals nor GC-content annotations were provided, so only standardization will be performed...");
@@ -1,11 +1,51 @@
package org.broadinstitute.hellbender.tools.copynumber.formats;

import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.text.XReadLines;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;
import org.broadinstitute.hellbender.utils.tsv.TableUtils;

import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;

public final class CopyNumberFormatsUtils {
public static final String COMMENT_PREFIX = "@"; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
public static final String DOUBLE_FORMAT = "%.6f";

private CopyNumberFormatsUtils() {}

public static String formatDouble(final double value) {
return String.format(DOUBLE_FORMAT, value);
}

/**
* Extracts column names from a TSV file
*/
public static TableColumnCollection readColumnsFromHeader(final File inputFile) {
IOUtils.canReadFile(inputFile);
List<String> columns = null;
try (final XReadLines reader = new XReadLines(inputFile)) {
while (reader.hasNext()) {
String nextLine = reader.next();
if (!nextLine.startsWith(COMMENT_PREFIX)) {
columns = Arrays.asList(nextLine.split(TableUtils.COLUMN_SEPARATOR_STRING));
break;
}
}
} catch (final IOException e) {
throw new UserException.CouldNotReadInputFile(inputFile);
}
if (columns == null) {
throw new UserException.BadInput(String.format(
"The input file %s does not have a header (starting with comment character %s).",
inputFile.getAbsolutePath(), COMMENT_PREFIX));
}
if (columns.stream().distinct().count() != columns.size()) {
throw new UserException.BadInput("Column headers must all be unique.");
}
return new TableColumnCollection(columns);
}
}
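
The new readColumnsFromHeader utility skips SAM-style "@" comment lines, splits the first remaining line on the table column separator, and rejects missing or duplicated headers. A hedged usage sketch (not part of this PR; it assumes the GC-content column carries the key name GC_CONTENT, as the old enum and the getAnnotationKeys filter in AnnotatedIntervalCollection below suggest):

import java.io.File;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;

final class HeaderInspectionSketch {
    public static void main(final String[] args) {
        // e.g. the TSV written by AnnotateIntervals
        final File annotatedIntervalsTsv = new File(args[0]);
        final TableColumnCollection columns = CopyNumberFormatsUtils.readColumnsFromHeader(annotatedIntervalsTsv);
        // CONTIG, START, END are the standard columns; anything beyond them is an annotation column.
        System.out.println("Columns found: " + columns.names());
        System.out.println("GC_CONTENT present: " + columns.contains("GC_CONTENT"));
    }
}
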
@@ -163,7 +163,7 @@ static String formatDouble(final double value) {
}

final class RecordCollectionReader extends TableReader<RECORD> {
private static final String COMMENT_PREFIX = "@"; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
private static final String COMMENT_PREFIX = CopyNumberFormatsUtils.COMMENT_PREFIX; //SAMTextHeaderCodec.HEADER_LINE_START; we need TableReader to treat SAM header as comment lines
private final File file;

RecordCollectionReader(final File file) throws IOException {
@@ -1,56 +1,181 @@
package org.broadinstitute.hellbender.tools.copynumber.formats.collections;

import org.apache.commons.collections4.ListUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.copynumber.formats.CopyNumberFormatsUtils;
import org.broadinstitute.hellbender.tools.copynumber.formats.metadata.LocatableMetadata;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotationSet;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.tsv.DataLine;
import org.broadinstitute.hellbender.utils.tsv.TableColumnCollection;

import java.io.File;
import java.util.List;
import java.util.*;
import java.util.function.BiConsumer;
import java.util.function.Function;
import java.util.stream.Collectors;

/**
* Represents a collection of intervals annotated with {@link CopyNumberAnnotations}.
* Supports {@link AnnotationKey}s of integer, long, double, and string type.
* Can be constructed from a TSV file that contains the standard interval column headers,
* any subset of the {@link CopyNumberAnnotations}, and additional columns (which are ignored).
*
* @author Samuel Lee &lt;[email protected]&gt;
*/
public final class AnnotatedIntervalCollection extends AbstractLocatableCollection<LocatableMetadata, AnnotatedInterval> {
//note to developers: repeat the column headers in Javadoc so that they are viewable when linked
/**
* CONTIG, START, END, GC_CONTENT
* CONTIG, START, END; columns headers for additional annotations can be specified
*/
enum AnnotatedIntervalTableColumn {
CONTIG,
START,
END,
GC_CONTENT;
END;
Collaborator: Can this enum now be moved up to AbstractLocatableCollection and shared with SimpleIntervalCollection, since it's the same minimal header?

@samuelklee (Contributor, author), Sep 12, 2018: AllelicCountCollection doesn't have the same minimal header (it has a single POSITION = START = END header instead).

Collaborator: Oh, I didn't see that AllelicCountCollection also extends it.


static final TableColumnCollection COLUMNS = new TableColumnCollection((Object[]) values());
static final TableColumnCollection STANDARD_COLUMNS = new TableColumnCollection((Object[]) values());
}

enum AnnotationValueType {
Integer,
Long,
Double,
String
}
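// (Note on the constant names above: they mirror Java simple type names so that
// AnnotationValueType.valueOf(key.getType().getSimpleName()) in the encoder and decoder
// below resolves directly, e.g. Double.class.getSimpleName() -> "Double" -> AnnotationValueType.Double.)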

private static final Function<DataLine, AnnotatedInterval> ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER = dataLine -> {
final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
final double gcContent = dataLine.getDouble(AnnotatedIntervalTableColumn.GC_CONTENT);
final SimpleInterval interval = new SimpleInterval(contig, start, end);
final AnnotationSet annotationSet = new AnnotationSet(gcContent);
return new AnnotatedInterval(interval, annotationSet);
private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) -> {
dataLine.append(annotatedInterval.getInterval().getContig())
.append(annotatedInterval.getInterval().getStart())
.append(annotatedInterval.getInterval().getEnd());
final AnnotationMap annotations = annotatedInterval.getAnnotationMap();
for (final AnnotationKey<?> key : annotations.getKeys()) {
final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
switch (type) {
case Integer:
dataLine.append((Integer) annotations.getValue(key));
break;
case Long:
dataLine.append((Long) annotations.getValue(key));
break;
case Double:
dataLine.append(formatDouble((Double) annotations.getValue(key)));
break;
case String:
dataLine.append((String) annotations.getValue(key));
break;
default:
throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
}
}
};

private static final BiConsumer<AnnotatedInterval, DataLine> ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER = (annotatedInterval, dataLine) ->
dataLine.append(annotatedInterval.getInterval().getContig())
.append(annotatedInterval.getInterval().getStart())
.append(annotatedInterval.getInterval().getEnd())
.append(formatDouble(annotatedInterval.getAnnotationSet().getGCContent()));

public AnnotatedIntervalCollection(final File inputFile) {
super(inputFile, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
this(inputFile, getAnnotationKeys(CopyNumberFormatsUtils.readColumnsFromHeader(inputFile)));
}

private AnnotatedIntervalCollection(final File inputFile,
final List<AnnotationKey<?>> annotationKeys) {
super(
inputFile,
getColumns(annotationKeys),
getAnnotatedIntervalRecordFromDataLineDecoder(annotationKeys),
ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
}

public AnnotatedIntervalCollection(final LocatableMetadata metadata,
final List<AnnotatedInterval> annotatedIntervals) {
super(metadata, annotatedIntervals, AnnotatedIntervalCollection.AnnotatedIntervalTableColumn.COLUMNS, ANNOTATED_INTERVAL_RECORD_FROM_DATA_LINE_DECODER, ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
super(
metadata,
annotatedIntervals,
getColumns(getAnnotationKeys(annotatedIntervals)),
getAnnotatedIntervalRecordFromDataLineDecoder(getAnnotationKeys(annotatedIntervals)),
ANNOTATED_INTERVAL_RECORD_TO_DATA_LINE_ENCODER);
}

private static TableColumnCollection getColumns(final List<AnnotationKey<?>> annotationKeys) {
return new TableColumnCollection(
ListUtils.union(
AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names(),
annotationKeys.stream().map(AnnotationKey::getName).collect(Collectors.toList())));
}

private static List<AnnotationKey<?>> getAnnotationKeys(final TableColumnCollection columns) {
Utils.nonNull(columns);
Utils.validateArg(columns.columnCount() != 0, "TableColumnCollection cannot be empty.");
Utils.validateArg(columns.containsAll(AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()),
String.format("TableColumnCollection must contain standard columns: %s.",
AnnotatedIntervalTableColumn.STANDARD_COLUMNS.names()));
return CopyNumberAnnotations.ANNOTATIONS.stream()
.filter(a -> columns.contains(a.getName()))
.collect(Collectors.toList());
}

private static List<AnnotationKey<?>> getAnnotationKeys(final List<AnnotatedInterval> annotatedIntervals) {
return annotatedIntervals.isEmpty() ? new ArrayList<>() : annotatedIntervals.get(0).getAnnotationMap().getKeys();
}

private static Function<DataLine, AnnotatedInterval> getAnnotatedIntervalRecordFromDataLineDecoder(
final List<AnnotationKey<?>> annotationKeys) {
return dataLine -> {
final String contig = dataLine.get(AnnotatedIntervalTableColumn.CONTIG);
final int start = dataLine.getInt(AnnotatedIntervalTableColumn.START);
final int end = dataLine.getInt(AnnotatedIntervalTableColumn.END);
final SimpleInterval interval = new SimpleInterval(contig, start, end);
final List<Pair<AnnotationKey<?>, Object>> annotations = new ArrayList<>(annotationKeys.size());
for (final AnnotationKey<?> key : annotationKeys) {
final AnnotationValueType type = AnnotationValueType.valueOf(key.getType().getSimpleName());
switch (type) {
case Integer:
annotations.add(Pair.of(key, dataLine.getInt(key.getName())));
break;
case Long:
annotations.add(Pair.of(key, dataLine.getLong(key.getName())));
break;
case Double:
annotations.add(Pair.of(key, dataLine.getDouble(key.getName())));
break;
case String:
annotations.add(Pair.of(key, dataLine.get(key.getName())));
break;
default:
throw new UserException.BadInput(String.format("Unsupported annotation type: %s", type));
}
}
final AnnotationMap annotationMap = new AnnotationMap(annotations);
return new AnnotatedInterval(interval, annotationMap);
};
}

/**
* Columns, encoder, and decoder are not used.
*/
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}

final AbstractRecordCollection<?, ?> that = (AbstractRecordCollection<?, ?>) o;
return getMetadata().equals(that.getMetadata()) &&
getRecords().equals(that.getRecords());
}

/**
* Columns, encoder, and decoder are not used.
*/
@Override
public int hashCode() {
int result = getMetadata().hashCode();
result = 31 * result + getRecords().hashCode();
return result;
}
}
}
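
Taken together, the refactor replaces the GC-only AnnotationSet with a typed AnnotationMap, so a record can carry any subset of the CopyNumberAnnotations (the point of adding mappability and segmental-duplication tracks). A minimal in-memory sketch of building one record through the constructors visible in this diff (not part of the PR; the coordinates and the 0.42 value are arbitrary, and only the GC_CONTENT key, which appears in this diff, is used):

import java.util.ArrayList;
import java.util.List;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.AnnotatedInterval;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationKey;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.AnnotationMap;
import org.broadinstitute.hellbender.tools.copynumber.formats.records.annotation.CopyNumberAnnotations;
import org.broadinstitute.hellbender.utils.SimpleInterval;

final class AnnotatedIntervalSketch {
    static AnnotatedInterval gcAnnotatedInterval() {
        final SimpleInterval interval = new SimpleInterval("20", 1000001, 1001000);
        // AnnotationMap is built from (key, value) pairs, mirroring the decoder above.
        final List<Pair<AnnotationKey<?>, Object>> annotations = new ArrayList<>();
        annotations.add(Pair.of(CopyNumberAnnotations.GC_CONTENT, 0.42));
        final AnnotationMap annotationMap = new AnnotationMap(annotations);
        // Values are read back through the same keys, as in the tools changed above (== 0.42 here).
        final double gcContent = annotationMap.getValue(CopyNumberAnnotations.GC_CONTENT);
        return new AnnotatedInterval(interval, annotationMap);
    }
}

A collection of such records can then be written out or read back through the constructors shown in this diff; the TSV round-trip relies on the annotation column names matching the AnnotationKey names.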