Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Complex SV intervals support #8521

Merged
merged 3 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@

import com.google.common.collect.HashBiMap;
import htsjdk.variant.variantcontext.Allele;
import org.aeonbits.owner.util.Collections;

import java.util.Map;
import java.util.Set;

import static java.util.Map.entry;

Expand Down Expand Up @@ -121,6 +123,13 @@ public enum ComplexVariantSubtype {
entry("CTX_INV", ComplexVariantSubtype.CTX_INV)
));

// complex subtypes expected to have POS=END
public static final Set<ComplexVariantSubtype> COMPLEX_POINT_SUBTYPES =
Collections.set(
GATKSVVCFConstants.ComplexVariantSubtype.dDUP,
GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dDUP_iDEL events don't have POS = END because there is a deletion at the sink

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I couldn't actually find where this constant gets used though?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoops. I was considering this but didn't end up using it

);

// not defined in output vcf header but used in internal id that is currently output in the ID column
public static final String INTERVAL_VARIANT_ID_FIELD_SEPARATOR = "_";
public static final String DUP_TAN_CONTRACTION_INTERNAL_ID_START_STRING = "DEL-DUPLICATION-TANDEM-CONTRACTION";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.walkers.sv.SVSegment;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand All @@ -34,7 +35,8 @@ public class SVCallRecord implements SVLocatable {
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
GATKSVVCFConstants.SVTYPE,
GATKSVVCFConstants.CPX_TYPE
GATKSVVCFConstants.CPX_TYPE,
GATKSVVCFConstants.CPX_INTERVALS
);

private final String id;
Expand All @@ -57,6 +59,7 @@ public class SVCallRecord implements SVLocatable {

// CPX related fields
private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;
private final List<ComplexEventInterval> cpxIntervals;

public SVCallRecord(final String id,
final String contigA,
Expand All @@ -67,6 +70,7 @@ public SVCallRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List<ComplexEventInterval> cpxIntervals,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
Expand All @@ -75,7 +79,7 @@ public SVCallRecord(final String id,
final Set<String> filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}

Expand All @@ -88,6 +92,7 @@ protected SVCallRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List<ComplexEventInterval> cpxIntervals,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
Expand All @@ -100,13 +105,15 @@ protected SVCallRecord(final String id,
Utils.nonNull(genotypes);
Utils.nonNull(attributes);
Utils.nonNull(filters);
Utils.nonNull(cpxIntervals);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
this.contigB = contigB;
this.positionB = positionB;
this.type = Utils.nonNull(type);
this.cpxSubtype = cpxSubtype;
this.cpxIntervals = canonicalizeComplexEventList(cpxIntervals);
this.algorithms = Collections.unmodifiableList(algorithms);
this.alleles = Collections.unmodifiableList(alleles);
this.altAlleles = alleles.stream().filter(allele -> !allele.isNoCall() && !allele.isReference()).collect(Collectors.toList());
Expand Down Expand Up @@ -135,11 +142,26 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
// CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
// B references the source sequence.
if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
"End coordinate cannot precede start");
if (IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) > 0) {
throw new IllegalArgumentException("End precedes start in variant " + id);
}
}
for (final ComplexEventInterval interval : cpxIntervals) {
Utils.nonNull(interval);
validatePosition(interval.getContig(), interval.getStart(), dictionary);
validatePosition(interval.getContig(), interval.getEnd(), dictionary);
}
}

/**
* Sorts complex intervals list so that they can be efficiently compared across records.
* @param intervals complex intervals
* @return canonicalized list
*/
private static List<ComplexEventInterval> canonicalizeComplexEventList(final List<ComplexEventInterval> intervals) {
return intervals.stream().sorted(Comparator.comparing(ComplexEventInterval::encode)).collect(Collectors.toList());
}

private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
final SAMSequenceRecord seq = dictionary.getSequence(contig);
Utils.validateArg(seq != null, "Contig " + contig + " not found in dictionary");
Expand All @@ -148,7 +170,7 @@ private static void validatePosition(final String contig, final int position, fi

private static Map<String, Object> validateAttributes(final Map<String, Object> attributes) {
for (final String key : INVALID_ATTRIBUTES) {
Utils.validateArg(!attributes.containsKey(key), "Attempted to create record with invalid key: " + key);
Utils.validateArg(!attributes.containsKey(key), "Attempted to create record with reserved key: " + key);
}
return attributes;
}
Expand Down Expand Up @@ -180,6 +202,7 @@ private static Integer inferLength(final GATKSVVCFConstants.StructuralVariantAnn
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && inputLength != null) {
throw new IllegalArgumentException("Input length should be null for type " + type.name() + " but found " + inputLength);
}
// TODO complex subtypes should be checked and handled properly, but for now we just pass through SVLEN
return inputLength;
}
}
Expand Down Expand Up @@ -384,4 +407,45 @@ public Double getLog10PError() {
return log10PError;
}

public List<ComplexEventInterval> getComplexEventIntervals() {
return cpxIntervals;
}

public static final class ComplexEventInterval extends SVSegment {

public ComplexEventInterval(final GATKSVVCFConstants.StructuralVariantAnnotationType intervalType,
final SimpleInterval interval) {
super(intervalType, interval);
}

public static ComplexEventInterval decode(final String str, final SAMSequenceDictionary dictionary) {
Utils.nonNull(str);
final String[] tokens = str.split("_", 2);
if (tokens.length < 2) {
throw new IllegalArgumentException("Expected complex interval with format \"SVTYPE_chr:pos-end\" but found \"" + str + "\"");
}
final SimpleInterval interval = new SimpleInterval(tokens[1]);
if (!IntervalUtils.intervalIsOnDictionaryContig(interval, dictionary)) {
throw new IllegalArgumentException("Invalid CPX interval: " + interval);
}
return new ComplexEventInterval(GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(tokens[0]), interval);
}

public String encode() {
return getIntervalSVType().name() + "_" + getInterval().toString();
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ComplexEventInterval that = (ComplexEventInterval) o;
return getIntervalSVType() == that.getIntervalSVType() && Objects.equals(getInterval(), that.getInterval());
}

@Override
public int hashCode() {
return Objects.hash(getIntervalSVType(), getInterval());
}
}
}
Loading
Loading