Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Complex SV intervals support #8521

Merged
merged 3 commits into from
Jul 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.tools.walkers.sv.SVSegment;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand All @@ -34,7 +35,8 @@ public class SVCallRecord implements SVLocatable {
GATKSVVCFConstants.END2_ATTRIBUTE,
GATKSVVCFConstants.STRANDS_ATTRIBUTE,
GATKSVVCFConstants.SVTYPE,
GATKSVVCFConstants.CPX_TYPE
GATKSVVCFConstants.CPX_TYPE,
GATKSVVCFConstants.CPX_INTERVALS
);

private final String id;
Expand All @@ -57,6 +59,7 @@ public class SVCallRecord implements SVLocatable {

// CPX related fields
private final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype;
private final List<ComplexEventInterval> cpxIntervals;

public SVCallRecord(final String id,
final String contigA,
Expand All @@ -67,6 +70,7 @@ public SVCallRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List<ComplexEventInterval> cpxIntervals,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
Expand All @@ -75,7 +79,7 @@ public SVCallRecord(final String id,
final Set<String> filters,
final Double log10PError,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, cpxSubtype, cpxIntervals, length, algorithms, alleles, genotypes, attributes, filters, log10PError);
validateCoordinates(dictionary);
}

Expand All @@ -88,6 +92,7 @@ protected SVCallRecord(final String id,
final Boolean strandB,
final GATKSVVCFConstants.StructuralVariantAnnotationType type,
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype,
final List<ComplexEventInterval> cpxIntervals,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
Expand All @@ -100,13 +105,15 @@ protected SVCallRecord(final String id,
Utils.nonNull(genotypes);
Utils.nonNull(attributes);
Utils.nonNull(filters);
Utils.nonNull(cpxIntervals);
this.id = Utils.nonNull(id);
this.contigA = contigA;
this.positionA = positionA;
this.contigB = contigB;
this.positionB = positionB;
this.type = Utils.nonNull(type);
this.cpxSubtype = cpxSubtype;
this.cpxIntervals = canonicalizeComplexEventList(cpxIntervals);
this.algorithms = Collections.unmodifiableList(algorithms);
this.alleles = Collections.unmodifiableList(alleles);
this.altAlleles = alleles.stream().filter(allele -> !allele.isNoCall() && !allele.isReference()).collect(Collectors.toList());
Expand Down Expand Up @@ -135,11 +142,26 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
// CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
// B references the source sequence.
if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
"End coordinate cannot precede start");
if (IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) > 0) {
throw new IllegalArgumentException("End precedes start in variant " + id);
}
}
for (final ComplexEventInterval interval : cpxIntervals) {
Utils.nonNull(interval);
validatePosition(interval.getContig(), interval.getStart(), dictionary);
validatePosition(interval.getContig(), interval.getEnd(), dictionary);
}
}

/**
* Sorts complex intervals list so that they can be efficiently compared across records.
* @param intervals complex intervals
* @return canonicalized list
*/
private static List<ComplexEventInterval> canonicalizeComplexEventList(final List<ComplexEventInterval> intervals) {
return intervals.stream().sorted(Comparator.comparing(ComplexEventInterval::encode)).collect(Collectors.toList());
}

private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
final SAMSequenceRecord seq = dictionary.getSequence(contig);
Utils.validateArg(seq != null, "Contig " + contig + " not found in dictionary");
Expand All @@ -148,7 +170,7 @@ private static void validatePosition(final String contig, final int position, fi

private static Map<String, Object> validateAttributes(final Map<String, Object> attributes) {
for (final String key : INVALID_ATTRIBUTES) {
Utils.validateArg(!attributes.containsKey(key), "Attempted to create record with invalid key: " + key);
Utils.validateArg(!attributes.containsKey(key), "Attempted to create record with reserved key: " + key);
}
return attributes;
}
Expand Down Expand Up @@ -180,6 +202,7 @@ private static Integer inferLength(final GATKSVVCFConstants.StructuralVariantAnn
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) && inputLength != null) {
throw new IllegalArgumentException("Input length should be null for type " + type.name() + " but found " + inputLength);
}
// TODO complex subtypes should be checked and handled properly, but for now we just pass through SVLEN
return inputLength;
}
}
Expand Down Expand Up @@ -384,4 +407,45 @@ public Double getLog10PError() {
return log10PError;
}

public List<ComplexEventInterval> getComplexEventIntervals() {
return cpxIntervals;
}

public static final class ComplexEventInterval extends SVSegment {

public ComplexEventInterval(final GATKSVVCFConstants.StructuralVariantAnnotationType intervalType,
final SimpleInterval interval) {
super(intervalType, interval);
}

public static ComplexEventInterval decode(final String str, final SAMSequenceDictionary dictionary) {
Utils.nonNull(str);
final String[] tokens = str.split("_", 2);
if (tokens.length < 2) {
throw new IllegalArgumentException("Expected complex interval with format \"SVTYPE_chr:pos-end\" but found \"" + str + "\"");
}
final SimpleInterval interval = new SimpleInterval(tokens[1]);
if (!IntervalUtils.intervalIsOnDictionaryContig(interval, dictionary)) {
throw new IllegalArgumentException("Invalid CPX interval: " + interval);
}
return new ComplexEventInterval(GATKSVVCFConstants.StructuralVariantAnnotationType.valueOf(tokens[0]), interval);
}

public String encode() {
return getIntervalSVType().name() + "_" + getInterval().toString();
}

@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
ComplexEventInterval that = (ComplexEventInterval) o;
return getIntervalSVType() == that.getIntervalSVType() && Objects.equals(getInterval(), that.getInterval());
}

@Override
public int hashCode() {
return Objects.hash(getIntervalSVType(), getInterval());
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ public final class SVCallRecordUtils {

private static final Set<String> VALID_TYPES = new HashSet<>(Arrays.asList(GATKSVVCFConstants.StructuralVariantAnnotationType.values()).stream()
.map(GATKSVVCFConstants.StructuralVariantAnnotationType::name).collect(Collectors.toList()));
private static final Set<String> VALID_CPX_SUBTYPES = new HashSet<>(Arrays.asList(GATKSVVCFConstants.ComplexVariantSubtype.values()).stream()
.map(GATKSVVCFConstants.ComplexVariantSubtype::name).collect(Collectors.toList()));
private static final Set<String> VALID_CPX_SUBTYPES = GATKSVVCFConstants.COMPLEX_VARIANT_SUBTYPE_MAP.keySet();

/**
* Create a builder for a variant from an {@link SVCallRecord} for VCF interoperability
Expand All @@ -34,34 +33,18 @@ public final class SVCallRecordUtils {
*/
public static VariantContextBuilder getVariantBuilder(final SVCallRecord record) {
Utils.nonNull(record);
final int end;
final GATKSVVCFConstants.StructuralVariantAnnotationType type = record.getType();
final GATKSVVCFConstants.ComplexVariantSubtype cpxType = record.getComplexSubtype();
final boolean isDispersedDup = cpxType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP
|| cpxType == GATKSVVCFConstants.ComplexVariantSubtype.dDUP_iDEL;
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX
|| isDispersedDup) {
end = record.getPositionA();
} else {
end = record.getPositionB();
}
final int end;
final Integer end2;
final String chr2;
if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
// TODO this may need to be modified in the future to handle complex translocations
end = record.getPositionA();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note for future: To handle complex translocations, there need to be breakpoints at CHROM:POS, CHROM:END, CHR2:POS2 (which we currently don't have), and CHR2:END2.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Made a TODO note. Would it also be possible to use the CPX intervals?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just looked back at my conversation with Xuefang about this - for annotation we discussed that we should check CPX_INTERVALS for complex CTX, but in her CTX_INV example only the INV was in CPX_INTERVALS. For the CTX we used the other positions. If the INV is on the first chromosome, then the CTX breakpoints on that chromosome are defined by CHROM:POS and CHROM:END. But if the INV is on the second chromosome, then the CTX breakpoints are not fully defined by CHR2:END2 - there would need to be a CHR2:POS2.

end2 = record.getPositionB();
chr2 = record.getContigB();
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
if (isDispersedDup) {
end2 = record.getPositionB();
chr2 = record.getContigB();
} else {
end2 = null;
chr2 = null;
}
} else {
end = record.getPositionB();
end2 = null;
chr2 = null;
}
Expand Down Expand Up @@ -90,14 +73,21 @@ public static VariantContextBuilder getVariantBuilder(final SVCallRecord record)
builder.attribute(GATKSVVCFConstants.END2_ATTRIBUTE, end2);
builder.attribute(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, chr2);
}
final GATKSVVCFConstants.ComplexVariantSubtype cpxType = record.getComplexSubtype();
if (cpxType != null) {
builder.attribute(GATKSVVCFConstants.CPX_TYPE, getComplexSubtypeString(cpxType));
}
final List<SVCallRecord.ComplexEventInterval> cpxIntervals = record.getComplexEventIntervals();
if (!cpxIntervals.isEmpty()) {
builder.attribute(GATKSVVCFConstants.CPX_INTERVALS, cpxIntervals.stream().map(SVCallRecord.ComplexEventInterval::encode).collect(Collectors.toList()));
}

builder.attribute(GATKSVVCFConstants.SVLEN, record.getLength());
if ((svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.BND
|| svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.INV
|| svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.INS)
|| svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.INS
|| svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX
|| svtype == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX)
&& record.getStrandA() != null && record.getStrandB() != null) {
builder.attribute(GATKSVVCFConstants.STRANDS_ATTRIBUTE, getStrandString(record));
}
Expand Down Expand Up @@ -183,12 +173,12 @@ public static GenotypesContext populateGenotypesForMissingSamplesWithAlleles(fin
*/
public static SVCallRecord copyCallWithNewGenotypes(final SVCallRecord record, final GenotypesContext genotypes) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
genotypes, record.getAttributes(), record.getFilters(), record.getLog10PError());
}
public static SVCallRecord copyCallWithNewAttributes(final SVCallRecord record, final Map<String, Object> attr) {
return new SVCallRecord(record.getId(), record.getContigA(), record.getPositionA(), record.getStrandA(), record.getContigB(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getPositionB(), record.getStrandB(), record.getType(), record.getComplexSubtype(), record.getComplexEventIntervals(), record.getLength(), record.getAlgorithms(), record.getAlleles(),
record.getGenotypes(), attr, record.getFilters(), record.getLog10PError());
}

Expand Down Expand Up @@ -300,20 +290,19 @@ public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallReco
}
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,null,
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, GATKSVVCFConstants.StructuralVariantAnnotationType.BND, null,record.getComplexEventIntervals(), null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), record.getFilters(), record.getLog10PError(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}

/**
* Creates a new {@link SVCallRecord} from the given {@link VariantContext}, keeping any variant fields.
* @see SVCallRecordUtils#create(VariantContext, boolean)
*/
public static SVCallRecord create(final VariantContext variant) {
return create(variant, true);
public static SVCallRecord create(final VariantContext variant, final SAMSequenceDictionary dictionary) {
return create(variant, true, dictionary);
}

/**
Expand All @@ -322,14 +311,15 @@ public static SVCallRecord create(final VariantContext variant) {
* @param keepVariantAttributes retain variant attribute fields
* @return converted record
*/
public static SVCallRecord create(final VariantContext variant, boolean keepVariantAttributes) {
public static SVCallRecord create(final VariantContext variant, boolean keepVariantAttributes, final SAMSequenceDictionary dictionary) {
Utils.nonNull(variant);
final String id = variant.getID();
final String contigA = variant.getContig();
final int positionA = variant.getStart();

final GATKSVVCFConstants.StructuralVariantAnnotationType type = inferStructuralVariantType(variant);
final GATKSVVCFConstants.ComplexVariantSubtype cpxSubtype = getComplexSubtype(variant);
final List<SVCallRecord.ComplexEventInterval> cpxIntervals = parseComplexIntervals(variant.getAttributeAsStringList(GATKSVVCFConstants.CPX_INTERVALS, null), dictionary);
final List<String> algorithms = getAlgorithms(variant);

final String strands;
Expand Down Expand Up @@ -367,21 +357,11 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
|| type == GATKSVVCFConstants.StructuralVariantAnnotationType.CTX) {
if (!(hasContig2 && hasEnd2)) {
throw new UserException.BadInput("Attributes " + GATKSVVCFConstants.END2_ATTRIBUTE +
" and " + GATKSVVCFConstants.CONTIG2_ATTRIBUTE + " are required for BND records (variant " +
variant.getID() + ").");
" and " + GATKSVVCFConstants.CONTIG2_ATTRIBUTE + " are required for BND and CTX records " +
"(variant " + variant.getID() + ").");
}
contigB = variant.getAttributeAsString(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, null);
positionB = variant.getAttributeAsInt(GATKSVVCFConstants.END2_ATTRIBUTE, 0);
} else if (type == GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
// If CHR2/END2 are defined, use them
if (hasContig2 && hasEnd2) {
contigB = variant.getAttributeAsString(GATKSVVCFConstants.CONTIG2_ATTRIBUTE, null);
positionB = variant.getAttributeAsInt(GATKSVVCFConstants.END2_ATTRIBUTE, 0);
} else {
// Otherwise treat like any other variant
contigB = contigA;
positionB = variant.getEnd();
}
} else {
contigB = contigA;
// Force reset of END coordinate
Expand All @@ -394,8 +374,13 @@ public static SVCallRecord create(final VariantContext variant, boolean keepVari
final Double log10PError = variant.hasLog10PError() ? variant.getLog10PError() : null;

final Map<String, Object> sanitizedAttributes = sanitizeAttributes(attributes);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype, length, algorithms,
variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes, variant.getFilters(), log10PError);
return new SVCallRecord(id, contigA, positionA, strand1, contigB, positionB, strand2, type, cpxSubtype,
cpxIntervals, length, algorithms, variant.getAlleles(), variant.getGenotypes(), sanitizedAttributes,
variant.getFilters(), log10PError);
}

private static List<SVCallRecord.ComplexEventInterval> parseComplexIntervals(final List<String> intervals, final SAMSequenceDictionary dictionary) {
return intervals.stream().map(i -> SVCallRecord.ComplexEventInterval.decode(i, dictionary)).toList();
}

private static Map<String, Object> sanitizeAttributes(final Map<String, Object> attributes) {
Expand Down
Loading
Loading