Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate SVCallRecord coordinates #7714

Merged
merged 2 commits into from
Mar 24, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
package org.broadinstitute.hellbender.tools.sv;

import com.google.common.collect.Lists;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.util.CoordMath;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
Expand All @@ -9,6 +11,7 @@
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.lang3.tuple.Pair;
import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants;
import org.broadinstitute.hellbender.utils.IntervalUtils;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.variant.VariantContextGetters;
Expand Down Expand Up @@ -62,10 +65,26 @@ public SVCallRecord(final String id,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String,Object> attributes) {
final Map<String,Object> attributes,
final SAMSequenceDictionary dictionary) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, length, algorithms, alleles, genotypes, attributes);
validateCoordinates(dictionary);
}

protected SVCallRecord(final String id,
final String contigA,
final int positionA,
final Boolean strandA,
final String contigB,
final int positionB,
final Boolean strandB,
final StructuralVariantType type,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes,
final Map<String, Object> attributes) {
Utils.nonNull(id);
Utils.nonNull(contigA);
Utils.nonNull(contigB);
Utils.nonNull(type);
Utils.nonNull(algorithms);
Utils.nonNull(alleles);
Expand All @@ -91,19 +110,40 @@ public SVCallRecord(final String id,
this.strandB = strands.getRight();
}

public SVCallRecord(final String id,
final String contigA,
final int positionA,
/**
* Convenience constructor without extra attributes
*/
public SVCallRecord(final SVCallRecord baseRecord,
final String id,
final Boolean strandA,
final String contigB,
final int positionB,
final Boolean strandB,
final StructuralVariantType type,
final Integer length,
final List<String> algorithms,
final List<Allele> alleles,
final List<Genotype> genotypes) {
this(id, contigA, positionA, strandA, contigB, positionB, strandB, type, length, algorithms, alleles, genotypes, Collections.emptyMap());
final List<Genotype> genotypes,
final Map<String,Object> attributes) {
this(id, baseRecord.getContigA(), baseRecord.getPositionA(), strandA, baseRecord.getContigB(),
baseRecord.getPositionB(), strandB, type, length, algorithms, alleles, genotypes, attributes);
}

/**
* Ensures start/end loci are valid and ordered
*/
private void validateCoordinates(final SAMSequenceDictionary dictionary) {
Utils.nonNull(contigA);
Utils.nonNull(contigB);
Utils.nonNull(dictionary);
validatePosition(contigA, positionA, dictionary);
validatePosition(contigB, positionB, dictionary);
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I forget -- do we do start == end for insertions in SVCallRecords? Just checking whether this needs to be <= or not. If we do, you might want to add a quick unit test condition to your new test in SVCallRecordUnitTest where start == end with a comment.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes start==end should be correct. I've added a test with some valid coordinates to make sure they pass.

"End coordinate cannot precede start");
}

private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
final SAMSequenceRecord seq = dictionary.getSequence(contig);
Utils.validateArg(seq != null, "Contig " + contig + " not found in dictionary");
Utils.validateArg(position > 0 && position <= seq.getSequenceLength(), "Invalid position " + contig + ":" + position);
}

private static Map<String, Object> validateAttributes(final Map<String, Object> attributes) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,17 +238,17 @@ public static int compareCalls(final SVCallRecord first, final SVCallRecord seco
* @param record inversion record
* @return stream of BND records pair, or the original record if not an INV
*/
public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallRecord record) {
public static Stream<SVCallRecord> convertInversionsToBreakends(final SVCallRecord record, final SAMSequenceDictionary dictionary) {
if (!record.getType().equals(StructuralVariantType.INV)) {
return Stream.of(record);
}
Utils.validateArg(record.isIntrachromosomal(), "Inversion " + record.getId() + " is not intrachromosomal");
final SVCallRecord positiveBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), true, record.getContigB(), record.getPositionB(), true, StructuralVariantType.BND, null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes());
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
final SVCallRecord negativeBreakend = new SVCallRecord(record.getId(), record.getContigA(),
record.getPositionA(), false, record.getContigB(), record.getPositionB(), false, StructuralVariantType.BND, null,
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes());
record.getAlgorithms(), record.getAlleles(), record.getGenotypes(), record.getAttributes(), dictionary);
return Stream.of(positiveBreakend, negativeBreakend);
}

Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,13 @@ public boolean areClusterable(final SVCallRecord a, final SVCallRecord b) {
if (a.getType() != b.getType()) return false;

// Interval overlap
if (!getPaddedRecordInterval(a.getContigA(), a.getPositionA(), a.getPositionB())
.overlaps(getPaddedRecordInterval(b.getContigA(), b.getPositionA(), b.getPositionB()))) return false;
final SimpleInterval intervalA = getPaddedRecordInterval(a.getContigA(), a.getPositionA(), a.getPositionB());
Utils.nonNull(intervalA, "Invalid interval " + new SimpleInterval(a.getContigA(), a.getPositionA(),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the getPaddedRecordInterval method returns null in the odd cases where the padded start is greater than the contig end or the padded stop is less than 1. Am I missing other cases? If not, aren't we already validating the coordinates when we construct the SVCallRecord object? I guess this assertion is there to be thorough -- maybe a short comment here explaining what the possible error condition is would be helpful to code readers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes you're right I added checks for valid coordinates so this wouldn't happen, and this is here simply for safety. I've added a comment about it.

a.getPositionB()) + " for record " + a.getId());
final SimpleInterval intervalB = getPaddedRecordInterval(b.getContigA(), b.getPositionA(), b.getPositionB());
Utils.nonNull(intervalB, "Invalid interval " + new SimpleInterval(b.getContigA(), b.getPositionA(),
b.getPositionB()) + " for record " + b.getId());
if (!intervalA.overlaps(intervalB)) return false;

// Sample overlap
if (!hasSampleOverlap(a, b, minSampleOverlap)) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.variant.variantcontext.*;
import org.apache.commons.lang3.tuple.Pair;
Expand Down Expand Up @@ -93,6 +94,7 @@ public enum InsertionLengthSummaryStrategy {
private final BreakpointSummaryStrategy breakpointSummaryStrategy;
private final InsertionLengthSummaryStrategy insertionLengthSummaryStrategy;
private final ReferenceSequenceFile reference;
private final SAMSequenceDictionary dictionary;

private static final AlleleCollectionCollapserComparator ALLELE_COMPARATOR = new AlleleCollectionCollapserComparator();

Expand All @@ -101,6 +103,7 @@ public CanonicalSVCollapser(final ReferenceSequenceFile reference,
final BreakpointSummaryStrategy breakpointSummaryStrategy,
final InsertionLengthSummaryStrategy insertionLengthSummaryStrategy) {
this.reference = Utils.nonNull(reference);
this.dictionary = reference.getSequenceDictionary();
this.altAlleleSummaryStrategy = altAlleleSummaryStrategy;
this.breakpointSummaryStrategy = breakpointSummaryStrategy;
this.insertionLengthSummaryStrategy = insertionLengthSummaryStrategy;
Expand Down Expand Up @@ -135,7 +138,7 @@ public SVCallRecord collapse(final Collection<SVCallRecord> items) {
final List<Genotype> harmonizedGenotypes = harmonizeAltAlleles(altAlleles, genotypes);

return new SVCallRecord(id, exampleCall.getContigA(), start, exampleCall.getStrandA(), exampleCall.getContigB(),
end, exampleCall.getStrandB(), type, length, algorithms, alleles, harmonizedGenotypes, attributes);
end, exampleCall.getStrandB(), type, length, algorithms, alleles, harmonizedGenotypes, attributes, dictionary);
}

/**
Expand Down Expand Up @@ -807,9 +810,12 @@ protected Pair<Integer,Integer> collapseInterval(final Collection<SVCallRecord>
if (exampleCall.getType().equals(StructuralVariantType.INS)) {
// Insertions are a single locus
return Pair.of(newStart, newStart);
} else {
} else if (exampleCall.getContigA().equals(exampleCall.getContigB())) {
// Do not let end precede start
return Pair.of(newStart, Math.max(newEnd, newStart));
return Pair.of(newStart, Math.max(newStart, newEnd));
} else {
// Different contigs, so no constraint on position order
return Pair.of(newStart, newEnd);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,8 @@ private static boolean clusterTogetherWithParams(final SVCallRecord a, final SVC
// Breakend proximity
final SimpleInterval intervalA1 = a.getPositionAInterval().expandWithinContig(params.getWindow(), dictionary);
final SimpleInterval intervalA2 = a.getPositionBInterval().expandWithinContig(params.getWindow(), dictionary);
Utils.nonNull(intervalA1, "Invalid start position " + a.getPositionA() + " in record " + a.getId());
Utils.nonNull(intervalA2, "Invalid end position " + a.getPositionB() + " in record " + a.getId());
final SimpleInterval intervalB1 = b.getPositionAInterval();
final SimpleInterval intervalB2 = b.getPositionBInterval();
if (!(intervalA1.overlaps(intervalB1) && intervalA2.overlaps(intervalB2))) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ public void apply(final VariantContext variant, final ReadsContext readsContext,

// Add to clustering buffer
if (convertInversions) {
SVCallRecordUtils.convertInversionsToBreakends(filteredCall).forEachOrdered(clusterEngine::add);
SVCallRecordUtils.convertInversionsToBreakends(filteredCall, dictionary).forEachOrdered(clusterEngine::add);
} else {
clusterEngine.add(filteredCall);
}
Expand Down Expand Up @@ -461,7 +461,7 @@ public VariantContext buildVariantContext(final SVCallRecord call) {
// Build new variant
final SVCallRecord finalCall = new SVCallRecord(newId, call.getContigA(), call.getPositionA(), call.getStrandA(),
call.getContigB(), call.getPositionB(), call.getStrandB(), call.getType(), call.getLength(),
call.getAlgorithms(), call.getAlleles(), filledGenotypes, call.getAttributes());
call.getAlgorithms(), call.getAlleles(), filledGenotypes, call.getAttributes(), dictionary);
final VariantContextBuilder builder = SVCallRecordUtils.getVariantBuilder(finalCall);
if (omitMembers) {
builder.rmAttribute(GATKSVVCFConstants.CLUSTER_MEMBER_IDS_KEY);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,24 @@ public Object[][] testIsCNVData() {
public void testIsCNV(final SVCallRecord record, final boolean expected) {
Assert.assertEquals(record.isSimpleCNV(), expected);
}

@DataProvider(name = "testCreateInvalidCoordinatesData")
public Object[][] testCreateInvalidCoordinatesData() {
return new Object[][]{
{"chr1", 0, "chr1", 248956422},
{"chr1", 1, "chr1", 248956423},
{"chr1", 1, "chr1", 248956423},
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I mentioned above maybe add a case where start == end? (I guess you'd have to change this test to handle positive examples as well..)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

{"chr1", 1, "chr2", 242193530},
{"chr1", 2, "chr1", 1},
{"chr2", 1, "chr1", 2}
};
}

@Test(dataProvider="testCreateInvalidCoordinatesData", expectedExceptions = { IllegalArgumentException.class })
public void testCreateInvalidCoordinates(final String contigA, final int posA, final String contigB, final int posB) {
new SVCallRecord("var1", contigA, posA, true, contigB, posB, false, StructuralVariantType.BND,
null, SVTestUtils.PESR_ONLY_ALGORITHM_LIST, Collections.emptyList(), Collections.emptyList(),
Collections.emptyMap(), SVTestUtils.hg38Dict);
Assert.fail("Expected exception not thrown");
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -348,7 +348,7 @@ public void testGetCallComparator(final SVCallRecord record1, final SVCallRecord
@Test
public void testConvertInversionsToBreakends() {
final SVCallRecord nonInversion = SVTestUtils.newCallRecordWithIntervalAndType(1000, 1999, StructuralVariantType.DEL);
final List<SVCallRecord> nonInversionResult = SVCallRecordUtils.convertInversionsToBreakends(nonInversion).collect(Collectors.toList());
final List<SVCallRecord> nonInversionResult = SVCallRecordUtils.convertInversionsToBreakends(nonInversion, SVTestUtils.hg38Dict).collect(Collectors.toList());
Assert.assertEquals(nonInversionResult.size(), 1);
Assert.assertNotNull(nonInversionResult.get(0));
SVTestUtils.assertEqualsExceptMembership(nonInversionResult.get(0), nonInversion);
Expand All @@ -358,7 +358,7 @@ public void testConvertInversionsToBreakends() {
Collections.emptyList(),
Collections.emptyList(),
Collections.emptyMap());
final List<SVCallRecord> inversionResult = SVCallRecordUtils.convertInversionsToBreakends(inversion).collect(Collectors.toList());
final List<SVCallRecord> inversionResult = SVCallRecordUtils.convertInversionsToBreakends(inversion, SVTestUtils.hg38Dict).collect(Collectors.toList());
Assert.assertEquals(inversionResult.size(), 2);

final SVCallRecord bnd1 = inversionResult.get(0);
Expand Down
Loading