Skip to content

Commit

Permalink
Reduce SVConcordance memory footprint (#8623)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 authored Dec 14, 2023
1 parent 39cfbba commit b68fadc
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 13 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,12 @@ private void validateCoordinates(final SAMSequenceDictionary dictionary) {
Utils.nonNull(dictionary);
validatePosition(contigA, positionA, dictionary);
validatePosition(contigB, positionB, dictionary);
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
"End coordinate cannot precede start");
// CPX types may have position B precede A, such as dispersed duplications where A is the insertion point and
// B references the source sequence.
if (type != GATKSVVCFConstants.StructuralVariantAnnotationType.CPX) {
Utils.validateArg(IntervalUtils.compareLocatables(getPositionAInterval(), getPositionBInterval(), dictionary) <= 0,
"End coordinate cannot precede start");
}
}

private static void validatePosition(final String contig, final int position, final SAMSequenceDictionary dictionary) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
*/
public class ClosestSVFinder {

protected final boolean sortOutput;
protected final Map<Long, SVCallRecord> truthIdToItemMap;
protected final Map<Long, ActiveClosestPair> idToClusterMap;
private final SVConcordanceLinkage linkage;
Expand All @@ -49,7 +50,9 @@ public class ClosestSVFinder {
*/
public ClosestSVFinder(final SVConcordanceLinkage linkage,
final Function<ClosestPair, SVCallRecord> collapser,
final boolean sortOutput,
final SAMSequenceDictionary dictionary) {
this.sortOutput = sortOutput;
this.linkage = Utils.nonNull(linkage);
this.collapser = Utils.nonNull(collapser);
outputBuffer = new PriorityQueue<>(SVCallRecordUtils.getCallComparator(dictionary));
Expand All @@ -60,6 +63,15 @@ public ClosestSVFinder(final SVConcordanceLinkage linkage,
lastItemContig = null;
}

/**
* Sorts output by default
*/
public ClosestSVFinder(final SVConcordanceLinkage linkage,
final Function<ClosestPair, SVCallRecord> collapser,
final SAMSequenceDictionary dictionary) {
this(linkage, collapser, true, dictionary);
}

/**
* Should be run frequently to reduce memory usage. Forced flushes must be run when a new contig is encountered.
* @param force flushes all variants in the output buffer regardless of position
Expand All @@ -70,8 +82,12 @@ public List<SVCallRecord> flush(final boolean force) {
.map(c -> new ClosestPair(c.getItem(), c.getClosest()))
.map(collapser)
.collect(Collectors.toList());
outputBuffer.addAll(collapsedRecords);
return flushBuffer(force);
if (sortOutput) {
outputBuffer.addAll(collapsedRecords);
return flushBuffer(force);
} else {
return collapsedRecords;
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import com.google.common.collect.Sets;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.Genotype;
import htsjdk.variant.variantcontext.GenotypeBuilder;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
Expand Down Expand Up @@ -30,8 +32,8 @@
import org.broadinstitute.hellbender.utils.SequenceDictionaryUtils;
import picard.vcf.GenotypeConcordance;

import java.util.HashSet;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;

/**
* <p>This tool calculates SV genotype concordance between an "evaluation" VCF and a "truth" VCF. For each evaluation
Expand All @@ -50,6 +52,8 @@
* of the specific fields. For multi-allelic CNVs, only a copy state concordance metric is
* annotated. Allele frequencies will be recalculated automatically if unavailable in the provided VCFs.
*
* For large inputs, users may enable the --do-not-sort flag to reduce memory usage.
*
* <h3>Inputs</h3>
*
* <ul>
Expand Down Expand Up @@ -90,7 +94,7 @@
@DocumentedFeature
public final class SVConcordance extends AbstractConcordanceWalker {

public static final String USE_TRUTH_AF_LONG_NAME = "use-truth-af";
public static final String UNSORTED_OUTPUT_LONG_NAME = "do-not-sort";

@Argument(
doc = "Output VCF",
Expand All @@ -99,6 +103,12 @@ public final class SVConcordance extends AbstractConcordanceWalker {
)
private GATKPath outputFile;

@Argument(
doc = "Skip output sorting. This can substantially reduce memory usage.",
fullName = UNSORTED_OUTPUT_LONG_NAME
)
private boolean doNotSort;

@ArgumentCollection
private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection();

Expand Down Expand Up @@ -136,8 +146,11 @@ public void onTraversalStart() {
new HashSet<>(getEvalHeader().getGenotypeSamples()),
new HashSet<>(getTruthHeader().getGenotypeSamples()));
final SVConcordanceAnnotator collapser = new SVConcordanceAnnotator(commonSamples);
engine = new ClosestSVFinder(linkage, collapser::annotate, dictionary);
engine = new ClosestSVFinder(linkage, collapser::annotate, !doNotSort, dictionary);

if (doNotSort) {
createOutputVariantIndex = false;
}
writer = createVCFWriter(outputFile);
writer.writeHeader(createHeader(getEvalHeader()));
}
Expand Down Expand Up @@ -172,10 +185,32 @@ private void add(final VariantContext variant, final boolean isTruth) {
flushClusters(true);
currentContig = record.getContigA();
}
if (isTruth) {
record = minimizeTruthFootprint(record);
}
engine.add(record, isTruth);
flushClusters(false);
}

/**
* Strips unneeded attributes from a truth variant to save memory.
*/
protected SVCallRecord minimizeTruthFootprint(final SVCallRecord item) {
final List<Genotype> genotypes = item.getGenotypes().stream().map(SVConcordance::stripTruthGenotype).collect(Collectors.toList());
return new SVCallRecord(item.getId(), item.getContigA(), item.getPositionA(),
item.getStrandA(), item.getContigB(), item.getPositionB(), item.getStrandB(), item.getType(),
item.getCpxSubtype(), item.getLength(), item.getAlgorithms(), item.getAlleles(), genotypes,
item.getAttributes(), item.getFilters(), item.getLog10PError(), dictionary);
}

private static Genotype stripTruthGenotype(final Genotype genotype) {
final GenotypeBuilder builder = new GenotypeBuilder(genotype.getSampleName()).alleles(genotype.getAlleles());
if (genotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
builder.attribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT, genotype.getExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT));
}
return builder.make();
}

@Override
protected boolean areVariantsAtSameLocusConcordant(final VariantContext truth, final VariantContext eval) {
return true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public class SVConcordanceIntegrationTest extends CommandLineProgramTest {

@Test
public void testRefPanel() {
final File output = createTempFile("concord", ".vcf");
final File output = createTempFile("concord", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";
/**
* Test file produced from raw standardized VCFs from manta, melt, wham, and cnv callers with SVCluster parameters:
Expand Down Expand Up @@ -171,7 +171,7 @@ public void testSelf() {

// Run a vcf against itself and check that every variant matched itself

final File output = createTempFile("concord", ".vcf");
final File output = createTempFile("concord", ".vcf.gz");
final String vcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";

final ArgumentsBuilder args = new ArgumentsBuilder()
Expand Down Expand Up @@ -260,7 +260,7 @@ public void testSelfEvalSubset() {

// Run a vcf against itself but with extra samples and variants

final File output = createTempFile("concord", ".vcf");
final File output = createTempFile("concord", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.sample_subset.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";

Expand Down Expand Up @@ -322,7 +322,7 @@ private void assertPerfectConcordance(final File output, final String evalVcfPat

@Test
public void testSitesOnly() {
final File output = createTempFile("concord_sites_only", ".vcf");
final File output = createTempFile("concord_sites_only", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.sites_only.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.raw_calls.chr22_chrY.sites_only.vcf.gz";

Expand Down Expand Up @@ -358,7 +358,7 @@ public void testSitesOnly() {

@Test(expectedExceptions = UserException.IncompatibleSequenceDictionaries.class)
public void testHeaderContigsOutOfOrder() {
final File output = createTempFile("concord_sites_only", ".vcf");
final File output = createTempFile("concord_sites_only", ".vcf.gz");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.contigs_out_of_order.vcf.gz";

Expand All @@ -370,4 +370,24 @@ public void testHeaderContigsOutOfOrder() {

runCommandLine(args, SVConcordance.class.getSimpleName());
}

@Test
public void testSelfUnsorted() {

// Run a vcf against itself but don't sort the output

final File output = createTempFile("concord", ".vcf");
final String evalVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";
final String truthVcfPath = getToolTestDataDir() + "ref_panel_1kg.cleaned.gatk.chr22_chrY.vcf.gz";

final ArgumentsBuilder args = new ArgumentsBuilder()
.addOutput(output)
.add(StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME, GATKBaseTest.FULL_HG38_DICT)
.add(AbstractConcordanceWalker.EVAL_VARIANTS_LONG_NAME, evalVcfPath)
.add(AbstractConcordanceWalker.TRUTH_VARIANTS_LONG_NAME, truthVcfPath)
.add(SVConcordance.UNSORTED_OUTPUT_LONG_NAME, true);

runCommandLine(args, SVConcordance.class.getSimpleName());
assertPerfectConcordance(output, evalVcfPath);
}
}

0 comments on commit b68fadc

Please sign in to comment.