Skip to content

Commit

Permalink
Use sample set intersection for SVConcordance (#8211)
Browse files Browse the repository at this point in the history
  • Loading branch information
mwalker174 authored Mar 21, 2023
1 parent 2bda7f9 commit e68f066
Show file tree
Hide file tree
Showing 12 changed files with 365 additions and 112 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,21 @@ public class SVConcordanceAnnotator {
protected final Logger logger = LogManager.getLogger(this.getClass());

private final GenotypeConcordanceScheme scheme;
final boolean useTruthAf;
private final Set<String> samples;

/**
* @param useTruthAf annotate truth allele counts using original truth record attributes (AF/AC/AN)
* Default constructor where all eval record samples will be used for concordance.
*/
public SVConcordanceAnnotator(final boolean useTruthAf) {
this.useTruthAf = useTruthAf;
public SVConcordanceAnnotator() {
this(null);
}

/**
* Annotator restricted to specific samples.
* @param samples samples to include for concordance computations. If null, all samples in eval records will be used.
*/
public SVConcordanceAnnotator(final Set<String> samples) {
this.samples = samples;
this.scheme = new SVGenotypeConcordanceScheme();
}

Expand All @@ -48,23 +56,26 @@ public SVCallRecord annotate(final ClosestSVFinder.ClosestPair pair) {
final ArrayList<Genotype> newGenotypes = new ArrayList<>(evalGenotypes.size());
final GenotypeConcordanceCounts counts = new GenotypeConcordanceCounts();
final boolean isCnv = evalRecord.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV;
Integer numCnvMatches = 0;
int numCnvMatches = 0;
int numValidCnvComparisons = 0;
for (final String sample : evalGenotypes.getSampleNames()) {
GenotypeBuilder builder = new GenotypeBuilder(evalGenotypes.get(sample));
if (isCnv) {
final Boolean result = cnvGenotypesMatch(sample, evalRecord, truthRecord);
builder = builder.attribute(GATKSVVCFConstants.TRUTH_CN_EQUAL_FORMAT, result == null ? null : result.booleanValue() ? 1 : 0);
if (result == null) {
// Null this metric if we encounter any null results
numCnvMatches = null;
} else if (numCnvMatches != null && result.booleanValue()) {
numCnvMatches++;
if (samples == null || samples.contains(sample)) {
if (isCnv) {
final Boolean result = copyNumbersMatch(sample, evalRecord, truthRecord);
builder = builder.attribute(GATKSVVCFConstants.TRUTH_CN_EQUAL_FORMAT, result == null ? null : result.booleanValue() ? 1 : 0);
if (result != null) {
numValidCnvComparisons++;
if (result.booleanValue()) {
numCnvMatches++;
}
}
} else {
final GenotypeConcordanceStates.TruthAndCallStates states = getStates(sample, evalRecord, truthRecord);
counts.increment(states);
builder = builder.attribute(GenotypeConcordance.CONTINGENCY_STATE_TAG,
scheme.getContingencyStateString(states.truthState, states.callState));
}
} else {
final GenotypeConcordanceStates.TruthAndCallStates states = getStates(sample, evalRecord, truthRecord);
counts.increment(states);
builder = builder.attribute(GenotypeConcordance.CONTINGENCY_STATE_TAG,
scheme.getContingencyStateString(states.truthState, states.callState));
}
newGenotypes.add(builder.make());
}
Expand All @@ -76,54 +87,66 @@ public SVCallRecord annotate(final ClosestSVFinder.ClosestPair pair) {
attributes.put(Concordance.TRUTH_STATUS_VCF_ATTRIBUTE, variantStatus.getAbbreviation());

if (isCnv) {
final Double cnvConcordance = numCnvMatches == null ? null : newGenotypes.isEmpty() ? Double.NaN : numCnvMatches / (double) newGenotypes.size();
final Double cnvConcordance = numValidCnvComparisons == 0 ? null : numCnvMatches / (double) numValidCnvComparisons;
attributes.put(GATKSVVCFConstants.COPY_NUMBER_CONCORDANCE_INFO, cnvConcordance);
} else if (truthRecord != null) {
final GenotypeConcordanceSummaryMetrics metrics = new GenotypeConcordanceSummaryMetrics(VariantContext.Type.SYMBOLIC, counts, "truth", "eval", true);
attributes.put(GATKSVVCFConstants.GENOTYPE_CONCORDANCE_INFO, metrics.GENOTYPE_CONCORDANCE);
attributes.put(GATKSVVCFConstants.NON_REF_GENOTYPE_CONCORDANCE_INFO, metrics.NON_REF_GENOTYPE_CONCORDANCE);
attributes.put(GATKSVVCFConstants.HET_PPV_INFO, metrics.HET_PPV);
attributes.put(GATKSVVCFConstants.HET_SENSITIVITY_INFO, metrics.HET_SENSITIVITY);
attributes.put(GATKSVVCFConstants.HOMVAR_PPV_INFO, metrics.HOMVAR_PPV);
attributes.put(GATKSVVCFConstants.HOMVAR_SENSITIVITY_INFO, metrics.HOMVAR_SENSITIVITY);
attributes.put(GATKSVVCFConstants.VAR_PPV_INFO, metrics.VAR_PPV);
attributes.put(GATKSVVCFConstants.VAR_SENSITIVITY_INFO, metrics.VAR_SENSITIVITY);
attributes.put(GATKSVVCFConstants.VAR_SPECIFICITY_INFO, metrics.VAR_SPECIFICITY);
attributes.put(GATKSVVCFConstants.GENOTYPE_CONCORDANCE_INFO, Double.isNaN(metrics.GENOTYPE_CONCORDANCE) ? null : metrics.GENOTYPE_CONCORDANCE);
attributes.put(GATKSVVCFConstants.NON_REF_GENOTYPE_CONCORDANCE_INFO, Double.isNaN(metrics.NON_REF_GENOTYPE_CONCORDANCE) ? null : metrics.NON_REF_GENOTYPE_CONCORDANCE);
attributes.put(GATKSVVCFConstants.HET_PPV_INFO, Double.isNaN(metrics.HET_PPV) ? null : metrics.HET_PPV);
attributes.put(GATKSVVCFConstants.HET_SENSITIVITY_INFO, Double.isNaN(metrics.HET_SENSITIVITY) ? null : metrics.HET_SENSITIVITY);
attributes.put(GATKSVVCFConstants.HOMVAR_PPV_INFO, Double.isNaN(metrics.HOMVAR_PPV) ? null : metrics.HOMVAR_PPV);
attributes.put(GATKSVVCFConstants.HOMVAR_SENSITIVITY_INFO, Double.isNaN(metrics.HOMVAR_SENSITIVITY) ? null : metrics.HOMVAR_SENSITIVITY);
attributes.put(GATKSVVCFConstants.VAR_PPV_INFO, Double.isNaN(metrics.VAR_PPV) ? null : metrics.VAR_PPV);
attributes.put(GATKSVVCFConstants.VAR_SENSITIVITY_INFO, Double.isNaN(metrics.VAR_SENSITIVITY) ? null : metrics.VAR_SENSITIVITY);
attributes.put(GATKSVVCFConstants.VAR_SPECIFICITY_INFO, Double.isNaN(metrics.VAR_SPECIFICITY) ? null : metrics.VAR_SPECIFICITY);
}

if (evalRecord.getType() != GATKSVVCFConstants.StructuralVariantAnnotationType.CNV) {
// Compute allele frequency in eval
final SVAlleleCounter counter = new SVAlleleCounter(evalRecord.getAltAlleles(), evalRecord.getGenotypes());
attributes.put(VCFConstants.ALLELE_COUNT_KEY, counter.getCounts());
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, counter.getFrequencies());
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, counter.getNumber());
if (!evalRecord.getAllSamples().isEmpty() && !hasAlleleFrequencyAnnotations(evalRecord)) {
// Compute allele frequency in eval
final SVAlleleCounter counter = new SVAlleleCounter(evalRecord.getAltAlleles(), evalRecord.getGenotypes());
attributes.put(VCFConstants.ALLELE_COUNT_KEY, counter.getCounts());
attributes.put(VCFConstants.ALLELE_FREQUENCY_KEY, counter.getFrequencies());
attributes.put(VCFConstants.ALLELE_NUMBER_KEY, counter.getNumber());
}

// Add in truth AF
if (truthRecord == null) {
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_COUNT_INFO, null);
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_FREQUENCY_INFO, null);
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_NUMBER_INFO, null);
} else if (useTruthAf) {
// Use AF
final Map<String, Object> truthAttr = truthRecord.getAttributes();
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_COUNT_INFO,
truthAttr.get(GATKSVVCFConstants.TRUTH_ALLELE_COUNT_INFO));
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_FREQUENCY_INFO,
truthAttr.get(GATKSVVCFConstants.TRUTH_ALLELE_FREQUENCY_INFO));
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_NUMBER_INFO,
truthAttr.get(GATKSVVCFConstants.TRUTH_ALLELE_NUMBER_INFO));
} else {
// Calculate truth AF
final SVAlleleCounter truthCounter = new SVAlleleCounter(evalRecord.getAltAlleles(), truthRecord.getGenotypes());
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_COUNT_INFO, truthCounter.getCounts());
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_FREQUENCY_INFO, truthCounter.getFrequencies());
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_NUMBER_INFO, truthCounter.getNumber());
if (hasAlleleFrequencyAnnotations(truthRecord)) {
// Use AF
final Map<String, Object> truthAttr = truthRecord.getAttributes();
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_COUNT_INFO,
truthAttr.get(VCFConstants.ALLELE_COUNT_KEY));
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_FREQUENCY_INFO,
truthAttr.get(VCFConstants.ALLELE_FREQUENCY_KEY));
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_NUMBER_INFO,
truthAttr.get(VCFConstants.ALLELE_NUMBER_KEY));
} else {
// Calculate truth AF
final SVAlleleCounter truthCounter = new SVAlleleCounter(evalRecord.getAltAlleles(), truthRecord.getGenotypes());
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_COUNT_INFO, truthCounter.getCounts());
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_FREQUENCY_INFO, truthCounter.getFrequencies());
attributes.put(GATKSVVCFConstants.TRUTH_ALLELE_NUMBER_INFO, truthCounter.getNumber());
}
}
}

return SVCallRecordUtils.copyCallWithNewAttributes(recordWithGenotypes, attributes);
}

private boolean hasAlleleFrequencyAnnotations(final SVCallRecord record) {
Utils.nonNull(record);
final Map<String, Object> attr = record.getAttributes();
return (attr.containsKey(VCFConstants.ALLELE_COUNT_KEY) && attr.get(VCFConstants.ALLELE_COUNT_KEY) != null)
&& (attr.containsKey(VCFConstants.ALLELE_FREQUENCY_KEY) && attr.get(VCFConstants.ALLELE_FREQUENCY_KEY) != null)
&& (attr.containsKey(VCFConstants.ALLELE_NUMBER_KEY) && attr.get(VCFConstants.ALLELE_NUMBER_KEY) != null);
}

/**
* Get truth/call states for the genotypes of the given sample
*/
Expand All @@ -144,21 +167,24 @@ private GenotypeConcordanceStates.TruthAndCallStates getStates(final String samp
/**
* Returns whether the copy state of the given sample's genotype matches. Only use for multi-allelic CNVs.
*/
public Boolean cnvGenotypesMatch(final String sample, final SVCallRecord eval, final SVCallRecord truth) {
protected Boolean copyNumbersMatch(final String sample, final SVCallRecord eval, final SVCallRecord truth) {
Utils.nonNull(sample);
Utils.nonNull(eval);
Utils.validateArg(eval.getType() == GATKSVVCFConstants.StructuralVariantAnnotationType.CNV, "Expected CNV evaluation variant");
if (truth == null) {
if (eval == null || truth == null) {
return null;
}
final Genotype evalGenotype = eval.getGenotypes().get(sample);
final int evalCopyNumber = VariantContextGetters.getAttributeAsInt(evalGenotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1);
final Genotype truthGenotype = truth.getGenotypes().get(sample);
if (evalGenotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT) != truthGenotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT)) {
throw new IllegalArgumentException("One genotype for sample " + sample + " has CN but the other does not");
if (evalGenotype == null || truthGenotype == null) {
return null;
}
final int copyNumber = VariantContextGetters.getAttributeAsInt(truthGenotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1);
return copyNumber == evalCopyNumber;
final boolean evalHasCn = evalGenotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT);
final boolean truthHasCn = truthGenotype.hasExtendedAttribute(GATKSVVCFConstants.COPY_NUMBER_FORMAT);
if (!evalHasCn || !truthHasCn) {
return null;
}
final int evalCopyNumber = VariantContextGetters.getAttributeAsInt(evalGenotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1);
final int truthCopyNumber = VariantContextGetters.getAttributeAsInt(truthGenotype, GATKSVVCFConstants.COPY_NUMBER_FORMAT, -1);
return truthCopyNumber == evalCopyNumber;
}

@VisibleForTesting
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,12 @@
* <li>Genotype concordance</li>
* </ol>
*
* after meeting minimum overlap criteria. Evaluation variants that are sucessfully matched are annotated with
* genotype concordance metrics, including allele frequency of the truth variant. See output header for descriptions
* of the specific fields. Note that genotypes of samples that are present in the evaluation VCF but not the truth
* VCF are assumed to be no-calls for concordance. For multi-allelic CNVs, only a copy state concordance metric is
* annotated.
* after meeting minimum overlap criteria. Evaluation VCF variants that are sucessfully matched are annotated with
* genotype concordance metrics, including allele frequency of the truth variant. Concordance metrics are computed
* on the intersection of sample sets of the two VCFs, but all other annotations including variant truth status
* and allele frequency use all records and samples available. See output header for descriptions
* of the specific fields. For multi-allelic CNVs, only a copy state concordance metric is
* annotated. Allele frequencies will be recalculated automatically if unavailable in the provided VCFs.
*
* <h3>Inputs</h3>
*
Expand All @@ -55,7 +56,7 @@
* Evaluation VCF
* </li>
* <li>
* Truth VCF (equal set or subset of samples)
* Truth VCF
* </li>
* </ul>
*
Expand Down Expand Up @@ -97,18 +98,6 @@ public final class SVConcordance extends AbstractConcordanceWalker {
)
private GATKPath outputFile;

/**
* By default, truth allele frequencies are calculated on the fly using the evaluation record's allele number as the
* denominator. This option forces the tool to use the allele frequency annotations (AF/AN/AC) of the closest-
* matching truth variant record (by min distance to both breakpoints) for truth allele frequency annotations.
*/
@Argument(
doc = "Use allele frequency annotations from the truth vcf from the best-matching record.",
fullName = USE_TRUTH_AF_LONG_NAME,
optional = true
)
private boolean useTruthAf = false;

@ArgumentCollection
private final SVClusterEngineArgumentsCollection clusterParameterArgs = new SVClusterEngineArgumentsCollection();

Expand All @@ -131,28 +120,23 @@ public void onTraversalStart() {
if (dictionary == null) {
throw new UserException("Reference sequence dictionary required");
}
validateHeaders();

linkage = new SVConcordanceLinkage(dictionary);
linkage.setDepthOnlyParams(clusterParameterArgs.getDepthParameters());
linkage.setMixedParams(clusterParameterArgs.getMixedParameters());
linkage.setEvidenceParams(clusterParameterArgs.getPESRParameters());

final SVConcordanceAnnotator collapser = new SVConcordanceAnnotator(useTruthAf);
// Concordance computations should be done on common samples only
final Set<String> commonSamples = Sets.intersection(
new HashSet<>(getEvalHeader().getGenotypeSamples()),
new HashSet<>(getTruthHeader().getGenotypeSamples()));
final SVConcordanceAnnotator collapser = new SVConcordanceAnnotator(commonSamples);
engine = new ClosestSVFinder(linkage, collapser::annotate, dictionary);

writer = createVCFWriter(outputFile);
writer.writeHeader(createHeader(getEvalHeader()));
}

private void validateHeaders() {
final Set<String> truthSamples = new HashSet<>(getTruthHeader().getSampleNamesInOrder());
final Set<String> evalSamples = new HashSet<>(getEvalHeader().getSampleNamesInOrder());
if (!Sets.difference(truthSamples, evalSamples).isEmpty()) {
throw new UserException.BadInput("Truth vcf samples must be a subset of eval vcf samples");
}
}

@Override
public Object onTraversalSuccess() {
flushClusters(true);
Expand Down
Loading

0 comments on commit e68f066

Please sign in to comment.