Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retain all source IDs on VariantContext merge + test #9032

Merged
merged 16 commits into from
Nov 13, 2024
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,14 @@
import java.nio.file.Path;
import java.util.*;
import java.util.function.BiFunction;
import java.util.stream.Collector;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public final class GATKVariantContextUtils {

/** maximum number of sources to include when merging sources */
private static final int MAX_SOURCES_TO_INCLUDE = 10;
private static final Logger logger = LogManager.getLogger(GATKVariantContextUtils.class);

public static final String MERGE_FILTER_PREFIX = "filterIn";
Expand Down Expand Up @@ -1096,31 +1099,46 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled);
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, false, -1);
}

public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
final List<String> priorityListOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled,
final boolean storeAllVcfSources,
final int maxSourceFieldLength) {
int originalNumOfVCs = priorityListOfVCs == null ? 0 : priorityListOfVCs.size();
return simpleMerge(unsortedVCs, priorityListOfVCs, originalNumOfVCs, filteredRecordMergeType, genotypeMergeOptions, filteredAreUncalled, storeAllVcfSources, maxSourceFieldLength);
}

/**
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
* the sample name.
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
*
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
* @param filteredAreUncalled are filtered records uncalled?
* @return new VariantContext representing the merge of unsortedVCs
*/
* Merges VariantContexts into a single hybrid. Takes genotypes for common samples in priority order, if provided.
* If uniquifySamples is true, the priority order is ignored and names are created by concatenating the VC name with
* the sample name.
* simpleMerge does not verify any more unique sample names EVEN if genotypeMergeOptions == GenotypeMergeType.REQUIRE_UNIQUE. One should use
* SampleUtils.verifyUniqueSamplesNames to check that before using simpleMerge.
*
* For more information on this method see: http://www.thedistractionnetwork.com/programmer-problem/
*
* @param unsortedVCs collection of unsorted VCs
* @param priorityListOfVCs priority list detailing the order in which we should grab the VCs
* @param filteredRecordMergeType merge type for filtered records
* @param genotypeMergeOptions merge option for genotypes
* @param filteredAreUncalled are filtered records uncalled?
* @param storeAllVcfSources if true, the sources of all VCs where isVariable()=true will be concatenated in the output VC's source field. If false, the source of the first VC will be used. This mirror's GATK3's behavior
* @param maxSourceFieldLength This can be used to enforce a maximum length for the value of the source field (primarily useful if storeAllVcfSources=true). Set to -1 for unlimited
* @return new VariantContext representing the merge of unsortedVCs
*/
public static VariantContext simpleMerge(final Collection<VariantContext> unsortedVCs,
final List<String> priorityListOfVCs,
final int originalNumOfVCs,
final FilteredRecordMergeType filteredRecordMergeType,
final GenotypeMergeType genotypeMergeOptions,
final boolean filteredAreUncalled) {
final boolean filteredAreUncalled,
final boolean storeAllVcfSources,
final int maxSourceFieldLength) {
if ( unsortedVCs == null || unsortedVCs.isEmpty() )
return null;

Expand Down Expand Up @@ -1165,7 +1183,7 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort
longestVC = vc; // get the longest location

nFiltered += vc.isFiltered() ? 1 : 0;
if ( vc.isVariant() ) variantSources.add(vc.getSource());
if ( storeAllVcfSources && vc.isVariant() ) variantSources.add(vc.getSource());

AlleleMapper alleleMapping = resolveIncompatibleAlleles(refAllele, vc);

Expand Down Expand Up @@ -1236,7 +1254,19 @@ public static VariantContext simpleMerge(final Collection<VariantContext> unsort

final String ID = rsIDs.isEmpty() ? VCFConstants.EMPTY_ID_FIELD : Utils.join(",", rsIDs);

final VariantContextBuilder builder = new VariantContextBuilder().source(name).id(ID);
// This preserves the GATK3-like behavior of reporting multiple sources, delimited with hyphen:
// NOTE: if storeAllVcfSources is false, variantSources will be empty and therefore no sorting is performed
String allSources = variantSources.isEmpty() ? name : variantSources.stream()
.sorted()
.distinct()
.limit(MAX_SOURCES_TO_INCLUDE)
.collect(Collectors.joining("-"));

if (maxSourceFieldLength != -1 && allSources.length() > maxSourceFieldLength) {
allSources = allSources.substring(0, maxSourceFieldLength);
}

final VariantContextBuilder builder = new VariantContextBuilder().source(allSources).id(ID);
builder.loc(longestVC.getContig(), longestVC.getStart(), longestVC.getEnd());
builder.alleles(alleles);
builder.genotypes(genotypes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -659,6 +659,27 @@ public void testMergeGenotypesUniquify() {
Assert.assertEquals(merged.getSampleNames(), new LinkedHashSet<>(Arrays.asList("s1.1", "s1.2")));
}

@Test
public void testMergeSourceIDs() {
final VariantContext vc1 = makeVC("1", Arrays.asList(Aref, T), makeG("s1", Aref, T, -1));
final VariantContext vc1WithSource= new VariantContextBuilder(vc1).source("source1").make();
final VariantContext vc2 = makeVC("2", Arrays.asList(Aref, T), makeG("s1", Aref, T, -2));
final VariantContext vc2WithSource = new VariantContextBuilder(vc2).source("source2").make();
final VariantContext merged = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1WithSource, vc2WithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, -1);

// test sources are merged
Assert.assertEquals(merged.getSource(), "source1-source2");

final VariantContext mergedTruncated = GATKVariantContextUtils.simpleMerge(
Arrays.asList(vc1WithSource, vc2WithSource), null, GATKVariantContextUtils.FilteredRecordMergeType.KEEP_IF_ANY_UNFILTERED,
GATKVariantContextUtils.GenotypeMergeType.UNIQUIFY, false, true, 2);

// test sources are merged and respects maxSourceFieldLength
Assert.assertEquals(mergedTruncated.getSource(), "so");
}

// TODO: remove after testing
// @Test(expectedExceptions = IllegalStateException.class)
// public void testMergeGenotypesRequireUnique() {
Expand Down
Loading