Skip to content

Commit

Permalink
Improve SplitVcfBySamples tool (#192)
Browse files Browse the repository at this point in the history
* Improve SplitVcfBySamples tool
  • Loading branch information
bbimber authored Oct 5, 2022
1 parent 776d4a0 commit 93e4cae
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 13 deletions.
16 changes: 14 additions & 2 deletions src/main/java/com/github/discvrseq/walkers/SplitVcfBySamples.java
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ public class SplitVcfBySamples extends VariantWalker {
@Argument(doc="If the final VCF in the split has fewer than this number of samples, it will be merged with the second to last VCF", fullName = "minAllowableInFinalVcf", optional = true)
public Integer minAllowableInFinalVcf = null;

@Argument(doc="If selected, any site in a subset VCF lacking at least one genotype with a variant will be discarded", fullName = "discardNonVariantSites", optional = true)
public boolean discardNonVariantSites = false;

/**
* When this flag is enabled, all alternate alleles that are not present in the (output) samples will be removed.
* Note that this even extends to biallelic SNPs - if the alternate allele is not present in any sample, it will be
Expand All @@ -70,7 +73,7 @@ public void onTraversalStart() {
VCFHeader header = getHeaderForVariants();
List<String> samples = header.getSampleNamesInOrder();

batches = Lists.partition(new ArrayList<>(samples), samplesPerVcf);
batches = new ArrayList<>(Lists.partition(new ArrayList<>(samples), samplesPerVcf));
if (batches.size() == 1) {
throw new GATKException("This split would result in one output VCF, aborting");
}
Expand Down Expand Up @@ -109,10 +112,19 @@ public void apply(VariantContext variant, ReadsContext readsContext, ReferenceCo
int idx = 0;
for (List<String> batch : batches) {
final VariantContextWriter writer = writers.get(idx);
idx++;

final VariantContext sub = variant.subContextFromSamples(new LinkedHashSet<>(batch), removeUnusedAlternates);
if (discardNonVariantSites) {
if (sub.getCalledChrCount() == 0) {
continue;
}
else if (sub.getGenotypes().stream().noneMatch(g -> !g.isFiltered() && !g.isHomRef())) {
continue;
}
}

writer.add(sub);
idx++;
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,28 +12,50 @@ public class SplitVcfBySamplesIntegrationTest extends BaseIntegrationTest {

@Test
public void testBasicOperation() throws Exception {
ArgumentsBuilder args = getBaseArgs();

final File outDir = IOUtils.createTempDir("splitVcfBySamples.");
ArgumentsBuilder args = getBaseArgs(outDir);
runCommandLine(args);

args.addRaw("-O");
args.addRaw(normalizePath(outDir));
String actualMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.1of2.vcf"));
String expectedMD5 = Utils.calculateFileMD5(getTestFile("mergeVcf3.1of2.vcf"));
Assert.assertEquals(actualMD5, expectedMD5);

args.addRaw("--tmp-dir");
args.addRaw(getTmpDir());
actualMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.2of2.vcf"));
expectedMD5 = Utils.calculateFileMD5(getTestFile("mergeVcf3.2of2.vcf"));
Assert.assertEquals(actualMD5, expectedMD5);
}

@Test
public void testBasicOperationDiscard() throws Exception {
final File outDir = IOUtils.createTempDir("splitVcfBySamples.");
ArgumentsBuilder args = getBaseArgs(outDir);
args.addFlag("discardNonVariantSites");

runCommandLine(args);

String expectedMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.1of2.vcf"));
String actualMD5 = Utils.calculateFileMD5(getTestFile("mergeVcf3.1of2.vcf"));
String actualMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.1of2.vcf"));
String expectedMD5 = Utils.calculateFileMD5(getTestFile("mergeVcf3.1of2Discard.vcf"));
Assert.assertEquals(actualMD5, expectedMD5);

actualMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.2of2.vcf"));
expectedMD5 = Utils.calculateFileMD5(getTestFile("mergeVcf3.2of2Discard.vcf"));
Assert.assertEquals(actualMD5, expectedMD5);
}

@Test
public void testBasicOperationMinSamples() throws Exception {
final File outDir = IOUtils.createTempDir("splitVcfBySamples.");
ArgumentsBuilder args = getBaseArgs(outDir);
args.add("minAllowableInFinalVcf", 2);

runCommandLine(args);

expectedMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.2of2.vcf"));
actualMD5 = Utils.calculateFileMD5(getTestFile("mergeVcf3.2of2.vcf"));
String actualMD5 = Utils.calculateFileMD5(new File(outDir, "mergeVcf3.1of1.vcf"));
String expectedMD5 = Utils.calculateFileMD5(new File(testBaseDir, "mergeVcf3.vcf"));
Assert.assertEquals(actualMD5, expectedMD5);
}

private ArgumentsBuilder getBaseArgs() {
private ArgumentsBuilder getBaseArgs(File outDir) {
ArgumentsBuilder args = new ArgumentsBuilder();

args.addRaw("--variant");
Expand All @@ -45,6 +67,12 @@ private ArgumentsBuilder getBaseArgs() {

args.add("samplesPerVcf", 1);

args.addRaw("-O");
args.addRaw(normalizePath(outDir));

args.addRaw("--tmp-dir");
args.addRaw(getTmpDir());

return args;
}
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=1,length=16000>
##contig=<ID=2,length=16000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1
1 61 . GT G 724.43 PASS . GT 0/1
1 72 . T A 100 PASS . GT 0/1
1 73 . TT AG 100 PASS . GT 0/1
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
##fileformat=VCFv4.2
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##contig=<ID=1,length=16000>
##contig=<ID=2,length=16000>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample3
1 61 . GT G 724.43 PASS . GT 0/1
1 72 . T A 100 PASS . GT 1/1

0 comments on commit 93e4cae

Please sign in to comment.