Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enabled GVCF type filtering support in SelectVariants #7193

Merged
merged 1 commit into from
Feb 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,21 @@ public final class VariantTypesVariantFilter implements VariantFilter {
private static final long serialVersionUID = 1L;

private final Set<VariantContext.Type> sampleTypes;
private final boolean ignoreNonRef;

public VariantTypesVariantFilter(Set<VariantContext.Type> includeTypes) {
this(includeTypes, false);
}

public VariantTypesVariantFilter(Set<VariantContext.Type> includeTypes, final boolean ignoreNonRefAlleles) {
Utils.nonNull(includeTypes);
sampleTypes = includeTypes;
ignoreNonRef = ignoreNonRefAlleles;
}

@Override
public boolean test(final VariantContext vc) {
final VariantContext.Type vcSampleType = vc.getType();
final VariantContext.Type vcSampleType = vc.getType(ignoreNonRef);
return sampleTypes.contains(vcSampleType);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,8 @@ public final class SelectVariants extends VariantWalker {
private static final int MAX_NOCALL_NUMBER_DEFAULT_VALUE = Integer.MAX_VALUE;
private static final double MAX_NOCALL_FRACTION_DEFAULT_VALUE = 1.0;

private static final List<String> GVCF_EXTENSIONS = Arrays.asList(".g.vcf", ".g.vcf.gz", ".gvcf", ".gvcf.gz");

/**
* A site is considered discordant if there exists some sample in the variant track that has a non-reference
* genotype and either the site isn't present in this track, the sample isn't present in this track, or the
Expand Down Expand Up @@ -388,6 +390,16 @@ public final class SelectVariants extends VariantWalker {
doc="Do not select certain type of variants from the input file", optional=true)
private List<VariantContext.Type> typesToExclude = new ArrayList<>();

/**
* When this argument is set, NON_REF alleles will not be considered for the variant type determination. This is
* necessary because every variant in a GVCF file would otherwise be assigned the type MIXED, which makes it
* impossible to filter for e.g. SNPs. If only NON_REF alleles are present at a given site it will still be
* considered SYMBOLIC.
*/
@Argument(fullName="ignore-non-ref-in-types",
doc="If set, NON_REF alleles will be ignored for variant type determination, which is required for filtering GVCF files by type", optional=true)
private boolean ignoreNonRefInTypes = false;

/**
* List of IDs (or a .list file containing ids) to select. The tool will only select variants whose ID
* field is present in this list of IDs. The matching is done by exact string matching. If a file, the file
Expand Down Expand Up @@ -635,6 +647,11 @@ public void onTraversalStart() {
}
}

if (!ignoreNonRefInTypes && (!typesToInclude.isEmpty() || !typesToExclude.isEmpty()) &&
GVCF_EXTENSIONS.stream().anyMatch(extension -> getDrivingVariantsFeatureInput().hasExtension(extension))) {
logger.warn("Filtering by variant type and GVCF input detected, but --ignore-non-ref-in-types argument is not set. Variant types will likely not be filtered correctly. Consider setting this argument for meaningful results.");
}

final Path outPath = vcfOutput.toPath();
vcfWriter = createVCFWriter(outPath);
vcfWriter.writeHeader(new VCFHeader(actualHeaderLines, samples));
Expand Down Expand Up @@ -877,7 +894,7 @@ public void closeTool() {
protected CountingVariantFilter makeVariantFilter() {
CountingVariantFilter compositeFilter = new CountingVariantFilter(VariantFilterLibrary.ALLOW_ALL_VARIANTS);
if (!selectedTypes.isEmpty()) {
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantTypesVariantFilter(selectedTypes)));
compositeFilter = compositeFilter.and(new CountingVariantFilter(new VariantTypesVariantFilter(selectedTypes, ignoreNonRefInTypes)));
}

if (rsIDsToKeep != null && !rsIDsToKeep.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,21 @@ public void testVariantTypeSelection() throws IOException {
spec.executeTest("testVariantTypeSelection--" + testFile, this);
}

/**
* Test including variant types in GVCF files.
*/
@Test
public void testVariantTypeSelectionForGVCF() throws IOException {
final String testFile = getToolTestDataDir() + "gvcfExample.g.vcf";

final IntegrationTestSpec spec = new IntegrationTestSpec(
baseTestString(" --select-type-to-include SNP --ignore-non-ref-in-types ",testFile),
Collections.singletonList(getToolTestDataDir() + "expected/" + "testSelectVariants_VariantTypeSelectionForGVCF.vcf")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We've been trying to move away from brittle exact match integration tests, but I think that this is fine here since there are no opportunities for numerical changes.

);

spec.executeTest("testVariantTypeSelectionForGVCF--" + testFile, this);
}

/**
* Test excluding indels that are larger than the specified size
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
##fileformat=VCFv4.2
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">
##INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr20,length=64444167>
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
chr20 69511 . A G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
chr20 69635 . A T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:10:10,0,119,101,128,229:0,4,0,3
chr20 69785 . T G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
##fileformat=VCFv4.1
##ALT=<ID=NON_REF,Description="Represents any possible alternative allele at this location">
##FORMAT=<ID=AD,Number=.,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum DP observed within the GVCF block">
##FORMAT=<ID=MIN_GQ,Number=1,Type=Integer,Description="Minimum GQ observed within the GVCF block">
##FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
##FORMAT=<ID=SB,Number=4,Type=Integer,Description="Per-sample component statistics which comprise the Fisher's Exact Test to detect strand bias.">
##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive)
##INFO=<ID=BLOCK_SIZE,Number=1,Type=Integer,Description="Size of the homozygous reference GVCF block">
##INFO=<ID=BaseQRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt Vs. Ref base qualities">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
##INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
##INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
##contig=<ID=chr20,length=64444167,assembly=Homo_sapiens_assembly38.fasta>
##source=SelectVariants
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA1
chr20 69485 . G <NON_REF> . . BLOCK_SIZE=20;END=69510 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:94:99:82:99:0,120,1800
chr20 69511 . A G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:AD:DP:GQ:PL:SB 1/1:1,79,0:80:99:2284,207,0,2287,237,2316:0,1,46,33
chr20 69512 . A <NON_REF> . . BLOCK_SIZE=10;END=69521 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:96:99:82:99:0,120,1800
chr20 69522 . A <NON_REF> . . BLOCK_SIZE=27;END=69548 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:95:0:95:0:0,0,0
chr20 69549 . T <NON_REF> . . BLOCK_SIZE=86;END=69634 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:6:16:5:16:0,16,90
chr20 69635 . A T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:4,3,0:7:10:10,0,119,101,128,229:0,4,0,3
chr20 69762 . A <NON_REF> . . BLOCK_SIZE=1;END=69762 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:18:7:18:0,18,270
chr20 69763 . A <NON_REF> . . BLOCK_SIZE=4;END=69766 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:21:7:21:0,21,253
chr20 69767 . A <NON_REF> . . BLOCK_SIZE=4;END=69770 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:12:7:12:0,12,180
chr20 69771 . TAAAAA T,<NON_REF> 60.77 . BaseQRankSum=0.937;DP=7;MQ=34.15;MQ0=0;MQRankSum=1.300;ReadPosRankSum=1.754 GT:AD:DP:GQ:PL:SB 0/1:0,0,0:0:20:20,0,119,101,128,229:0,4,0,3
chr20 69777 . G <NON_REF> . . BLOCK_SIZE=10;END=69783 GT:DP:GQ:MIN_DP:MIN_GQ:PL 0/0:7:0:3:0:0,0,0
chr20 69785 . T G,<NON_REF> 2253.77 . BaseQRankSum=1.169;DP=82;MQ=31.05;MQ0=0;MQRankSum=-0.866;ReadPosRankSum=1.689 GT:DP:GQ:PL:SB 1/1:80:99:2284,207,0,2287,237,2316:0,1,46,33