Skip to content

Commit

Permalink
add tests
Browse files Browse the repository at this point in the history
  • Loading branch information
jdidion committed Apr 3, 2024
1 parent f6052af commit 87f07fe
Show file tree
Hide file tree
Showing 2 changed files with 145 additions and 56 deletions.
111 changes: 72 additions & 39 deletions src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala
Original file line number Diff line number Diff line change
Expand Up @@ -97,69 +97,102 @@ object DownsampleVcf extends LazyLogging {
def downsampleAndRegenotype(gt: Genotype, proportion: Double, random: Random, epsilon: Double): Genotype = {
val oldAds = gt.getOrElse[IndexedSeq[Int]]("AD", throw new Exception(s"AD tag not found for sample ${gt.sample}"))
val newAds = downsampleADs(oldAds, proportion, random)
val Seq(aa, ab, bb) = computePls(newAds)
val Seq(alleleA, alleleB) = gt.alleles.toSeq

val calls = {
if (aa == 0 && ab == 0 && bb == 0) IndexedSeq(NoCallAllele, NoCallAllele)
else if (aa < ab && aa < bb) IndexedSeq(alleleA, alleleA)
else if (bb < ab && bb < aa) IndexedSeq(alleleB, alleleB)
else IndexedSeq(alleleA, alleleB)
}
gt.copy(attrs=Map("PL" -> IndexedSeq(aa, ab, bb), "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
}

/**
* Compute the genotype likelihoods given the allele depths, assuming a diploid genotype (i.e.
* two allele depths).
* @param ads The input depths for the two alleles A and B.
* @return a list of three likelihoods for the alleles AA, AB, and BB.
*/
def computePls(ads: IndexedSeq[Int]): IndexedSeq[Int] = {
require(ads.length == 2, "there must be exactly two allele depths")
val likelihoods = Likelihoods(ads(0), ads(1))
IndexedSeq(likelihoods.aa.round.toInt, likelihoods.ab.round.toInt, likelihoods.bb.round.toInt)
val likelihoods = Likelihoods(newAds)
val pls = likelihoods.pls
val calls = likelihoods.mostLikelyCall(gt.alleles.toSeq)
gt.copy(attrs=Map("PL" -> pls, "AD" -> newAds, "DP" -> newAds.sum), calls=calls)
}

object Likelihoods {
/** Computes the likelihoods for each possible genotype.
*
/** Computes the likelihoods for each possible biallelic genotype.
* @param alleleDepthA the reference allele depth
* @param alleleDepthB the alternate allele depth
* @param epsilon the error rate for genotyping
* @return a new `Likelihood` that has the likelihoods of AA, AB, and BB
*/
def apply(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double=0.01): Likelihoods = {
def biallelic(alleleDepthA: Int, alleleDepthB: Int, epsilon: Double = 0.01): Likelihoods = {
val aGivenAA = log10(1 - epsilon)
val aGivenBB = log10(epsilon)
val aGivenAB = log10((1 - epsilon) / 2)

val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB)) * -10
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA)) * -10
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB) * -10
val rawGlAA = ((alleleDepthA * aGivenAA) + (alleleDepthB * aGivenBB))
val rawGlBB = ((alleleDepthA * aGivenBB) + (alleleDepthB * aGivenAA))
val rawGlAB = ((alleleDepthA + alleleDepthB) * aGivenAB)

val minGL = math.min(math.min(rawGlAA, rawGlAB), rawGlBB)
Likelihoods(2, IndexedSeq(rawGlAA, rawGlAB, rawGlBB))
}

/** Computes the likelihoods for each possible multiallelic genotype.
* @param alleleDepths the sequence of allele depths in the order specified in the VCF
* @param epsilon the error rate for genotyping
* @return a new `Likelihood` that has the likelihoods of all possible genotypes in the order
* specified in VFC spec for the GL/PL tags.
*/
def multiallelic(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = {
val numAlleles = alleleDepths.length
// probabilities associated with each possible genotype for a pair of alleles
val probs: Array[Double] = Array(
math.log10(epsilon),
math.log10((1 - epsilon) / 2),
math.log10(1 - epsilon)
)
// raw genotype log-likelihoods
Likelihoods(
aa = rawGlAA - minGL,
ab = rawGlAB - minGL,
bb = rawGlBB - minGL
numAlleles,
(0 until numAlleles).flatMap(b =>
(0 to b).map(a =>
(0 until numAlleles).map(allele =>
probs(Array(a, b).count(_ == allele)) * alleleDepths(allele)
).sum
)
)
)
}

def apply(alleleDepths: IndexedSeq[Int], epsilon: Double = 0.01): Likelihoods = {
require(alleleDepths.length >= 2, "at least two alleles are required to calculate genotype likelihoods")
if (alleleDepths.length > 2) multiallelic(alleleDepths, epsilon)
else biallelic(alleleDepths(0), alleleDepths(1), epsilon)
}
}

/** Stores the log10(likelihoods) for all possible bi-allelic genotypes.
*
* @param aa likelihood of AA
* @param ab likelihood of AB
* @param bb likelihood of BB
/** Stores the log10(likelihoods) for all possible genotypes.
* @param numAlleles the number of alleles the variant has
* @param genotypeLikelihoods sequence of GLs in the order specified in the VCF spec
*/
case class Likelihoods(aa: Double, ab: Double, bb: Double) {
case class Likelihoods(numAlleles: Int, genotypeLikelihoods: IndexedSeq[Double]) {
/**
* Returns the likelihoods as a list of phred-scaled integers (i.e, the value of the PL tag).
* @return a list of phred-scaled likelihooodS for AA, AB, BB.
*/
def pls = IndexedSeq(aa.round.toInt, ab.round.toInt, bb.round.toInt)
def pls: IndexedSeq[Int] = {
// subtract the min value so the smallest GL is 0, then multiply by -10 and convert to
// Int to make it PHRED-scale
val rawPL = genotypeLikelihoods.map(gl => gl * -10)
val minPL = rawPL.min
rawPL.map(pl => (pl - minPL).round.toInt)
}

def mostLikelyGenotype: Option[(Int, Int)] = {
val minIndexes = pls.zipWithIndex.filter(pair => pair._1 == 0)
minIndexes.length match {
case 0 => throw new RuntimeException("expected the most likely PL to have a value of 0.0")

Check warning on line 179 in src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala

View check run for this annotation

Codecov / codecov/patch

src/main/scala/com/fulcrumgenomics/vcf/DownsampleVcf.scala#L179

Added line #L179 was not covered by tests
case 1 => {
val genotypes =
for (b <- 0 until numAlleles; a <- 0 to b)
yield (a, b)
Some(genotypes(minIndexes.head._2))
}
case _ => None // if multiple genotypes are most likely, don't make a call
}
}

def mostLikelyCall(alleles: Seq[Allele]): IndexedSeq[Allele] = {
mostLikelyGenotype match {
case None => IndexedSeq(NoCallAllele, NoCallAllele)
case Some((a, b)) => IndexedSeq(alleles(a), alleles(b))
}
}
}
}

Expand Down
90 changes: 73 additions & 17 deletions src/test/scala/com/fulcrumgenomics/vcf/DownsampleVcfTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import com.fulcrumgenomics.util.Metric
import com.fulcrumgenomics.vcf.api.Allele.SimpleAllele
import com.fulcrumgenomics.vcf.api.{Allele, AlleleSet, Genotype, Variant}
import com.fulcrumgenomics.testing.UnitSpec
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, computePls, downsampleAndRegenotype}
import com.fulcrumgenomics.vcf.DownsampleVcf.{Likelihoods, downsampleAndRegenotype}

import scala.util.Random

Expand Down Expand Up @@ -187,7 +187,7 @@ class DownsampleVcfTest extends UnitSpec {
"DownsampleVcf.computePls" should "return new PLs that are not always 0,0,0" in {
val ads = IndexedSeq[Int](0, 100)
val expected = IndexedSeq(1996, 301, 0)
val newlikelihoods = computePls(ads)
val newlikelihoods = Likelihoods(ads).pls
newlikelihoods should contain theSameElementsInOrderAs expected
}

Expand All @@ -196,51 +196,57 @@ class DownsampleVcfTest extends UnitSpec {
*/

"DownsampleVcf.Likelihoods" should "return ref if all allele depths are zero" in {
val likelihood = Likelihoods(alleleDepthA=0, alleleDepthB=0)
val likelihood = Likelihoods(IndexedSeq(0, 0))
val expected = IndexedSeq[Int](0, 0, 0)
likelihood.pls.length shouldBe expected.length
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for AA if there are only ref alleles observed" in {
val likelihood = Likelihoods(alleleDepthA = 10, alleleDepthB = 0)
val likelihood = Likelihoods(IndexedSeq(10, 0))
val expected = IndexedSeq[Int](0, 30, 200)
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for BB if there are only alt alleles observed" in {
val likelihood = Likelihoods(alleleDepthA = 0, alleleDepthB = 10)
val likelihood = Likelihoods(IndexedSeq(0, 10))
val expected = IndexedSeq[Int](200, 30, 0)
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for AB if there are an equal number of ref and alt alleles" in {
val likelihood = Likelihoods(alleleDepthA = 5, alleleDepthB = 5)
val likelihood = Likelihoods(IndexedSeq(5, 5))
val expected = IndexedSeq[Int](70, 0, 70)
likelihood.pls should contain theSameElementsInOrderAs expected
}

it should "return a likelihood of 0 for AA if the AD A >> AD B" in {
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 2)
val likelihood = Likelihoods(IndexedSeq(15, 2))
likelihood.pls(0) == 0
}

it should "return a likelihood of 0 for AB if AD.A and AD.B are similar but not equal" in {
val likelihood = Likelihoods(alleleDepthA = 15, alleleDepthB = 17)
val likelihood = Likelihoods(IndexedSeq(15, 17))
likelihood.pls(1) == 0
}

it should "return a likelihood of 0 for BB if AD.B >> AD.A but neither are 0" in {
val likelihood = Likelihoods(alleleDepthA = 3, alleleDepthB = 30)
val likelihood = Likelihoods(IndexedSeq(3, 30))
likelihood.pls(2) == 0
}

it should "return correct values when there are very few reads" in {
Likelihoods(0, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
Likelihoods(1, 0).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
Likelihoods(1, 1).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
Likelihoods(0, 2).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
Likelihoods(1, 2).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
Likelihoods(IndexedSeq(0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0)
Likelihoods(IndexedSeq(1, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 3, 20)
Likelihoods(IndexedSeq(1, 1)).pls should contain theSameElementsInOrderAs IndexedSeq(14, 0, 14)
Likelihoods(IndexedSeq(0, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(40, 6, 0)
Likelihoods(IndexedSeq(1, 2)).pls should contain theSameElementsInOrderAs IndexedSeq(31, 0, 11)
}

it should "return correct values for multi-allelic variants" in {
Likelihoods(IndexedSeq(0, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 0, 0, 0, 0, 0)
Likelihoods(IndexedSeq(10, 0, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(0, 30, 200, 30, 200, 200)
Likelihoods(IndexedSeq(10, 10, 0)).pls should contain theSameElementsInOrderAs IndexedSeq(139, 0, 139, 169, 169, 339)
}


Expand All @@ -251,10 +257,10 @@ class DownsampleVcfTest extends UnitSpec {
Genotype(alleles=AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt))),
sample=sample,
calls=IndexedSeq[Allele](Allele(ref), Allele(alt)),
attrs=Map("AD" -> ads, "PL" -> Likelihoods(alleleDepthA = ads(0), alleleDepthB = ads(1))))
attrs=Map("AD" -> ads, "PL" -> Likelihoods(ads))
)
}


"DownsampleVcf.downsampleAndRegneotype(Genotype)" should "return no call if all allele depths are zero" in {
val geno = makeGt(ref="A", alt="T", ads=IndexedSeq(0,0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.01, random = new Random(42), epsilon = 0.01)
Expand Down Expand Up @@ -298,6 +304,30 @@ class DownsampleVcfTest extends UnitSpec {
newGeno.calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes
*/
private def makeTriallelicGt(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Genotype = {
val likelihoods = Likelihoods(ads)
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
val calls = likelihoods.mostLikelyCall(alleles.toSeq)
Genotype(alleles, sample=sample, calls=calls, attrs=Map("AD" -> ads, "PL" -> likelihoods.pls))
}

it should "return ref,alt1 for a tri-allelic genotype if those alleles have the highest depth" in {
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

it should "return alt1,alt2 for a tri-allelic genotype if those alleles have the highest depth" in {
val geno = makeTriallelicGt(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
val newGeno = downsampleAndRegenotype(gt=geno, proportion=0.1, random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("T"), Allele("G"))
newGeno.calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on Variant
*/
Expand All @@ -306,7 +336,7 @@ class DownsampleVcfTest extends UnitSpec {
Variant(chrom="1",
pos=10,
alleles=AlleleSet(ref=Allele(ref), alts=Allele(alt)),
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample = sample))
genotypes=Map(sample -> makeGt(ref=ref, alt=alt, ads=ads, sample=sample))
)
}

Expand Down Expand Up @@ -345,6 +375,32 @@ class DownsampleVcfTest extends UnitSpec {
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
}

/*
testing DownsampleVcf.downsampleAndRegenotype on downsampleAndRegenotypes
*/
private def makeTriallelicVariant(ref: String, alt1: String, alt2: String, ads: IndexedSeq[Int], sample: String ="test"): Variant = {
val likelihoods = Likelihoods(ads)
val alleles = AlleleSet(ref=SimpleAllele(ref), alts=IndexedSeq(Allele(alt1), Allele(alt2)))
Variant(chrom="1",
pos=10,
alleles=alleles,
genotypes=Map(sample -> makeTriallelicGt(ref=ref, alt1=alt1, alt2=alt2, ads=ads, sample=sample)))
}

it should "return ref,alt1 for a tri-allelic variant if those alleles have the highest depth" in {
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(100, 100, 0))
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("A"), Allele("T"))
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
}

it should "return alt1,alt2 for a tri-allelic variant if those alleles have the highest depth" in {
val variant = makeTriallelicVariant(ref="A", alt1="T", alt2="G", ads=IndexedSeq(0, 100, 100))
val newVariant = downsampleAndRegenotype(variant=variant, proportions = Map("test" -> 0.1), random = new Random(42), epsilon = 0.01)
val expected = IndexedSeq(Allele("T"), Allele("G"))
newVariant.genotypes("test").calls should contain theSameElementsInOrderAs expected
}

private val sample = "test1"
private val builder = VcfBuilder(samples=Seq(sample))
builder.add(chrom="chr1", pos=100, id="1", alleles=Seq("A", "C"), info=Map(),
Expand Down

0 comments on commit 87f07fe

Please sign in to comment.