From 6ca773317c64c989189e19d9fa554e306b67dd88 Mon Sep 17 00:00:00 2001 From: Zach Norgaard Date: Wed, 31 Jul 2024 13:51:21 -0700 Subject: [PATCH] feat: raise exception if CollectDuplexSeqMetrics run on consensus BAM (#1003) * feat: add isConsensus method * feat: assert DuplexSeqMetrics collected on grouped BAM * refactor: move isConsensus * refactor: move isConsensus check * fix: remove unused import * refactor: only import ConsensusTags * refactor: only check first record * chore: cleanup imports * chore: cleanup import and unused val * fix: cleanup error reporting * fix: use val instead of def * refactor: rename isConsensusRead to isFgbioStyleConsensus * docs: improve doc string accuracy --- .../umi/CollectDuplexSeqMetrics.scala | 15 ++++++++++++++- .../scala/com/fulcrumgenomics/umi/Umis.scala | 12 ++++++++++++ .../umi/CollectDuplexSeqMetricsTest.scala | 16 ++++++++++++++++ .../com/fulcrumgenomics/umi/UmisTest.scala | 18 +++++++++++++++++- 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala index 5d18b6d98..8e2faa94a 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetrics.scala @@ -306,7 +306,20 @@ class CollectDuplexSeqMetrics override def execute(): Unit = { // Build the iterator we'll use based on whether or not we're restricting to a set of intervals val in = SamSource(input) - val _filteredIterator = in.iterator.filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary) + val _filteredIterator = in + .iterator + .filter(r => r.paired && r.mapped && r.mateMapped && r.firstOfPair && !r.secondary && !r.supplementary) + .bufferBetter + // Ensure the records are not consensus records + _filteredIterator.headOption.foreach { + rec => + val exceptionString = s"Input BAM file to CollectDuplexSeqMetrics ($input) appears to contain consensus sequences. " + + "CollectDuplexSeqMetrics cannot run on consensus BAMs, and instead requires the UMI-grouped BAM generated " + + "by GroupReadsByUmi which is run prior to consensus calling." + + "\nFirst record in $input has consensus SAM tags present:\n$rec" + + if (Umis.isFgbioStyleConsensus(rec)) throw new IllegalArgumentException(exceptionString) + } val iterator = intervals match { case None => _filteredIterator case Some(path) => diff --git a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala index 08369a682..ed5512567 100644 --- a/src/main/scala/com/fulcrumgenomics/umi/Umis.scala +++ b/src/main/scala/com/fulcrumgenomics/umi/Umis.scala @@ -26,6 +26,7 @@ package com.fulcrumgenomics.umi import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.umi.ConsensusTags import com.fulcrumgenomics.util.Sequences object Umis { @@ -127,4 +128,15 @@ object Umis { @inline private def isValidUmiCharacter(ch: Char): Boolean = { ch == 'A' || ch == 'C' || ch == 'G' || ch == 'T' || ch == 'N' || ch == '-' } + + /** Returns True if the record appears to be a consensus read, + * typically produced by fgbio's CallMolecularConsensusReads or CallDuplexConsensusReads. + * + * @param rec the record to check + * @return boolean indicating if the record is a consensus or not + */ + def isFgbioStyleConsensus(rec: SamRecord): Boolean = { + rec.contains(ConsensusTags.PerRead.RawReadCount) || + (rec.contains(ConsensusTags.PerRead.AbRawReadCount) && rec.contains(ConsensusTags.PerRead.BaRawReadCount)) + } } diff --git a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala index d9d0ed16f..0834c1532 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/CollectDuplexSeqMetricsTest.scala @@ -312,6 +312,22 @@ class CollectDuplexSeqMetricsTest extends UnitSpec { duplexFamilies.find(f => f.ab_size == 6 && f.ba_size == 0).get.count shouldBe 1 } + it should "raise an exception if duplex consensus records are provided" in { + val builder = new SamBuilder(readLength=100, sort=Some(SamOrder.TemplateCoordinate)) + builder.addPair( + contig=1, + start1=1000, + start2=1100, + attrs=Map( + RX -> "AAA-GGG", + MI -> "1/A", + ConsensusTags.PerRead.AbRawReadCount -> 10, + ConsensusTags.PerRead.BaRawReadCount -> 10 + ) + ) + an[IllegalArgumentException] shouldBe thrownBy { exec(builder) } + } + "CollectDuplexSeqMetrics.updateUmiMetrics" should "not count duplex umis" in collector(duplex=false).foreach { c => val builder = new SamBuilder(readLength=10) builder.addPair(start1=100, start2=200, attrs=Map(RX -> "AAA-CCC", MI -> "1/A")) diff --git a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala index 3d8c010b3..d469aa168 100644 --- a/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala +++ b/src/test/scala/com/fulcrumgenomics/umi/UmisTest.scala @@ -25,7 +25,7 @@ package com.fulcrumgenomics.umi -import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} import org.scalatest.OptionValues @@ -111,4 +111,20 @@ class UmisTest extends UnitSpec with OptionValues { an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:ACGT-CCKC")) an[Exception] should be thrownBy copyUmiFromReadName(rec=rec("NAME:CCKC-ACGT")) } + + "Umis.isConsensusRead" should "return false for reads without consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + builder.addFrag(start=100).exists(Umis.isFgbioStyleConsensus) shouldBe false + builder.addPair(start1=100, start2=100, unmapped2=true).exists(Umis.isFgbioStyleConsensus) shouldBe false + } + + it should "return true for reads with consensus tags" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate), readLength=10, baseQuality=20) + builder.addFrag(start=10, attrs=Map(ConsensusTags.PerRead.RawReadCount -> 10)) + .exists(Umis.isFgbioStyleConsensus) shouldBe true + builder.addFrag( + start=10, + attrs=Map(ConsensusTags.PerRead.AbRawReadCount -> 10, ConsensusTags.PerRead.BaRawReadCount -> 10) + ).exists(Umis.isFgbioStyleConsensus) shouldBe true + } }