From 1e8b28ba9ed59aabae0ae97744c8ed2d47665390 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Mon, 4 Apr 2022 11:03:33 -0400 Subject: [PATCH 1/7] initial --- .../walkers/qc/PostProcessReadsForRSEM.java | 286 ++++++++++++++++++ ...ostProcessReadsForRSEMIntegrationTest.java | 33 ++ 2 files changed, 319 insertions(+) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java create mode 100644 src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java new file mode 100644 index 00000000000..eb81134a8f3 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -0,0 +1,286 @@ +package org.broadinstitute.hellbender.tools.walkers.qc; + +import htsjdk.samtools.Cigar; +import htsjdk.samtools.CigarElement; +import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.util.PeekableIterator; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; +import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.Utils; +import org.broadinstitute.hellbender.utils.clipping.ReadClipper; +import org.broadinstitute.hellbender.utils.read.CigarBuilder; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; +import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; + +import java.io.File; +import java.util.*; +import java.util.stream.Collectors; + +/** + * Performs post-processing steps to get a bam aligned to a transcriptome ready for RSEM (https://github.com/deweylab/RSEM) + * + * ### Task 1. Reordering Reads ### + * + * Suppose the queryname "Q1" aligns to multiple loci in the transcriptome. + * STAR aligner outputs the reads in the following order: + * + * Q1: Read1 (Chr 1:1000) + * Q1: Read2 (Chr 1:2000) + * Q1: Read1 (Chr 20:5000) + * Q1: Read2 (Chr 20:6000) + * + * This is the format required by RSEM. After query-name sorting the reads for duplicate marking, + * the reads will be ordered as follows; + * + * Q1: Read1 (Chr 1:1000) + * Q1: Read1 (Chr 20:5000) + * Q1: Read2 (Chr 1:2000) + * Q1: Read2 (Chr 20:6000) + * + * That is, all the read1's come first, then read2's. + * + * This tool reorders such that the alignment pair appears together as in the first example. + * + * ### Task 2. Removing Reads ### + * + * If requested, this tool also removes duplicate marked reads and MT reads, which can skew gene expression counts. + * + */ +@CommandLineProgramProperties( + summary = "", + oneLineSummary = "", + programGroup = ReadDataManipulationProgramGroup.class // Sato: Right program group? +) +public class PostProcessReadsForRSEM extends GATKTool { + @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) + public File outSam; + + @Argument(fullName = "read-length") + public int readLength = 146; + + PeekableIterator read1Iterator; + SAMFileGATKReadWriter writer; + + ReadPair currentReadPair; + + // Check + int outputReadCount = 0; + int supplementaryReadCount = 0; + int totalReadCount = 0; + + + @Override + public void onTraversalStart(){ + read1Iterator = new PeekableIterator<>(directlyAccessEngineReadsDataSource().iterator()); + // If not query name sorted, then how is this file sorted? +// if (directlyAccessEngineReadsDataSource().getHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname){ +// throw new UserException("Input must be query-name sorted."); +// } + + writer = createSAMWriter(new GATKPath(outSam.getAbsolutePath()), true); + if (!read1Iterator.hasNext()){ + throw new UserException("Input has no reads"); + } + + currentReadPair = new ReadPair(read1Iterator.next()); + outputReadCount++; + } + + @Override + public void traverse() { + while (read1Iterator.hasNext()){ + final GATKRead read = read1Iterator.next(); + if (!currentReadPair.getQueryName().equals(read.getName())){ + // We have gathered all the reads with the same query name (primary, secondary, and supplementary alignments) + // Write the reads to output file, reordering the reads as needed. + writeReads(currentReadPair); + currentReadPair = new ReadPair(read); + } else { + currentReadPair.add(read); + } + progressMeter.update(read); + + } + } + + public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { + // If either of the pair is unmapped, throw out + if (read1.getContig() == null || read2.getContig() == null){ + return false; + } + + // Chimeric reads are not allowed + if (!read1.getContig().equals(read2.getContig())){ + return false; + } + + // Cigar must contain at most two elements + // The only allowed cigar is entirely M, or M followed by S. + final Cigar cigar1 = read1.getCigar(); + final List cigarElements1 = cigar1.getCigarElements(); + final Cigar cigar2 = read2.getCigar(); + final List cigarElements2 = cigar2.getCigarElements(); + + + if (cigarElements1.size() != cigarElements2.size()){ + // Cigars must have the same length + return false; + } else if (cigarElements1.size() == 1){ // And therefore read2 also has size 1 + return cigarElements1.get(0).getOperator() == CigarOperator.M && cigarElements2.get(0).getOperator() == CigarOperator.M; + } else if (cigarElements1.size() == 2){ + // The only allowed cigars are M followed by S in read1 and S followed by M, and vice versa + if ((cigarElements1.get(0).getOperator() == CigarOperator.M && cigarElements1.get(1).getOperator() == CigarOperator.S) || + (cigarElements1.get(0).getOperator() == CigarOperator.S && cigarElements1.get(1).getOperator() == CigarOperator.M)){ + // Now check that e.g. 100M46S is paired with 46S100M + // We don't require the exact match in the sizes of the operators (for now). M + return cigarElements1.get(0).getOperator() == cigarElements2.get(1).getOperator() && + cigarElements1.get(1).getOperator() == cigarElements2.get(0).getOperator(); + } else { + return false; + } + + + } else { + return false; + } + + } + + public boolean passesRSEMFilter(final GATKRead read) { + // Cigar must contain at most two elements + // The only allowed cigar is entirely M, or M followed by S. + final Cigar cigar = read.getCigar(); + final List cigarElements = cigar.getCigarElements(); + + if (cigarElements.size() == 1 && cigarElements.get(0).getOperator() == CigarOperator.M) { // e.g. 146M + return true; + } else { + // e.g. 100M46S or 46S100M. Else false. + return cigarElements.size() == 2 && + cigarElements.stream().allMatch(ce -> ce.getOperator() == CigarOperator.M || + ce.getOperator() == CigarOperator.S); + + } + } + /** Write reads in this order + * 1. Primary. First of pair, second of pair + * 2. For each secondary. First of pair, second of pair. + * **/ + private void writeReads(final ReadPair readPair){ + // Update read by side effect + final GATKRead firstOfPair = readPair.getFirstOfPair(); + final GATKRead secondOfPair = readPair.getSecondOfPair(); + + // Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments + // First, check that the read pair passes the RSEM criteria. If not, no need to bother clipping. + // The only acceptable cigar are 1) 146M, or 2) 142M4S with (insert size) < (read length) + if (passesRSEMFilter(firstOfPair, secondOfPair)){ + final boolean needsClippingPrimary = enableClipping && needsClipping(firstOfPair, secondOfPair); + writer.addRead(needsClippingPrimary ? clipRead(firstOfPair) : firstOfPair); + writer.addRead(needsClippingPrimary ? clipRead(secondOfPair) : secondOfPair); + outputReadCount += 2; + + final List> mateList = groupSecondaryReads(readPair.getSecondaryAlignments()); + for (Pair mates : mateList){ + // The pair is either both written or both not written + if (passesRSEMFilter(mates.getLeft(), mates.getRight())){ + final boolean needsClippingSecondary = enableClipping && needsClipping(mates.getLeft(), mates.getRight()); + writer.addRead(needsClippingSecondary ? clipRead(mates.getLeft()) : mates.getLeft()); + writer.addRead(needsClippingSecondary ? clipRead(mates.getRight()) : mates.getRight()); + } + } + // Ignore supplementary reads + } + } + + /** + * Contract: secondaryAligments may be empty. + */ + private List> groupSecondaryReads(List secondaryAlignments){ + if (secondaryAlignments.isEmpty()){ + return Collections.emptyList(); + } + + final boolean READ1 = true; + final Map> groupdbyRead1 = + secondaryAlignments.stream().collect(Collectors.groupingBy(r -> r.isFirstOfPair())); + final List read1Reads = groupdbyRead1.get(READ1); + final List read2Reads = groupdbyRead1.get(!READ1); + Utils.validate(read1Reads.size() == read2Reads.size(), "By assumption we must have the same number of read1s and read2s among the secondary alignments"); + + // The pairs is (read1, read2) + final List> result = new ArrayList<>(read1Reads.size()); + for (GATKRead read1 : read1Reads){ + final Optional read2 = read2Reads.stream().filter(r -> + r.getStart() == read1.getMateStart() && r.getMateStart() == read1.getStart()).findFirst(); + if (read2.isPresent()){ + result.add(new ImmutablePair<>(read1, read2.get())); + } else { + logger.warn("Mate not found for the secondary alignment " + read1.getName()); + } + } + return result; + } + + // Contract: call passesRSEMFilter on the read pair before calling + private boolean needsClipping(final GATKRead read1, final GATKRead read2){ + if (read1.getFragmentLength() != -1 * read2.getFragmentLength()){ + logger.warn(read1.getName() + ": Fragment lengths must be negative of each other but got " + + read1.getFragmentLength() + ", " + read2.getFragmentLength()); + return false; + } + + // If only one cigar element, then read1 and read2 are both 146M since we've already run RSEM read filter. No need to clip in this case. + if (read1.getCigarElements().size() == 1){ + return false; + } + + return Math.abs(read1.getFragmentLength()) < readLength; + } + + // TODO: replace with ReadClipper.hardClipAdaptorSequence(read) + private GATKRead clipRead(final GATKRead read){ + // Probably don't need to update mate start (check though). Or start, for that matter + final Cigar cigar = read.getCigar(); + final byte[] readBases = read.getBases(); + final byte[] quals = read.getBaseQualities(); + + final GATKRead clippedRead = ReadClipper.hardClipAdaptorSequence(read); + final byte[] clippedReadBases = clippedRead.getBases(); + final byte[] clippedQuals = clippedRead.getBaseQualities(); // length = 123...that ok? Should be 124? + final int length = clippedRead.getLength(); + + // For RSEM, remove H from the cigar + final List matchCigarElement = read.getCigarElements().stream().filter(ce -> ce.getOperator() == CigarOperator.M).collect(Collectors.toList()); + Utils.validate(matchCigarElement.size() == 1, "There must be a singl match element but got: " + matchCigarElement); + // This commented version is the correct way. But sometimes the cigar and read length don't match (a bug in hardClipAdaptorSequence()) + // final CigarElement matchCigarElem = matchCigarElement.get(0); + final CigarElement matchCigarElem = new CigarElement(clippedRead.getLength(), CigarOperator.M); + clippedRead.setCigar(new CigarBuilder().add(matchCigarElem).make()); + // This could be off by one, but as long as we get the reads out, we should be ok. + // Remember this is just a proof of concept; we can fine tune it later, as long as we can verify that we can run RSEM on it. + Utils.validate(clippedRead.getLength() == matchCigarElem.getLength(), + "length of cigar operator and read must match but got: " + clippedRead.getLength() + ", " + matchCigarElem.getLength()); + return clippedRead; + } + + + + @Override + public Object onTraversalSuccess(){ + // Write out the last set of reads + writeReads(currentReadPair); + return "SUCCESS"; + } + + @Override + public void closeTool(){ + writer.close(); + } +} diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java new file mode 100644 index 00000000000..3388f519d19 --- /dev/null +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java @@ -0,0 +1,33 @@ +package org.broadinstitute.hellbender.tools.walkers.qc; + +import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.testng.annotations.Test; + +public class PostProcessReadsForRSEMIntegrationTest extends CommandLineProgramTest { + @Test + public void test(){ + final ArgumentsBuilder args = new ArgumentsBuilder(); + final String home = "/Volumes/dsde_working/tsato/hydro.gen/Analysis/874_twist_RNA/rsem/"; + //final String bam = home + "SM-LLM7I_Kapa_SM-LQZYG_test_1.bam"; + //final String bam = home + "SM-KYN26_Kapa_SM-LQZYH_test.bam"; + //final String output = home + "SM-LLM7I_Kapa_SM-LQZYG_test_1_out.bam"; + + // Case 3 + // final String bam = home + "test/SM-HESUP_SSIV_SM-LQZYY_genome_test.bam"; + // final String output = home + "test/SM-HESUP_SSIV_SM-LQZYY_genome_test_out.bam"; + // 2101:10221:8484 has 2S104M40S and 43S103M, which would be clipped as is. + + // final String bam = home + "test/SM-HESUP_SSIV_SM-LQZYY_transcriptome_test.bam"; + // final String output = home + "test/SM-HESUP_SSIV_SM-LQZYY_transcriptome_test_out.bam"; + + // Case 4 + final String bam = home + "test/SM-LVFDN_15_Min_Low_Mid_transcriptome.duplicate_marked.bam"; + final String output = home + "test/SM-LVFDN_15_Min_Low_Mid_transcriptome_out.bam"; + + // Case 5 + args.addInput(bam); + args.addOutput(output); + runCommandLine(args, PostProcessReadsForRSEM.class.getSimpleName()); + } +} \ No newline at end of file From a9a5b624fe3bf97517d2523f657f5ff82fb96d35 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Mon, 4 Apr 2022 11:47:19 -0400 Subject: [PATCH 2/7] cleaned up --- .../walkers/qc/PostProcessReadsForRSEM.java | 174 ++++++------------ .../hellbender/tools/walkers/qc/ReadPair.java | 101 ++++++++++ 2 files changed, 157 insertions(+), 118 deletions(-) create mode 100644 src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java index eb81134a8f3..3b3ab16b632 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -1,8 +1,8 @@ package org.broadinstitute.hellbender.tools.walkers.qc; -import htsjdk.samtools.Cigar; import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; +import htsjdk.samtools.SAMFileHeader; import htsjdk.samtools.util.PeekableIterator; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; @@ -11,9 +11,6 @@ import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.*; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.utils.Utils; -import org.broadinstitute.hellbender.utils.clipping.ReadClipper; -import org.broadinstitute.hellbender.utils.read.CigarBuilder; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; @@ -55,7 +52,7 @@ @CommandLineProgramProperties( summary = "", oneLineSummary = "", - programGroup = ReadDataManipulationProgramGroup.class // Sato: Right program group? + programGroup = ReadDataManipulationProgramGroup.class // Sato: Change to QC when the other PR is merged. ) public class PostProcessReadsForRSEM extends GATKTool { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) @@ -69,27 +66,28 @@ public class PostProcessReadsForRSEM extends GATKTool { ReadPair currentReadPair; - // Check - int outputReadCount = 0; - int supplementaryReadCount = 0; - int totalReadCount = 0; - + // Use CountingFilter or another existing framework for counting and printing filtered reads + int totalOutputPair = 0; + int notBothMapped = 0; + int chimera = 0; + int unsupportedCigar = 0; @Override public void onTraversalStart(){ read1Iterator = new PeekableIterator<>(directlyAccessEngineReadsDataSource().iterator()); - // If not query name sorted, then how is this file sorted? -// if (directlyAccessEngineReadsDataSource().getHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname){ -// throw new UserException("Input must be query-name sorted."); -// } + + if (directlyAccessEngineReadsDataSource().getHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname){ + throw new UserException("Input must be query-name sorted."); + } writer = createSAMWriter(new GATKPath(outSam.getAbsolutePath()), true); + if (!read1Iterator.hasNext()){ throw new UserException("Input has no reads"); } currentReadPair = new ReadPair(read1Iterator.next()); - outputReadCount++; + totalOutputPair++; } @Override @@ -105,69 +103,42 @@ public void traverse() { currentReadPair.add(read); } progressMeter.update(read); - } } public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { - // If either of the pair is unmapped, throw out + // If either of the pair is unmapped, throw out. + // With the STAR argument --quantTranscriptomeBan IndelSoftclipSingleend this should not occur, + // but we check just to be thorough. if (read1.getContig() == null || read2.getContig() == null){ + notBothMapped++; return false; } // Chimeric reads are not allowed if (!read1.getContig().equals(read2.getContig())){ + chimera++; return false; } - // Cigar must contain at most two elements - // The only allowed cigar is entirely M, or M followed by S. - final Cigar cigar1 = read1.getCigar(); - final List cigarElements1 = cigar1.getCigarElements(); - final Cigar cigar2 = read2.getCigar(); - final List cigarElements2 = cigar2.getCigarElements(); + // Cigar must have a single operator M. + final List cigarElements1 = read1.getCigar().getCigarElements(); + final List cigarElements2 = read2.getCigar().getCigarElements(); - - if (cigarElements1.size() != cigarElements2.size()){ - // Cigars must have the same length - return false; - } else if (cigarElements1.size() == 1){ // And therefore read2 also has size 1 - return cigarElements1.get(0).getOperator() == CigarOperator.M && cigarElements2.get(0).getOperator() == CigarOperator.M; - } else if (cigarElements1.size() == 2){ - // The only allowed cigars are M followed by S in read1 and S followed by M, and vice versa - if ((cigarElements1.get(0).getOperator() == CigarOperator.M && cigarElements1.get(1).getOperator() == CigarOperator.S) || - (cigarElements1.get(0).getOperator() == CigarOperator.S && cigarElements1.get(1).getOperator() == CigarOperator.M)){ - // Now check that e.g. 100M46S is paired with 46S100M - // We don't require the exact match in the sizes of the operators (for now). M - return cigarElements1.get(0).getOperator() == cigarElements2.get(1).getOperator() && - cigarElements1.get(1).getOperator() == cigarElements2.get(0).getOperator(); - } else { - return false; - } - - - } else { + if (cigarElements1.size() != 1 && cigarElements2.size() != 1){ + unsupportedCigar++; return false; } - } - - public boolean passesRSEMFilter(final GATKRead read) { - // Cigar must contain at most two elements - // The only allowed cigar is entirely M, or M followed by S. - final Cigar cigar = read.getCigar(); - final List cigarElements = cigar.getCigarElements(); - - if (cigarElements.size() == 1 && cigarElements.get(0).getOperator() == CigarOperator.M) { // e.g. 146M - return true; + if (cigarElements1.get(0).getOperator() != CigarOperator.M || + cigarElements2.get(0).getOperator() != CigarOperator.M){ + unsupportedCigar++; + return false; } else { - // e.g. 100M46S or 46S100M. Else false. - return cigarElements.size() == 2 && - cigarElements.stream().allMatch(ce -> ce.getOperator() == CigarOperator.M || - ce.getOperator() == CigarOperator.S); - + return true; } } + /** Write reads in this order * 1. Primary. First of pair, second of pair * 2. For each secondary. First of pair, second of pair. @@ -181,40 +152,47 @@ private void writeReads(final ReadPair readPair){ // First, check that the read pair passes the RSEM criteria. If not, no need to bother clipping. // The only acceptable cigar are 1) 146M, or 2) 142M4S with (insert size) < (read length) if (passesRSEMFilter(firstOfPair, secondOfPair)){ - final boolean needsClippingPrimary = enableClipping && needsClipping(firstOfPair, secondOfPair); - writer.addRead(needsClippingPrimary ? clipRead(firstOfPair) : firstOfPair); - writer.addRead(needsClippingPrimary ? clipRead(secondOfPair) : secondOfPair); - outputReadCount += 2; + writer.addRead(firstOfPair); + writer.addRead(secondOfPair); + totalOutputPair += 1; final List> mateList = groupSecondaryReads(readPair.getSecondaryAlignments()); for (Pair mates : mateList){ // The pair is either both written or both not written if (passesRSEMFilter(mates.getLeft(), mates.getRight())){ - final boolean needsClippingSecondary = enableClipping && needsClipping(mates.getLeft(), mates.getRight()); - writer.addRead(needsClippingSecondary ? clipRead(mates.getLeft()) : mates.getLeft()); - writer.addRead(needsClippingSecondary ? clipRead(mates.getRight()) : mates.getRight()); + writer.addRead(mates.getLeft()); + writer.addRead(mates.getRight()); + totalOutputPair += 1; } } - // Ignore supplementary reads + + // Supplementary reads are not handled i.e. removed } } /** - * Contract: secondaryAligments may be empty. + * + * Reorder the secondary alignments as described above. + * @param secondaryAlignments may be empty. + * @return a list of pair of matching reads (i.e. read1 with the corresponding read2) */ - private List> groupSecondaryReads(List secondaryAlignments){ + private List> groupSecondaryReads(final List secondaryAlignments){ if (secondaryAlignments.isEmpty()){ return Collections.emptyList(); } - final boolean READ1 = true; - final Map> groupdbyRead1 = + final boolean isRead1 = true; + final Map> groupdByRead1 = secondaryAlignments.stream().collect(Collectors.groupingBy(r -> r.isFirstOfPair())); - final List read1Reads = groupdbyRead1.get(READ1); - final List read2Reads = groupdbyRead1.get(!READ1); - Utils.validate(read1Reads.size() == read2Reads.size(), "By assumption we must have the same number of read1s and read2s among the secondary alignments"); + final List read1Reads = groupdByRead1.get(isRead1); + final List read2Reads = groupdByRead1.get(!isRead1); + if(read1Reads.size() != read2Reads.size()){ + logger.warn("Num read1s != num read2s among the secondary alignments; " + + secondaryAlignments.get(0).getName()); + return Collections.emptyList(); + } + - // The pairs is (read1, read2) final List> result = new ArrayList<>(read1Reads.size()); for (GATKRead read1 : read1Reads){ final Optional read2 = read2Reads.stream().filter(r -> @@ -228,54 +206,14 @@ private List> groupSecondaryReads(List second return result; } - // Contract: call passesRSEMFilter on the read pair before calling - private boolean needsClipping(final GATKRead read1, final GATKRead read2){ - if (read1.getFragmentLength() != -1 * read2.getFragmentLength()){ - logger.warn(read1.getName() + ": Fragment lengths must be negative of each other but got " + - read1.getFragmentLength() + ", " + read2.getFragmentLength()); - return false; - } - - // If only one cigar element, then read1 and read2 are both 146M since we've already run RSEM read filter. No need to clip in this case. - if (read1.getCigarElements().size() == 1){ - return false; - } - - return Math.abs(read1.getFragmentLength()) < readLength; - } - - // TODO: replace with ReadClipper.hardClipAdaptorSequence(read) - private GATKRead clipRead(final GATKRead read){ - // Probably don't need to update mate start (check though). Or start, for that matter - final Cigar cigar = read.getCigar(); - final byte[] readBases = read.getBases(); - final byte[] quals = read.getBaseQualities(); - - final GATKRead clippedRead = ReadClipper.hardClipAdaptorSequence(read); - final byte[] clippedReadBases = clippedRead.getBases(); - final byte[] clippedQuals = clippedRead.getBaseQualities(); // length = 123...that ok? Should be 124? - final int length = clippedRead.getLength(); - - // For RSEM, remove H from the cigar - final List matchCigarElement = read.getCigarElements().stream().filter(ce -> ce.getOperator() == CigarOperator.M).collect(Collectors.toList()); - Utils.validate(matchCigarElement.size() == 1, "There must be a singl match element but got: " + matchCigarElement); - // This commented version is the correct way. But sometimes the cigar and read length don't match (a bug in hardClipAdaptorSequence()) - // final CigarElement matchCigarElem = matchCigarElement.get(0); - final CigarElement matchCigarElem = new CigarElement(clippedRead.getLength(), CigarOperator.M); - clippedRead.setCigar(new CigarBuilder().add(matchCigarElem).make()); - // This could be off by one, but as long as we get the reads out, we should be ok. - // Remember this is just a proof of concept; we can fine tune it later, as long as we can verify that we can run RSEM on it. - Utils.validate(clippedRead.getLength() == matchCigarElem.getLength(), - "length of cigar operator and read must match but got: " + clippedRead.getLength() + ", " + matchCigarElem.getLength()); - return clippedRead; - } - - - @Override public Object onTraversalSuccess(){ // Write out the last set of reads writeReads(currentReadPair); + logger.info("Total read pairs output: " + totalOutputPair); + logger.info("Read pairs filtered due to unmapped mate: " + notBothMapped); + logger.info("Read pairs filtered due to chimeric alignment: " + chimera); + logger.info("Read pairs filtered due to a cigar element not supported by RSEM: " + unsupportedCigar); return "SUCCESS"; } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java new file mode 100644 index 00000000000..82cd6cedc04 --- /dev/null +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java @@ -0,0 +1,101 @@ +package org.broadinstitute.hellbender.tools.walkers.qc; + +import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.read.GATKRead; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +public class ReadPair { + private GATKRead firstOfPair; + private GATKRead secondOfPair; + private List secondaryAlignments = new ArrayList<>(10); + private List supplementaryAlignments = new ArrayList<>(10); // Finally understand the difference + private String queryName = null; + + + public ReadPair() { } + + public ReadPair(final GATKRead read) { + this.queryName = read.getName(); + add(read); + } + + public int size(){ + int size = 0; + size = firstOfPair != null ? size + 1 : size; + size = secondOfPair != null ? size + 1 : size; + size += secondaryAlignments.size(); + return size += supplementaryAlignments.size(); // what should we do with the supplementary alignments? + } + + public String getQueryName() { + return queryName; + } + + public GATKRead getFirstOfPair(){ + return firstOfPair; + } + + public GATKRead getSecondOfPair(){ + return secondOfPair; + } + + public List getSecondaryAlignments() { + return secondaryAlignments; + } + + public void add(final GATKRead read) { + if (this.queryName == null){ + this.queryName = read.getName(); + } + + if (! this.queryName.equals(read.getName())){ + throw new UserException("Read names do not match: " + this.queryName + " vs " + read.getName()); + } + + if (isPrimaryAlignment(read) && read.isFirstOfPair()) { + this.firstOfPair = read; + } else if (isPrimaryAlignment(read) && read.isSecondOfPair()) { + this.secondOfPair = read; + } else if (read.isSecondaryAlignment()) { + this.secondaryAlignments.add(read); + } else if (read.isSupplementaryAlignment()) { + this.supplementaryAlignments.add(read); + } else { + throw new UserException("Unknown read type"); + } + } + + private boolean isPrimaryAlignment(final GATKRead read){ + return ! read.isSecondaryAlignment() && ! read.isSupplementaryAlignment(); + } + + public boolean isDuplicateMarked() { + // Doing some investigation + if (firstOfPair.isDuplicate()) { + // Make sure the rest is duplicate-marked + if (!secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> !r.isDuplicate())) { + throw new UserException("First of pair a duplicate but the rest is not" + secondOfPair.getName()); + } + } else { + // Make sure the rest is not duplicate-marked + if (secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> r.isDuplicate())) { + throw new UserException("First of pair a not duplicate but the rest is " + secondOfPair.getName()); + } + } + return firstOfPair.isDuplicate(); + } + + public List getReads(final boolean onlyPrimaryAlignments){ + List reads = Arrays.asList(firstOfPair, secondOfPair); + if (onlyPrimaryAlignments){ + return(reads); + } else { + reads.addAll(secondaryAlignments); + reads.addAll(supplementaryAlignments); + return(reads); + } + } +} From c5ba782379f6c3736c7aa91c1623e8eea413dc85 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Mon, 4 Apr 2022 12:41:34 -0400 Subject: [PATCH 3/7] add duplicate and mt filter: --- .../walkers/qc/PostProcessReadsForRSEM.java | 68 +++++++++++++++---- ...ostProcessReadsForRSEMIntegrationTest.java | 12 +--- 2 files changed, 55 insertions(+), 25 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java index 3b3ab16b632..d066b4aefaf 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -58,8 +58,11 @@ public class PostProcessReadsForRSEM extends GATKTool { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) public File outSam; - @Argument(fullName = "read-length") - public int readLength = 146; + @Argument(fullName = "keep-MT-reads") + public boolean keepMTReads = false; + + @Argument(fullName = "keep-duplicates") + public boolean keepDuplicates = false; PeekableIterator read1Iterator; SAMFileGATKReadWriter writer; @@ -71,6 +74,8 @@ public class PostProcessReadsForRSEM extends GATKTool { int notBothMapped = 0; int chimera = 0; int unsupportedCigar = 0; + int duplicates = 0; + int mitochondria = 0; @Override public void onTraversalStart(){ @@ -125,7 +130,7 @@ public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { final List cigarElements1 = read1.getCigar().getCigarElements(); final List cigarElements2 = read2.getCigar().getCigarElements(); - if (cigarElements1.size() != 1 && cigarElements2.size() != 1){ + if (cigarElements1.size() != 1 || cigarElements2.size() != 1){ unsupportedCigar++; return false; } @@ -139,6 +144,40 @@ public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { } } + private final static HashSet MT_TRANSCRIPT_IDs = new HashSet<>(Arrays.asList( + "ENST00000387314.1", "ENST00000389680.2", "ENST00000387342.1", "ENST00000387347.2", "ENST00000386347.1", + "ENST00000361390.2", "ENST00000387365.1", "ENST00000387372.1", "ENST00000387377.1", "ENST00000361453.3", + "ENST00000387382.1", "ENST00000387392.1", "ENST00000387400.1", "ENST00000387405.1", "ENST00000387409.1", + "ENST00000361624.2", "ENST00000387416.2", "ENST00000387419.1", "ENST00000361739.1", "ENST00000387421.1", + "ENST00000361851.1", "ENST00000361899.2", "ENST00000362079.2", "ENST00000387429.1", "ENST00000361227.2", + "ENST00000387439.1", "ENST00000361335.1", "ENST00000361381.2", "ENST00000387441.1", "ENST00000387449.1", + "ENST00000387456.1", "ENST00000361567.2", "ENST00000361681.2", "ENST00000387459.1", "ENST00000361789.2", + "ENST00000387460.2", "ENST00000387461.2" + )); + private boolean mitochondrialRead(final GATKRead read){ + return MT_TRANSCRIPT_IDs.contains(read.getContig()); + } + + private boolean passesAdditionalFilters(final GATKRead r1, final GATKRead r2){ + if (! keepDuplicates){ + // If the read pair is duplicate, then return false + if (r1.isDuplicate()){ + duplicates++; + return false; + } + } + + if (!keepMTReads){ + // If the read pair is duplicate, then return false + if (mitochondrialRead(r1) || mitochondrialRead(r2)){ + mitochondria++; + return false; + } + } + + return true; + } + /** Write reads in this order * 1. Primary. First of pair, second of pair * 2. For each secondary. First of pair, second of pair. @@ -148,21 +187,22 @@ private void writeReads(final ReadPair readPair){ final GATKRead firstOfPair = readPair.getFirstOfPair(); final GATKRead secondOfPair = readPair.getSecondOfPair(); - // Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments - // First, check that the read pair passes the RSEM criteria. If not, no need to bother clipping. - // The only acceptable cigar are 1) 146M, or 2) 142M4S with (insert size) < (read length) - if (passesRSEMFilter(firstOfPair, secondOfPair)){ + // Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments. + if (passesRSEMFilter(firstOfPair, secondOfPair) && passesAdditionalFilters(firstOfPair, secondOfPair)){ writer.addRead(firstOfPair); writer.addRead(secondOfPair); totalOutputPair += 1; - final List> mateList = groupSecondaryReads(readPair.getSecondaryAlignments()); - for (Pair mates : mateList){ - // The pair is either both written or both not written - if (passesRSEMFilter(mates.getLeft(), mates.getRight())){ - writer.addRead(mates.getLeft()); - writer.addRead(mates.getRight()); - totalOutputPair += 1; + // Now handle secondary alignments. + final List> secondaryAlignmentPairs = groupSecondaryReads(readPair.getSecondaryAlignments()); + for (Pair mates : secondaryAlignmentPairs){ + // The pair is either both written or both not written. + if (passesRSEMFilter(mates.getLeft(), mates.getRight()) && + passesAdditionalFilters(mates.getLeft(), mates.getRight())) { + writer.addRead(mates.getLeft()); + writer.addRead(mates.getRight()); + totalOutputPair += 1; + } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java index 3388f519d19..73b16389f04 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java @@ -9,18 +9,8 @@ public class PostProcessReadsForRSEMIntegrationTest extends CommandLineProgramTe public void test(){ final ArgumentsBuilder args = new ArgumentsBuilder(); final String home = "/Volumes/dsde_working/tsato/hydro.gen/Analysis/874_twist_RNA/rsem/"; - //final String bam = home + "SM-LLM7I_Kapa_SM-LQZYG_test_1.bam"; - //final String bam = home + "SM-KYN26_Kapa_SM-LQZYH_test.bam"; - //final String output = home + "SM-LLM7I_Kapa_SM-LQZYG_test_1_out.bam"; - - // Case 3 - // final String bam = home + "test/SM-HESUP_SSIV_SM-LQZYY_genome_test.bam"; - // final String output = home + "test/SM-HESUP_SSIV_SM-LQZYY_genome_test_out.bam"; - // 2101:10221:8484 has 2S104M40S and 43S103M, which would be clipped as is. - - // final String bam = home + "test/SM-HESUP_SSIV_SM-LQZYY_transcriptome_test.bam"; - // final String output = home + "test/SM-HESUP_SSIV_SM-LQZYY_transcriptome_test_out.bam"; + // TEst // Case 4 final String bam = home + "test/SM-LVFDN_15_Min_Low_Mid_transcriptome.duplicate_marked.bam"; final String output = home + "test/SM-LVFDN_15_Min_Low_Mid_transcriptome_out.bam"; From ea73824246413703d482a57c0411e43be4437539 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Mon, 4 Apr 2022 15:47:12 -0400 Subject: [PATCH 4/7] add test files --- .../walkers/qc/PostProcessReadsForRSEM.java | 17 ++---- ...ostProcessReadsForRSEMIntegrationTest.java | 52 +++++++++++++++--- ...transcriptome_query_sorted_abbr_header.bam | Bin 0 -> 2230 bytes 3 files changed, 48 insertions(+), 21 deletions(-) create mode 100644 src/test/resources/transcriptome_query_sorted_abbr_header.bam diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java index d066b4aefaf..a0411a21a7c 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -48,6 +48,10 @@ * * If requested, this tool also removes duplicate marked reads and MT reads, which can skew gene expression counts. * + * + * Caveat: This tool does not remove duplicate reads; it assumes it's been removed upstream e.g. + * MarkDuplicates with R + * */ @CommandLineProgramProperties( summary = "", @@ -61,9 +65,6 @@ public class PostProcessReadsForRSEM extends GATKTool { @Argument(fullName = "keep-MT-reads") public boolean keepMTReads = false; - @Argument(fullName = "keep-duplicates") - public boolean keepDuplicates = false; - PeekableIterator read1Iterator; SAMFileGATKReadWriter writer; @@ -153,20 +154,12 @@ public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { "ENST00000387439.1", "ENST00000361335.1", "ENST00000361381.2", "ENST00000387441.1", "ENST00000387449.1", "ENST00000387456.1", "ENST00000361567.2", "ENST00000361681.2", "ENST00000387459.1", "ENST00000361789.2", "ENST00000387460.2", "ENST00000387461.2" - )); + )); // ENST00000389680.2 overlaps with the twist targets private boolean mitochondrialRead(final GATKRead read){ return MT_TRANSCRIPT_IDs.contains(read.getContig()); } private boolean passesAdditionalFilters(final GATKRead r1, final GATKRead r2){ - if (! keepDuplicates){ - // If the read pair is duplicate, then return false - if (r1.isDuplicate()){ - duplicates++; - return false; - } - } - if (!keepMTReads){ // If the read pair is duplicate, then return false if (mitochondrialRead(r1) || mitochondrialRead(r2)){ diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java index 73b16389f04..f244747d53e 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java @@ -1,23 +1,57 @@ package org.broadinstitute.hellbender.tools.walkers.qc; import org.broadinstitute.hellbender.CommandLineProgramTest; +import org.broadinstitute.hellbender.engine.ReadsPathDataSource; import org.broadinstitute.hellbender.testutils.ArgumentsBuilder; +import org.broadinstitute.hellbender.utils.read.GATKRead; +import org.testng.Assert; import org.testng.annotations.Test; +import java.io.File; +import java.util.Arrays; +import java.util.Iterator; +import java.util.List; + public class PostProcessReadsForRSEMIntegrationTest extends CommandLineProgramTest { @Test public void test(){ final ArgumentsBuilder args = new ArgumentsBuilder(); - final String home = "/Volumes/dsde_working/tsato/hydro.gen/Analysis/874_twist_RNA/rsem/"; - - // TEst - // Case 4 - final String bam = home + "test/SM-LVFDN_15_Min_Low_Mid_transcriptome.duplicate_marked.bam"; - final String output = home + "test/SM-LVFDN_15_Min_Low_Mid_transcriptome_out.bam"; + // To be replaced + // final String testSam = "/Volumes/dsde_working/tsato/hydro.gen/Analysis/874_twist_RNA/rsem/gatk_test/transcriptome_query_sorted_abbr_header.bam"; + final String testTranscriptomeSam = publicTestDir + "transcriptome_query_sorted_abbr_header.bam"; + final File output = createTempFile("output", "bam"); - // Case 5 - args.addInput(bam); - args.addOutput(output); + args.addInput(testTranscriptomeSam); + args.addOutput(output.getAbsolutePath()); runCommandLine(args, PostProcessReadsForRSEM.class.getSimpleName()); + + final ReadsPathDataSource outputReadSource = new ReadsPathDataSource(output.toPath()); + final Iterator outputSamIterator = outputReadSource.iterator(); + + final List inputReadNames = Arrays.asList("SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:11804", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:27367", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:30906", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:36761", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:9079", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10276:1689"); + // SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10276:1689 aligns to a transcript in MT + final int[] expectedReadPairCount = new int[] { 4, 5, 2, 1, 6, 0 }; + String currentReadName = inputReadNames.get(0); + int i = 0; + int j = 0; + int count = 0; + + while (outputSamIterator.hasNext()){ + final GATKRead outputRead = outputSamIterator.next(); + // Check that read1 and read2 alternate + Assert.assertTrue(i % 2 == 0 ? outputRead.isFirstOfPair() : ! outputRead.isFirstOfPair()); + if (! outputRead.getName().equals(currentReadName)){ + Assert.assertEquals(count, 2*expectedReadPairCount[j++]); + currentReadName = outputRead.getName(); + count = 1; + i++; + continue; + } + + count++; + i++; + } + } } \ No newline at end of file diff --git a/src/test/resources/transcriptome_query_sorted_abbr_header.bam b/src/test/resources/transcriptome_query_sorted_abbr_header.bam new file mode 100644 index 0000000000000000000000000000000000000000..c1aa2954e007d90d3bf76e4969b22452827b3594 GIT binary patch literal 2230 zcmV;n2ub%JiwFb&00000{{{d;LjnN$0gaT;Y7{{b#!Dp8$;Bw%ye!@wx~u;$M50MZ z0&$J3cwJ`AfB`pSW>>|dZz4JR2)P7u4?cm%fOuBV_HOsA&|L=y7KW{_tG=%K)jWOj z?8e4TfX%(_;?xtR3biS*IPhZSq#_l3 zsj!-p3e-ylXQV>OQp74u(^YIxNG6CP$c3_{2`RYD1X5^;rsP3@xj<-cxhU15gi$pFkRj0rrKL5&10peRkBM*?Js4`-)@jQ^38(z54tI1G4lX|qkd^By^S$#CB z#^ZMMrml|awj5WJ|CRa@Yxe=*p8sWA0{f4N$b38C_XZJ}Zv|XZk@+^jLo%D+0=T%7 zMk4<3b0vv%+{3Rl(g6?O?xtrV>F_y?bcn-M>Y!sAzTZmEM5tjUJ=4()e==_oWB9S2 z*IDoj53-v^aN$elEfNdYxwn8SoZZgO1V-WOYI-KZ3FrPp`WFziYcNp-001A02m}BC z000301^_}s0sx){?b=UhTtyfF@VEc6wTaakNBbh8o#91BOJH{slQt)3GPo1bTGOr8 z99oJNEDA0D1rh9f&S5TIJa|y7lz@1aq8AUI#d`AJK|#G0p?VTejWfGDZ+Bnzy>!=5 zg?(Qknf>yhyx&Z}`DW7d1nEOR)a#8#t@nAQvC^p5>MQkHt+u#QuOF%{dIxUtyod@2F~N*O5-F*apo$U5 zIME0ThPaZH5X>pCUkN3?q&|<#vq~eN1!YKStrVX7-^J19#^a}7dO^H;{>8J;o_^)b z%iel(W4&4TjyKo-eEOrx5xH8HYpX?mVx!e;HW~BQ+oO3qJa5dQ@%^q6n-{FP&HY}^ z%~K-F{K+e^c_G#OcH_IJI%sZl_vY(7EwaoXxq8u=mr~7dGrk`u%x&(A+&nF^%s;&p zo0o^0pDOffcgK7)uIIMKziECxt~tP5D~P2KNHMMiK$t5YA^<|hDdU4H@z3{C0GuQjj_mlSL_)~w{Jvt;ao01>${LbdCkSB;; zsqPZT<=d9@oOmqaLW0;;=6+&V!E@rVh)W4#SBU$GUBS+Y$09EG5kF)IZ5%jgh`mm4 zea7qMKG+&{|7$ESEG>IS&6F`;N+T>W1`4ytpi#&a0Kz9UqLc$glrpXo_&z6G2k=7r zOYLH0ZC(#tK$fW@)>f|@s){()ZjhO=YPL5q^KIS^=M9`GuHO5V7_MO9TyE0f_Hn-r z*X?eHgOv==90YT@Ndx=-++7Z=lnQ3c*>qxsx!k0I zz4zLk4y-&B?4E7Ke{-Pt+wsJJb)_l^pTQcxIZ)##(Wj8`87%3W10{VDeToU6!3w@P zP{AkBrG%6&fea>~4l_qX{6wOHz1eB0cYCbu;5hPF^UTw7|Ne-Pxv z9Cio=5M$&E3>d@q5ES?XnxhZ`I*0;F0%XQma3Lr&Ry^Pu;wTD7r{>8EdgtdrxF+-VUv~Ded01o_YJsX>*_D=4p{-zJ_A+;!yKbh2EO( zm~Y1G+^+qvMy>-rbApk?91%z{l!1f+;ebX!Lvy4bX+HkFQRYmHF-=*er0BNlK4Q0o zyTtLbzLpcG#ZJV z#DsuQL=Z!w7$+nW!fbU2@rW=@DE0-xRB(bN6BJ?1F`t_ru5fdz#IV0Ht8(l;8(Qf? zw^}z_%~t1E31&3k>fR7K-;ViaY;Nb_*U^`CY|aFa5JZYc3IZmn0zwG^0!}qklmRmp z0Rswz91CQ`&BIy%7Aj`G3M3c~BqnC*kLEEV-?o_E`GPQWSr@vM!x5h|#M6dx!JK*) zyxy|5xvQpf1S2o&!?oqZomK5iteM%`OchWHu*es{tYS?BmWrFkO+^vr1VT-aG)tQ^ zZgxQp74xk-j(jFHh>#kM?q)>Rs($_tSvJts?zw!m*=lvpS*4g52ib$;ne9i+un7n7o)uJ~M=TCMVB`Eb_BcF?lJK ze6QiXo*=im%Q<;UWRbu1b6m?SspN*oJ6;|nx4G$ZPCh)c$S;2rlTW3R?=ig3CCF{= z?wmX&GCP*fWAf@y@{@&LL%87e`f%4c{xotQ9+PvZDFz`G1Rxke6q(~AkU(h~f&ge_ z4wPEo$h`aqrUSp^-`;JmKaZhOb#9VkB;X*>#wR`=-t}V=k#}O_j zglC2b&*Z{!j4oVC2)kykU)VK$x$ro`m4vYC-TH-HkCzLNBRrK5cCA{!uxsgZ;c0Do+uSE6cVMIjxv{*oQeQgM5!OJ+0EmF;kfRV##w0?DhEf46I8lIe z2_Xzf5J3`AjKYvc05PmX7y?KrxsCLuv%SelQfLvlEB_dea69)pm;33wULV_89Vc;V|Lj~Z@GqZbCiI|-m z_^+=+VtQ`-0-wFXd+M(6KYDv6N+vb{03VA81ONa4009360763o02=@U0000000000 E04SVE3IG5A literal 0 HcmV?d00001 From 95bd053a765e205cc344f81d527994e1a895d920 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Thu, 7 Apr 2022 11:50:43 -0400 Subject: [PATCH 5/7] louis --- .../walkers/qc/PostProcessReadsForRSEM.java | 179 +++++++++--------- .../hellbender/tools/walkers/qc/ReadPair.java | 40 ++-- ...ostProcessReadsForRSEMIntegrationTest.java | 3 +- ...MT_transcriptome_abbr_header.interval_list | 94 +++++++++ ...transcriptome_query_sorted_abbr_header.bam | Bin 2230 -> 2629 bytes 5 files changed, 204 insertions(+), 112 deletions(-) create mode 100644 src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java index a0411a21a7c..75357e1b8c6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -1,106 +1,115 @@ package org.broadinstitute.hellbender.tools.walkers.qc; -import htsjdk.samtools.CigarElement; import htsjdk.samtools.CigarOperator; import htsjdk.samtools.SAMFileHeader; -import htsjdk.samtools.util.PeekableIterator; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; +import org.apache.http.annotation.Experimental; import org.broadinstitute.barclay.argparser.Argument; +import org.broadinstitute.barclay.argparser.BetaFeature; import org.broadinstitute.barclay.argparser.CommandLineProgramProperties; +import org.broadinstitute.barclay.help.DocumentedFeature; import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions; import org.broadinstitute.hellbender.engine.*; +import org.broadinstitute.hellbender.engine.filters.CountingReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilter; +import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.read.GATKRead; import org.broadinstitute.hellbender.utils.read.SAMFileGATKReadWriter; import picard.cmdline.programgroups.ReadDataManipulationProgramGroup; -import java.io.File; import java.util.*; import java.util.stream.Collectors; /** * Performs post-processing steps to get a bam aligned to a transcriptome ready for RSEM (https://github.com/deweylab/RSEM) * - * ### Task 1. Reordering Reads ### * - * Suppose the queryname "Q1" aligns to multiple loci in the transcriptome. + * Suppose the read name "Q1" aligns to multiple loci in the transcriptome. * STAR aligner outputs the reads in the following order: * - * Q1: Read1 (Chr 1:1000) - * Q1: Read2 (Chr 1:2000) - * Q1: Read1 (Chr 20:5000) - * Q1: Read2 (Chr 20:6000) + * Q1: First-of-pair (Chr 1:1000) + * Q1: Second-of-pair (Chr 1:2000) + * Q1: First-of-pair (Chr 20:5000) + * Q1: Second-of-pair (Chr 20:6000) * * This is the format required by RSEM. After query-name sorting the reads for duplicate marking, * the reads will be ordered as follows; * - * Q1: Read1 (Chr 1:1000) - * Q1: Read1 (Chr 20:5000) - * Q1: Read2 (Chr 1:2000) - * Q1: Read2 (Chr 20:6000) + * Q1: First-of-pair (Chr 1:1000) + * Q1: First-of-pair (Chr 20:5000) + * Q1: Second-of-pair (Chr 1:2000) + * Q1: Second-of-pair (Chr 20:6000) * * That is, all the read1's come first, then read2's. * * This tool reorders such that the alignment pair appears together as in the first example. * - * ### Task 2. Removing Reads ### + * Caveat: It may be desirable to remove duplicate reads before running RSEM. + * This tool does not remove duplicate reads; it assumes they have been removed upstream e.g. + * MarkDuplicates with REMOVE_DUPLICATES=true * - * If requested, this tool also removes duplicate marked reads and MT reads, which can skew gene expression counts. - * - * - * Caveat: This tool does not remove duplicate reads; it assumes it's been removed upstream e.g. - * MarkDuplicates with R * */ @CommandLineProgramProperties( - summary = "", - oneLineSummary = "", - programGroup = ReadDataManipulationProgramGroup.class // Sato: Change to QC when the other PR is merged. + summary = "Reorder reads before running RSEM", + oneLineSummary = "Reorder reads before running RSEM", + programGroup = ReadDataManipulationProgramGroup.class ) +@Experimental +@BetaFeature +@DocumentedFeature public class PostProcessReadsForRSEM extends GATKTool { @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME) - public File outSam; + public GATKPath outSam; - @Argument(fullName = "keep-MT-reads") - public boolean keepMTReads = false; - - PeekableIterator read1Iterator; SAMFileGATKReadWriter writer; - ReadPair currentReadPair; - - // Use CountingFilter or another existing framework for counting and printing filtered reads + // Should use CountingFilter or another existing framework for counting and printing filtered reads int totalOutputPair = 0; int notBothMapped = 0; int chimera = 0; int unsupportedCigar = 0; - int duplicates = 0; - int mitochondria = 0; @Override - public void onTraversalStart(){ - read1Iterator = new PeekableIterator<>(directlyAccessEngineReadsDataSource().iterator()); + public boolean requiresReads(){ + return true; + } - if (directlyAccessEngineReadsDataSource().getHeader().getSortOrder() != SAMFileHeader.SortOrder.queryname){ + @Override + public void onTraversalStart(){ + Utils.nonNull(getHeaderForReads(), "Input bam must have a header"); + if (getHeaderForReads().getSortOrder() != SAMFileHeader.SortOrder.queryname){ throw new UserException("Input must be query-name sorted."); } - writer = createSAMWriter(new GATKPath(outSam.getAbsolutePath()), true); + writer = createSAMWriter(outSam, true); - if (!read1Iterator.hasNext()){ - throw new UserException("Input has no reads"); - } - currentReadPair = new ReadPair(read1Iterator.next()); - totalOutputPair++; + } + + @Override + public List getDefaultReadFilters() { + return Collections.singletonList(ReadFilterLibrary.NOT_SUPPLEMENTARY_ALIGNMENT); } @Override public void traverse() { - while (read1Iterator.hasNext()){ - final GATKRead read = read1Iterator.next(); - if (!currentReadPair.getQueryName().equals(read.getName())){ + final CountingReadFilter countingReadFilter = makeReadFilter(); + final Iterator readIterator = getTransformedReadStream(countingReadFilter).iterator(); + ReadPair currentReadPair = null; + + // Initialize the first ReadPair object + if (readIterator.hasNext()) { + currentReadPair = new ReadPair(readIterator.next()); + totalOutputPair++; + } + + while (readIterator.hasNext()){ + final GATKRead read = readIterator.next(); + if (!currentReadPair.getName().equals(read.getName())){ // We have gathered all the reads with the same query name (primary, secondary, and supplementary alignments) // Write the reads to output file, reordering the reads as needed. writeReads(currentReadPair); @@ -110,34 +119,44 @@ public void traverse() { } progressMeter.update(read); } + + if (currentReadPair != null){ + writeReads(currentReadPair); + } + } + /** + * For the list of conditions required by RSEM, see: https://github.com/deweylab/RSEM/blob/master/samValidator.cpp + */ public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { + if (read1 == null || read2 == null){ + logger.warn("read1 or read2 is null. This read will not be output. " + read1.getName()); + return false; + } // If either of the pair is unmapped, throw out. // With the STAR argument --quantTranscriptomeBan IndelSoftclipSingleend this should not occur, // but we check just to be thorough. - if (read1.getContig() == null || read2.getContig() == null){ + if (read1.isUnmapped() || read2.isUnmapped()){ notBothMapped++; return false; } - // Chimeric reads are not allowed - if (!read1.getContig().equals(read2.getContig())){ + // Chimeric reads are not allowed. Chimeric alignments should be just as suspect (or potentially interesting) + // in the case of transcriptome as in the genome. + if (!read1.contigsMatch(read2)){ chimera++; return false; } // Cigar must have a single operator M. - final List cigarElements1 = read1.getCigar().getCigarElements(); - final List cigarElements2 = read2.getCigar().getCigarElements(); - - if (cigarElements1.size() != 1 || cigarElements2.size() != 1){ + if (read1.numCigarElements() != 1 || read2.numCigarElements() != 1){ unsupportedCigar++; return false; } - if (cigarElements1.get(0).getOperator() != CigarOperator.M || - cigarElements2.get(0).getOperator() != CigarOperator.M){ + if (read1.getCigarElement(0).getOperator() != CigarOperator.M || + read2.getCigarElement(0).getOperator() != CigarOperator.M){ unsupportedCigar++; return false; } else { @@ -145,32 +164,6 @@ public boolean passesRSEMFilter(final GATKRead read1, final GATKRead read2) { } } - private final static HashSet MT_TRANSCRIPT_IDs = new HashSet<>(Arrays.asList( - "ENST00000387314.1", "ENST00000389680.2", "ENST00000387342.1", "ENST00000387347.2", "ENST00000386347.1", - "ENST00000361390.2", "ENST00000387365.1", "ENST00000387372.1", "ENST00000387377.1", "ENST00000361453.3", - "ENST00000387382.1", "ENST00000387392.1", "ENST00000387400.1", "ENST00000387405.1", "ENST00000387409.1", - "ENST00000361624.2", "ENST00000387416.2", "ENST00000387419.1", "ENST00000361739.1", "ENST00000387421.1", - "ENST00000361851.1", "ENST00000361899.2", "ENST00000362079.2", "ENST00000387429.1", "ENST00000361227.2", - "ENST00000387439.1", "ENST00000361335.1", "ENST00000361381.2", "ENST00000387441.1", "ENST00000387449.1", - "ENST00000387456.1", "ENST00000361567.2", "ENST00000361681.2", "ENST00000387459.1", "ENST00000361789.2", - "ENST00000387460.2", "ENST00000387461.2" - )); // ENST00000389680.2 overlaps with the twist targets - private boolean mitochondrialRead(final GATKRead read){ - return MT_TRANSCRIPT_IDs.contains(read.getContig()); - } - - private boolean passesAdditionalFilters(final GATKRead r1, final GATKRead r2){ - if (!keepMTReads){ - // If the read pair is duplicate, then return false - if (mitochondrialRead(r1) || mitochondrialRead(r2)){ - mitochondria++; - return false; - } - } - - return true; - } - /** Write reads in this order * 1. Primary. First of pair, second of pair * 2. For each secondary. First of pair, second of pair. @@ -181,7 +174,7 @@ private void writeReads(final ReadPair readPair){ final GATKRead secondOfPair = readPair.getSecondOfPair(); // Write Primary Reads. If either fails, we discard both, and we don't bother with the secondary alignments. - if (passesRSEMFilter(firstOfPair, secondOfPair) && passesAdditionalFilters(firstOfPair, secondOfPair)){ + if (passesRSEMFilter(firstOfPair, secondOfPair)){ writer.addRead(firstOfPair); writer.addRead(secondOfPair); totalOutputPair += 1; @@ -190,8 +183,7 @@ private void writeReads(final ReadPair readPair){ final List> secondaryAlignmentPairs = groupSecondaryReads(readPair.getSecondaryAlignments()); for (Pair mates : secondaryAlignmentPairs){ // The pair is either both written or both not written. - if (passesRSEMFilter(mates.getLeft(), mates.getRight()) && - passesAdditionalFilters(mates.getLeft(), mates.getRight())) { + if (passesRSEMFilter(mates.getLeft(), mates.getRight())) { writer.addRead(mates.getLeft()); writer.addRead(mates.getRight()); totalOutputPair += 1; @@ -215,10 +207,10 @@ private List> groupSecondaryReads(final List } final boolean isRead1 = true; - final Map> groupdByRead1 = + final Map> groupedByRead1 = secondaryAlignments.stream().collect(Collectors.groupingBy(r -> r.isFirstOfPair())); - final List read1Reads = groupdByRead1.get(isRead1); - final List read2Reads = groupdByRead1.get(!isRead1); + final List read1Reads = groupedByRead1.get(isRead1); + final List read2Reads = groupedByRead1.get(!isRead1); if(read1Reads.size() != read2Reads.size()){ logger.warn("Num read1s != num read2s among the secondary alignments; " + secondaryAlignments.get(0).getName()); @@ -228,10 +220,14 @@ private List> groupSecondaryReads(final List final List> result = new ArrayList<>(read1Reads.size()); for (GATKRead read1 : read1Reads){ - final Optional read2 = read2Reads.stream().filter(r -> - r.getStart() == read1.getMateStart() && r.getMateStart() == read1.getStart()).findFirst(); - if (read2.isPresent()){ - result.add(new ImmutablePair<>(read1, read2.get())); + final List read2s = read2Reads.stream() + .filter(r -> r.getStart() == read1.getMateStart() + && r.getMateStart() == read1.getStart()) + .collect(Collectors.toList()); + if (read2s.size() == 1){ + result.add(new ImmutablePair<>(read1, read2s.get(0))); + } else if (read2s.size() > 1){ + logger.warn("Multiple mates found for the secondary alignment " + read1.getName()); } else { logger.warn("Mate not found for the secondary alignment " + read1.getName()); } @@ -242,7 +238,6 @@ private List> groupSecondaryReads(final List @Override public Object onTraversalSuccess(){ // Write out the last set of reads - writeReads(currentReadPair); logger.info("Total read pairs output: " + totalOutputPair); logger.info("Read pairs filtered due to unmapped mate: " + notBothMapped); logger.info("Read pairs filtered due to chimeric alignment: " + chimera); @@ -252,6 +247,8 @@ public Object onTraversalSuccess(){ @Override public void closeTool(){ - writer.close(); + if (writer != null){ + writer.close(); + } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java index 82cd6cedc04..7bc5b214c11 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java @@ -1,36 +1,40 @@ package org.broadinstitute.hellbender.tools.walkers.qc; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.utils.Utils; import org.broadinstitute.hellbender.utils.read.GATKRead; import java.util.ArrayList; import java.util.Arrays; import java.util.List; +/** + * Data structure that contains the set of reads sharing the same queryname, including + * the primary, secondary (i.e. multi-mapping), and supplementary (e.g. chimeric) alignments. + * + */ public class ReadPair { private GATKRead firstOfPair; private GATKRead secondOfPair; - private List secondaryAlignments = new ArrayList<>(10); - private List supplementaryAlignments = new ArrayList<>(10); // Finally understand the difference - private String queryName = null; - - - public ReadPair() { } + private final List secondaryAlignments = new ArrayList<>(); + private final List supplementaryAlignments = new ArrayList<>(); + private final String queryName; public ReadPair(final GATKRead read) { this.queryName = read.getName(); add(read); } - public int size(){ - int size = 0; - size = firstOfPair != null ? size + 1 : size; - size = secondOfPair != null ? size + 1 : size; - size += secondaryAlignments.size(); - return size += supplementaryAlignments.size(); // what should we do with the supplementary alignments? + public int numberOfAlignments(){ + int num = 0; + num = firstOfPair != null ? num + 1 : num; + num = secondOfPair != null ? num + 1 : num; + num += secondaryAlignments.size(); + num += supplementaryAlignments.size(); + return num; } - public String getQueryName() { + public String getName() { return queryName; } @@ -47,15 +51,13 @@ public List getSecondaryAlignments() { } public void add(final GATKRead read) { - if (this.queryName == null){ - this.queryName = read.getName(); - } - if (! this.queryName.equals(read.getName())){ throw new UserException("Read names do not match: " + this.queryName + " vs " + read.getName()); } if (isPrimaryAlignment(read) && read.isFirstOfPair()) { + Utils.validate(this.firstOfPair != null, + "The primary firstOfPair is already set. Added read = " + read.getName()); this.firstOfPair = read; } else if (isPrimaryAlignment(read) && read.isSecondOfPair()) { this.secondOfPair = read; @@ -64,7 +66,7 @@ public void add(final GATKRead read) { } else if (read.isSupplementaryAlignment()) { this.supplementaryAlignments.add(read); } else { - throw new UserException("Unknown read type"); + throw new UserException("Unknown read type: " + read.getContig()); } } @@ -73,7 +75,6 @@ private boolean isPrimaryAlignment(final GATKRead read){ } public boolean isDuplicateMarked() { - // Doing some investigation if (firstOfPair.isDuplicate()) { // Make sure the rest is duplicate-marked if (!secondOfPair.isDuplicate() || secondaryAlignments.stream().anyMatch(r -> !r.isDuplicate())) { @@ -85,6 +86,7 @@ public boolean isDuplicateMarked() { throw new UserException("First of pair a not duplicate but the rest is " + secondOfPair.getName()); } } + return firstOfPair.isDuplicate(); } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java index f244747d53e..f44bab1b5e3 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java @@ -16,13 +16,12 @@ public class PostProcessReadsForRSEMIntegrationTest extends CommandLineProgramTe @Test public void test(){ final ArgumentsBuilder args = new ArgumentsBuilder(); - // To be replaced - // final String testSam = "/Volumes/dsde_working/tsato/hydro.gen/Analysis/874_twist_RNA/rsem/gatk_test/transcriptome_query_sorted_abbr_header.bam"; final String testTranscriptomeSam = publicTestDir + "transcriptome_query_sorted_abbr_header.bam"; final File output = createTempFile("output", "bam"); args.addInput(testTranscriptomeSam); args.addOutput(output.getAbsolutePath()); + args.add("XL", publicTestDir + "gencode_v19_MT_transcriptome_abbr_header.interval_list"); runCommandLine(args, PostProcessReadsForRSEM.class.getSimpleName()); final ReadsPathDataSource outputReadSource = new ReadsPathDataSource(output.toPath()); diff --git a/src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list b/src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list new file mode 100644 index 00000000000..521939b0255 --- /dev/null +++ b/src/test/resources/gencode_v19_MT_transcriptome_abbr_header.interval_list @@ -0,0 +1,94 @@ +@HD VN:1.6 SO:queryname +@SQ SN:ENST00000375267.2 LN:15945 +@SQ SN:ENST00000375217.2 LN:15861 +@SQ SN:ENST00000375226.2 LN:15810 +@SQ SN:ENST00000375254.3 LN:15906 +@SQ SN:ENST00000375218.3 LN:4031 +@SQ SN:ENST00000321358.7 LN:1514 +@SQ SN:ENST00000436427.1 LN:1524 +@SQ SN:ENST00000361813.5 LN:4559 +@SQ SN:ENST00000422945.2 LN:4481 +@SQ SN:ENST00000328724.5 LN:4045 +@SQ SN:ENST00000557268.1 LN:2013 +@SQ SN:ENST00000350249.3 LN:3845 +@SQ SN:ENST00000445439.3 LN:1526 +@SQ SN:ENST00000334743.5 LN:4328 +@SQ SN:ENST00000557095.1 LN:1307 +@SQ SN:ENST00000415930.3 LN:4000 +@SQ SN:ENST00000588991.2 LN:2021 +@SQ SN:ENST00000356487.5 LN:2210 +@SQ SN:ENST00000586392.1 LN:1706 +@SQ SN:ENST00000387314.1 LN:71 +@SQ SN:ENST00000389680.2 LN:954 +@SQ SN:ENST00000387342.1 LN:69 +@SQ SN:ENST00000387347.2 LN:1559 +@SQ SN:ENST00000386347.1 LN:75 +@SQ SN:ENST00000361390.2 LN:956 +@SQ SN:ENST00000387365.1 LN:69 +@SQ SN:ENST00000387372.1 LN:72 +@SQ SN:ENST00000387377.1 LN:68 +@SQ SN:ENST00000361453.3 LN:1042 +@SQ SN:ENST00000387382.1 LN:68 +@SQ SN:ENST00000387392.1 LN:69 +@SQ SN:ENST00000387400.1 LN:73 +@SQ SN:ENST00000387405.1 LN:66 +@SQ SN:ENST00000387409.1 LN:66 +@SQ SN:ENST00000361624.2 LN:1542 +@SQ SN:ENST00000387416.2 LN:69 +@SQ SN:ENST00000387419.1 LN:68 +@SQ SN:ENST00000361739.1 LN:684 +@SQ SN:ENST00000387421.1 LN:70 +@SQ SN:ENST00000361851.1 LN:207 +@SQ SN:ENST00000361899.2 LN:681 +@SQ SN:ENST00000362079.2 LN:784 +@SQ SN:ENST00000387429.1 LN:68 +@SQ SN:ENST00000361227.2 LN:346 +@SQ SN:ENST00000387439.1 LN:65 +@SQ SN:ENST00000361335.1 LN:297 +@SQ SN:ENST00000361381.2 LN:1378 +@SQ SN:ENST00000387441.1 LN:69 +@SQ SN:ENST00000387449.1 LN:59 +@SQ SN:ENST00000387456.1 LN:71 +@SQ SN:ENST00000361567.2 LN:1812 +@SQ SN:ENST00000361681.2 LN:525 +@SQ SN:ENST00000387459.1 LN:69 +@SQ SN:ENST00000361789.2 LN:1141 +@SQ SN:ENST00000387460.2 LN:66 +@SQ SN:ENST00000387461.2 LN:68 +ENST00000387314.1 1 71 + . +ENST00000389680.2 1 954 + . +ENST00000387342.1 1 69 + . +ENST00000387347.2 1 1559 + . +ENST00000386347.1 1 75 + . +ENST00000361390.2 1 956 + . +ENST00000387365.1 1 69 + . +ENST00000387372.1 1 72 + . +ENST00000387377.1 1 68 + . +ENST00000361453.3 1 1042 + . +ENST00000387382.1 1 68 + . +ENST00000387392.1 1 69 + . +ENST00000387400.1 1 73 + . +ENST00000387405.1 1 66 + . +ENST00000387409.1 1 66 + . +ENST00000361624.2 1 1542 + . +ENST00000387416.2 1 69 + . +ENST00000387419.1 1 68 + . +ENST00000361739.1 1 684 + . +ENST00000387421.1 1 70 + . +ENST00000361851.1 1 207 + . +ENST00000361899.2 1 681 + . +ENST00000362079.2 1 784 + . +ENST00000387429.1 1 68 + . +ENST00000361227.2 1 346 + . +ENST00000387439.1 1 65 + . +ENST00000361335.1 1 297 + . +ENST00000361381.2 1 1378 + . +ENST00000387441.1 1 69 + . +ENST00000387449.1 1 59 + . +ENST00000387456.1 1 71 + . +ENST00000361567.2 1 1812 + . +ENST00000361681.2 1 525 + . +ENST00000387459.1 1 69 + . +ENST00000361789.2 1 1141 + . +ENST00000387460.2 1 66 + . +ENST00000387461.2 1 68 + . diff --git a/src/test/resources/transcriptome_query_sorted_abbr_header.bam b/src/test/resources/transcriptome_query_sorted_abbr_header.bam index c1aa2954e007d90d3bf76e4969b22452827b3594..ac50e1cfb9be9ba723ea2523eedd23db74937dfc 100644 GIT binary patch delta 2531 zcmV<92^{ve5ycdLABzYC000000RIL6LPG)oi35$5&r4KM6vvO1*$9zI(4vjBYtiuT z?|Y3xO;gFtFj};cF$WUTsZooT{U_0`sDB`Wpndokv<#wkweH^c&YR%f?+gioc+R=! zdw#vo!u*}F4--UlD~qiM>$9S5Tbp-hpB;7kFZVlp-O0Is&3moQ_1UHM&HF|AQO-!~ z+OoA4olVdNdWeCCw*q=dJMa(%^f0=uI7?x17GHVjLV+YwDC66%dP3;TK`X1JYm3T3 z<_^jVFI3xt>Q)!|w2b#5h^h~=kf0C7YVRryQi6HMHBmvb z28YJ&of5ioa3FyX))$q6V8AG2xaRk5KFHEMSXnS>74>P}6$e({3Ka(Fv3hGprgyB) zNo?liHlJTiu8raih&u`6y&N7N&0%D3;?P+fSkPpDu1kZJ>mm#{TVdmYaO1IdE%;M0 zM4SktS`%xl&2TNW6kHql3dIkA5%qQEhmq_dusC%7Hb$}C4NE^#y_CF_O5@%Rs>4gE zcrr3T^~&?r3gzHpXe~xAhUWC}WSlm3_{Cab1|0DMOg^qgEib|VzaNIHi=+44dm%LX zWH@bq`DTYKt`+rcC+9YnTdRw+2c5mcUhm04Yh(T2-z7GMYTaBL{BP#@ZuiB^)!xiu zzq5bvsK5L4u(#LUDzUnC(CZ&|ceXm)+x@M_-Of(8-`?)*{jY0GG3kkDI{s!FnvLmL|?iLUyIbG&*z#J#x4DS9B)`^&eE@@r3Ne=pKr!6S?N{NQX`eV zHWkz?rEe3>7=|c~H)Cpg(w|HlW0QVN<#|e8(xt3Y1|_}Dv@s#+d#r9-5^LxhvLwua zTVTS_h3w`SEp!(3Iza^}V4lzs*u)ZlBzSicL&}c+1s3ORqG=5P03VA81ONa400936 z0763o0GQf5eG$>l@FJsSVRsXgHYaB?xD(M@)2-GVT8b7d z3N3zu2=+baFc&W#JSbMmf_Rpq7Z0Ardh+X_px%m5J&C8*ncbbYyD$4*x@)L^!oGhY znf>QMdB2(d=RcF4D|jC8Jn!pS&s#2Rz=C(wd)Mpcew*<;b?y4~qmNY&Kee?c4?o(F zV)OVjje5P&sP#XuHP#yST79ivtJRj*>h*)QWpDp2o)=L8Atsn{NFpVb5>zn)87CTH z!4OxH5`sAe_A8;pm(=Hxc~)tE1hk+GDXo>li~qYg(%gFd)JrdjSI@n8=GjxPoPOEc zY;J8f>)x^E#-C4rR5>iy%W`AA$d7Nen$0F--ezZ1wZn7992(#6DzSONn%ms(<=i|a zvdo{n9Ge$X&F?V2duE5tZSLNDou@^X`6E{@IP+4f`R&H{3ohq4W`|rBQgsq`4P|WZ0-tqg4mVn z9&ud0ZAs6GCn7E+h+SoW9wc@ZJSU!rxRfAvg?NzI73`dNBI5D@@k55t#{L6_*z5Mz zr@emegX`n&e~s0pl~wPEnKI@}X@n)lKw%acGzyslK=_14lyabmQpQyR-{*wu0G>~O zsa=e$&Fg{l$TC&L#`-lwRS`!!4Kg!U&CVufzE$n8YT!(9_1>?4#Bc=*=W^2qcN_OR zaJ}w!G+4>t96T@#=5o^pHhXrT1500FMT7I>gTr7hH*H|wpS{b0l~TcMIh#wYFqfM) zu=ifO(}9&og59&N__v3Oza38vSWl{w@ENZ0?V%b!jXs5h&u~d^50&(3^eHBMhAVh` zsDe+UPbuLuT(H|ELj`*peaZtq_HxR+iubqq2en-4Uwqr#mnOF|_J+1pJ5*cgoRbL! zGJo^(8Hbi#jI2HJ;)f4g>7r{>8EdgtXHR6TYKPX`ly>iG-@J3?w7Ji6^R&n^-$1c> zaisalLVrzn&D-%hw`>2avFkwJoM0p|M+8y~WguZdIG_>G&>ZQeJ9&x;^uja&Qu@iAIQ8{J};}r#aTXnr(t!?fJle=&H%CWM#Qtwui&jl}k zUz{7QaC54}sJ}6*a_l`5TIoWsS~pwGR`*v4W>jtUZU~)k*SsB@+j;nP{AC@RGr=PS zk>ZhpfJv%=P(px!Q_U1*z)VHJfC3@M0vU1huoi%YikYth35ElSiCOyNd5p=oE#`K< zAk19Wg4cUAimYi71KQw5X)Eb;{~ zt5_3(rQ)V>Q&EIDflw19&C=$Kn_W;t#eD0IBcDkPBBaKnyBU$Ss-OQumJRf5?_9p# zY_+=Qax<#hp(Ve~@Ln#(Bp(nV7tkO1{_dUQ3YM+@+j6C9=rh`Z=!Ul~i)W;~gsxliS=} zIVT?-S>%_#iOFYD$@duE=Mv;LcXv*n5}6&#=P~*0Nb(bfenYtA_4{zwIQ}$tA0Cr) zs3`^^6$Bs{K@^$eBalF88iD|mdj|`DL%-zT-jHXt;j_KFz(HZx|Mi69p3wG*x$q>y zg@mwc_XdSsTbK(^B3w)e&yNtE&xPX{U$~SIcFo+Nuxt8q;Yoxm31Qc}4GOy+FBhIf zcqSq2TD3u8*V5&}lL*fy8f0V?|0){cftzZZ`^4n-kJTVIR#(>QD+jy68VDJG01+@9 zaufo}m_$g?P%3~0Ckk*bA%p=5B1j^NQ5ez)Acl1aLjVbd+}FO3&@Np_@8Z37R)c)u zjn`)eFRGjC>#goZb#uLHhbIbs-lc(yYBvqPjJ2qT$R$QnB7&7YaUkFe9%8L2B$xph t20kJ*Q~>TgGrLEXh=s+W|N1%-BIXviFYtvMyr=F7{{t-X9&VGC2q1L3<=g-O delta 2129 zcmV-X2(I_V6t)q6ABzYC000000RIL6LPG)o`T>oU&uSDw5XMU+(aFUq-n=Z{9lERk zFGQkANCI(4 zV&$YF6@00%nv)9DO9f}7LdjCZDooQ=Y*0uhh$6^^vZVrm#c?DU}qi>l6}=64J=- z7Tyr$fjJd_`yiOfOZW*0*0q}D(^A$t$3q{GA<+k=r8U6=A~A1|iEtP_7;4b){u8!)q99NV7mHHBE_W|IZ|7BZ$0{f4N$b38C_XZJ}Zv|XZk@+^j zLo%D+0=T%7Mk4<3b0vv%+{3Rl(g6?O?xtrV>F_y?bcn-M>Y!sAzTZmEM5tjUJ=4() ze==_oWB9S2*IDoj53-v^aN$elEfNdYxwn8SoZZgO1V-WOYI-KZ3FrPp`WFziYcNp- z0019*h7{{3o@vzUjYh5ad8M(^sMqQ%^;)gAxKghlsx5j4Zt=W`3J5X5 zj6)JBsg$6K5y&{v2n&X|l9Uk4DX?D&CBCFSkIb`5BcKIkNNKGUp8Ma$(dNeEr(b%1 zLA-kY#k0?ze&x)|-g{a((^QzFa!$t$sWA=UhL4H@z3{C0GuQjj_mlSL_)~w{Jvt;a zo01>${LbdCkSB;;sqPZT<=d9@oOmqaLW0;;=6+&V!E@rVh)W4#SBU$GUBS+OiN_)? z_Yps22yGlVXo$T|Z+*t=Zk6ac~}G@_IP zMU*nG68JtRTnF$%`b+I%WNls#TtJqoBGy)~8>)&p)^3oQv1+zAG4pNS4(AP=DX!l8 zl^CvI;aqOg;P!F94cF~%hl7=W49=m0gJ3Q6-&3eG+|&37^3VzBy3AC();r@EI)Fn*#-V5`D^jKK63Tyo&d?`3JQVSn6GT+uWBX zw>0vGwop4_6Q{*a#KlDAs2auaU?g^8- zZ}iHsw7gXBRFhr@FMpq#9HCrF{^UyJsVo-LbqBsTg_JIR|#e`-|F5FI^T}@ zW^8Wf;n&fZb!^TAj}SzPM+yQasRBX?0Rm1nQ5t|yBHy-{-uZ$sb6FRFMku0Po)rZ=U5}Rwk0U&l5O%Fvzp!iRa^Z1=tBD5r4x{*2(Fph5RNLGqCU;<@2D!1k zv{GL>)DhM|$bSHcfa#E<5KzV>LW+h`0W3ICfO82U3`h_`5>br8kVXJ8tV0+ANGRmK z_I-qQ=|XxJ@3pfU0oO)|x_s8GvEnBSJ$3;LbC%dsK;-og4VCuM$IIdT#pypS{6*>aOrVdV3~H HlO+luWLqm% From a2bb2a6f4757c57b33715f73e2619349bc261ce5 Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Thu, 7 Apr 2022 12:18:30 -0400 Subject: [PATCH 6/7] fix tests --- .../tools/walkers/qc/PostProcessReadsForRSEM.java | 3 ++- .../hellbender/tools/walkers/qc/ReadPair.java | 2 +- .../qc/PostProcessReadsForRSEMIntegrationTest.java | 11 +++++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java index 75357e1b8c6..f0f7af7035b 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -221,7 +221,8 @@ private List> groupSecondaryReads(final List final List> result = new ArrayList<>(read1Reads.size()); for (GATKRead read1 : read1Reads){ final List read2s = read2Reads.stream() - .filter(r -> r.getStart() == read1.getMateStart() + .filter(r -> r.contigsMatch(read1) + && r.getStart() == read1.getMateStart() && r.getMateStart() == read1.getStart()) .collect(Collectors.toList()); if (read2s.size() == 1){ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java index 7bc5b214c11..3051a699245 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java @@ -56,7 +56,7 @@ public void add(final GATKRead read) { } if (isPrimaryAlignment(read) && read.isFirstOfPair()) { - Utils.validate(this.firstOfPair != null, + Utils.validate(this.firstOfPair == null, "The primary firstOfPair is already set. Added read = " + read.getName()); this.firstOfPair = read; } else if (isPrimaryAlignment(read) && read.isSecondOfPair()) { diff --git a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java index f44bab1b5e3..48fafe2e988 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEMIntegrationTest.java @@ -27,10 +27,13 @@ public void test(){ final ReadsPathDataSource outputReadSource = new ReadsPathDataSource(output.toPath()); final Iterator outputSamIterator = outputReadSource.iterator(); - final List inputReadNames = Arrays.asList("SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:11804", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:27367", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:30906", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:36761", "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:9079", - "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10276:1689"); - // SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10276:1689 aligns to a transcript in MT - final int[] expectedReadPairCount = new int[] { 4, 5, 2, 1, 6, 0 }; + final List inputReadNames = Arrays.asList("SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:11804", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:27367", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:30906", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:36761", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10004:9079", + "SL-NVZ:HNJ2HDRXY211220:HNJ2HDRXY:2:2101:10276:1689"); // aligns to a transcript in MT + final int[] expectedReadPairCount = new int[] { 4, 5, 2, 1, 7, 0 }; String currentReadName = inputReadNames.get(0); int i = 0; int j = 0; From 2f7d12d6b2c4908a225deb953f511e0395ff16ff Mon Sep 17 00:00:00 2001 From: Takuto Sato Date: Fri, 8 Apr 2022 13:58:22 -0400 Subject: [PATCH 7/7] louis2 --- .../tools/walkers/qc/PostProcessReadsForRSEM.java | 9 ++++++--- .../hellbender/tools/walkers/qc/ReadPair.java | 4 +++- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java index f0f7af7035b..71f8ad9967f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/PostProcessReadsForRSEM.java @@ -52,13 +52,18 @@ * MarkDuplicates with REMOVE_DUPLICATES=true * * + * Usage Example: + * + * gatk PostProcessReadsForRSEM \ + * -I input.bam \ + * -O output.bam */ @CommandLineProgramProperties( summary = "Reorder reads before running RSEM", oneLineSummary = "Reorder reads before running RSEM", programGroup = ReadDataManipulationProgramGroup.class ) -@Experimental + @BetaFeature @DocumentedFeature public class PostProcessReadsForRSEM extends GATKTool { @@ -86,8 +91,6 @@ public void onTraversalStart(){ } writer = createSAMWriter(outSam, true); - - } @Override diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java index 3051a699245..87d991cd253 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/walkers/qc/ReadPair.java @@ -57,9 +57,11 @@ public void add(final GATKRead read) { if (isPrimaryAlignment(read) && read.isFirstOfPair()) { Utils.validate(this.firstOfPair == null, - "The primary firstOfPair is already set. Added read = " + read.getName()); + "The primary firstOfPair is already set. Read = " + read.getName()); this.firstOfPair = read; } else if (isPrimaryAlignment(read) && read.isSecondOfPair()) { + Utils.validate(this.secondOfPair == null, + "The primary secondOfPair is already set. Read = " + read.getName()); this.secondOfPair = read; } else if (read.isSecondaryAlignment()) { this.secondaryAlignments.add(read);