LocusDepthtoBAF tool #7776
Changes from 2 commits
LocusDepth.java
@@ -2,6 +2,7 @@
 import com.google.common.annotations.VisibleForTesting;
 import htsjdk.samtools.util.Locatable;
+import org.broadinstitute.hellbender.utils.Utils;

 import java.util.*;

@@ -11,25 +12,36 @@ public final class LocusDepth implements SVFeature {
     private final String contig;
     private final int position;
     private final String sample;
-    private final byte refCall; // index into nucleotideValues
+    // next three fields index "ACGT"
Review comment: Can you create an enum type for this?
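One shape the suggested enum could take: a sketch, assuming GATK's existing org.broadinstitute.hellbender.utils.Nucleotide enum (which the readNextLocus change further down already uses via Nucleotide.decode) declares A, C, G, T as its first four constants — the same assumption the 0..3 range checks below rely on. The class and member names here are hypothetical, not part of the PR:

```java
import org.broadinstitute.hellbender.utils.Nucleotide;

// Hypothetical: type the ref/alt calls with the Nucleotide enum instead of raw ints.
public final class LocusDepthEnumSketch {
    private final Nucleotide refCall;          // replaces int refIndex
    private final Nucleotide altCall;          // replaces int altIndex
    private final int[] depths = new int[4];   // still indexed by Nucleotide.ordinal()

    public LocusDepthEnumSketch( final Nucleotide refCall, final Nucleotide altCall ) {
        // The enum type makes the 0..3 range checks mostly unnecessary; only
        // standard-base membership and distinctness still need validating.
        if ( refCall == altCall || refCall.ordinal() > 3 || altCall.ordinal() > 3 ) {
            throw new IllegalArgumentException("ref and alt must be distinct standard bases");
        }
        this.refCall = refCall;
        this.altCall = altCall;
    }

    public int getRefDepth() { return depths[refCall.ordinal()]; }
    public int getAltDepth() { return depths[altCall.ordinal()]; }
}
```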
+    private final int refIndex;
+    private final int altIndex;
     private final int[] depths;

     public final static String BCI_VERSION = "1.0";

-    public LocusDepth( final Locatable loc, final String sample, final byte refCall ) {
+    public LocusDepth( final Locatable loc, final String sample, final int refIndex, final int altIndex ) {
+        Utils.validateArg(refIndex >= 0 && refIndex <= 3, "refIndex must be between 0 and 3");
Review comment: How do you feel about …
+        Utils.validateArg(altIndex >= 0 && altIndex <= 3, "altIndex must be between 0 and 3");
+        Utils.validateArg(refIndex != altIndex, "refIndex and altIndex must be different");
         this.contig = loc.getContig();
         this.position = loc.getStart();
         this.sample = sample;
-        this.refCall = refCall;
+        this.refIndex = refIndex;
+        this.altIndex = altIndex;
         this.depths = new int[4];
     }

     public LocusDepth( final String contig, final int position,
-                       final String sample, final byte refCall,
+                       final String sample, final int refIndex, final int altIndex,
                        final int aDepth, final int cDepth, final int gDepth, final int tDepth ) {
+        Utils.validateArg(refIndex >= 0 && refIndex <= 3, "refIndex must be between 0 and 3");
+        Utils.validateArg(altIndex >= 0 && altIndex <= 3, "altIndex must be between 0 and 3");
+        Utils.validateArg(refIndex != altIndex, "refIndex and altIndex must be different");
         this.contig = contig;
         this.position = position;
         this.sample = sample;
-        this.refCall = refCall;
+        this.refIndex = refIndex;
+        this.altIndex = altIndex;
         this.depths = new int[4];
         depths[0] = aDepth;
         depths[1] = cDepth;

@@ -57,13 +69,19 @@ public int getStart() {
     }

     public String getSample() { return sample; }
-    public char getRefCall() {
-        return (char)refCall;
+    // index into "ACGT"
+    public int getRefIndex() {
+        return refIndex;
     }
+    // index into "ACGT"
+    public int getAltIndex() {
+        return altIndex;
+    }
     public int getADepth() { return depths[0]; }
     public int getCDepth() { return depths[1]; }
     public int getGDepth() { return depths[2]; }
     public int getTDepth() { return depths[3]; }
     public int getTotalDepth() { return depths[0] + depths[1] + depths[2] + depths[3]; }
+    // idx is index into "ACGT"
+    public int getDepth( final int idx ) { return depths[idx]; }
+    public int getRefDepth() { return depths[refIndex]; }
+    public int getAltDepth() { return depths[altIndex]; }

     @Override
     public LocusDepth extractSamples( final Set<String> sampleNames, final Object header ) {

@@ -77,7 +95,7 @@ public boolean equals( final Object obj ) {
     }

     public boolean equals( final LocusDepth that ) {
-        return this.position == that.position && this.refCall == that.refCall &&
+        return this.position == that.position && this.refIndex == that.refIndex &&
             this.depths[0] == that.depths[0] && this.depths[1] == that.depths[1] &&
             this.depths[2] == that.depths[2] && this.depths[3] == that.depths[3] &&
             Objects.equals(this.contig, that.contig) &&

@@ -86,12 +104,12 @@ public boolean equals( final LocusDepth that ) {

     @Override
     public int hashCode() {
-        return Objects.hash(contig, position, sample, refCall, Arrays.hashCode(depths));
+        return Objects.hash(contig, position, sample, refIndex, Arrays.hashCode(depths));
     }

     @Override
     public String toString() {
-        return contig + "\t" + position + "\t" + sample + "\t" + (char)refCall + "\t" +
+        // charAt maps the index back to its base letter; a raw (char) cast of the index would not
+        return contig + "\t" + position + "\t" + sample + "\t" + "ACGT".charAt(refIndex) + "\t" +
             depths[0] + "\t" + depths[1] + "\t" + depths[2] + "\t" + depths[3];
     }
 }

LocusDepthtoBAF.java (new file)
@@ -0,0 +1,200 @@
package org.broadinstitute.hellbender.tools.sv;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.tribble.Feature;
import org.apache.commons.math3.distribution.ChiSquaredDistribution;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.MathUtils.RunningAverage;
import org.broadinstitute.hellbender.utils.codecs.FeatureOutputCodec;
import org.broadinstitute.hellbender.utils.codecs.FeatureOutputCodecFinder;
import org.broadinstitute.hellbender.utils.codecs.FeatureSink;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

/**
 * <p>Merges locus-sorted LocusDepth evidence files, calculates the bi-allelic frequency (BAF)
 * for each sample and site, and writes these values as a BafEvidence output file.</p>
Review comment: Add a description of the filtering methodology
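For orientation, the filtering the reviewer wants documented can be read off the code below: sites with total depth under min-total-depth are skipped; a Pearson chi-squared goodness-of-fit test against an expected 50/50 ref/alt split drops sites unlikely to be biallelic and heterozygous (min-het-likelihood); loci whose per-sample BAFs vary with a standard deviation above max-std are discarded entirely; and the BAFs at each surviving multi-sample locus are shifted so their median sits at 0.5.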
 *
 * <h3>Inputs</h3>
 *
 * <ul>
 *     <li>
 *         One or more LocusDepth evidence files, or a file containing a list of evidence files, one per line.
 *     </li>
 * </ul>
 *
 * <h3>Output</h3>
 *
 * <ul>
 *     <li>
 *         An output file containing merged BafEvidence from the inputs.
 *     </li>
 * </ul>
 *
 * <h3>Usage example</h3>
 *
 * <pre>
 *     gatk LocusDepthtoBAF \
 *       -F file1.ld.txt.gz [-F file2.ld.txt.gz ...] \
 *       -O merged.baf.bci
 * </pre>
 *
 * @author Ted Sharpe <[email protected]>
 */
@CommandLineProgramProperties(
        summary = "Convert LocusDepth to BafEvidence",
        oneLineSummary = "Convert LocusDepth to BafEvidence",
        programGroup = StructuralVariantDiscoveryProgramGroup.class
)
@ExperimentalFeature
public class LocusDepthtoBAF extends MultiFeatureWalker<LocusDepth> {
    public static final String LOCUS_DEPTH_FILE_NAME = "locus-depth";
    public static final String SAMPLE_NAMES_NAME = "sample-names";
    public static final String BAF_EVIDENCE_FILE_NAME = "baf-evidence-output";
    public static final String MIN_TOTAL_DEPTH_NAME = "min-total-depth";
    public static final String MAX_STD_NAME = "max-std";
    public static final String MIN_HET_LIKELIHOOD = "min-het-likelihood";
    public static final String COMPRESSION_LEVEL_NAME = "compression-level";
    // 1 degree of freedom; the null RandomGenerator is unused since only the CDF is evaluated
    public static ChiSquaredDistribution chiSqDist = new ChiSquaredDistribution(null, 1.);

    @Argument(
            doc = "Locus depth files to process",
            fullName = LOCUS_DEPTH_FILE_NAME,
            shortName = StandardArgumentDefinitions.FEATURE_SHORT_NAME )
    private List<FeatureInput<LocusDepth>> locusDepthFiles;

    @Argument(fullName = SAMPLE_NAMES_NAME, doc = "Sample names", optional = true)
Review comment: Can you elaborate on what this does in the doc? Is it used for subsetting?
    private List<String> sampleNames;

    @Argument(
            doc = "BAF output file",
            fullName = BAF_EVIDENCE_FILE_NAME,
            shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME )
    private GATKPath bafOutputFile;

    @Argument(
            doc = "Maximum standard deviation across bafs at a locus (otherwise locus will be ignored)",
            fullName = MAX_STD_NAME, optional = true )
    private double maxStdDev = .2;

    @Argument(
            doc = "Minimum total depth at a locus (otherwise locus will be ignored)",
            fullName = MIN_TOTAL_DEPTH_NAME, optional = true )
    private int minTotalDepth = 10;

    @Argument(
            doc = "Minimum likelihood of site being biallelic and heterozygous",
            fullName = MIN_HET_LIKELIHOOD, optional = true )
    private double minHetLikelihood = .5;
Review comment: I think this should be a probability instead of likelihood

    @Argument(
            doc = "Output compression level",
            fullName = COMPRESSION_LEVEL_NAME,
            minValue = 0, maxValue = 9, optional = true )
    private int compressionLevel = 4;

    private FeatureSink<BafEvidence> writer;
    private final List<BafEvidence> sameLocusBuffer = new ArrayList<>();

||||||
@Override | ||||||
@SuppressWarnings("unchecked") | ||||||
public void onTraversalStart() { | ||||||
super.onTraversalStart(); | ||||||
final FeatureOutputCodec<? extends Feature, ? extends FeatureSink<? extends Feature>> codec = | ||||||
FeatureOutputCodecFinder.find(bafOutputFile); | ||||||
if ( !codec.getFeatureType().isAssignableFrom(BafEvidence.class) ) { | ||||||
throw new UserException("We're intending to write BafEvidence, but the feature type " + | ||||||
"associated with the output file expects features of type " + | ||||||
codec.getFeatureType().getSimpleName()); | ||||||
} | ||||||
writer = (FeatureSink<BafEvidence>)codec.makeSink(bafOutputFile, | ||||||
getBestAvailableSequenceDictionary(), | ||||||
sampleNames, compressionLevel); | ||||||
} | ||||||
|
||||||
@Override | ||||||
public void apply( LocusDepth locusDepth, Object header, | ||||||
ReadsContext readsContext, ReferenceContext referenceContext ) { | ||||||
final double baf = calcBAF(locusDepth); | ||||||
if ( baf > 0. ) { | ||||||
final BafEvidence bafEvidence = | ||||||
new BafEvidence(locusDepth.getSample(), locusDepth.getContig(), | ||||||
locusDepth.getStart(), baf); | ||||||
if ( !sameLocus(bafEvidence) ) { | ||||||
processBuffer(); | ||||||
} | ||||||
sameLocusBuffer.add(bafEvidence); | ||||||
} | ||||||
} | ||||||

    @Override
    public Object onTraversalSuccess() {
        super.onTraversalSuccess();
        if ( !sameLocusBuffer.isEmpty() ) {
            processBuffer();
        }
        writer.close();
        return null;
    }

    @VisibleForTesting double calcBAF( final LocusDepth locusDepth ) {
        final int totalDepth = locusDepth.getTotalDepth();
        if ( totalDepth < minTotalDepth ) {
            return 0.;
        }
        double expectRefAlt = totalDepth / 2.;
Review comment: Suggested change: …
        final double refDiff = locusDepth.getRefDepth() - expectRefAlt;
        final double altDiff = locusDepth.getAltDepth() - expectRefAlt;
        double chiSq = (refDiff * refDiff + altDiff * altDiff) / expectRefAlt;
Review comment: Suggested change: …
        double fitProb = 1. - chiSqDist.cumulativeProbability(chiSq);
Review comment: Add comment that this is a Pearson chi-squared test
Review comment: Suggested change: …
        if ( fitProb < minHetLikelihood ) {
            return 0.;
        }
        return (double)locusDepth.getAltDepth() / totalDepth;
Review comment: Suggested change: …
Review comment: I can never remember if casts get applied before division
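To settle the reviewer's question (a general Java fact, not anything specific to this PR): a unary cast binds more tightly than `/`, so `(double)locusDepth.getAltDepth() / totalDepth` performs floating-point division. A two-line check:

```java
public class CastPrecedence {
    public static void main( String[] args ) {
        final int alt = 1, total = 3;
        System.out.println((double)alt / total);    // 0.333... : the cast applies to alt first
        System.out.println((double)(alt / total));  // 0.0      : integer division happens first
    }
}
```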
    }
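To make the het test concrete, a standalone computation with invented depths: a 20/10 ref/alt split (other bases zero) gives χ² = (5² + 5²)/15 ≈ 3.33 and a fit probability of about 0.068, below the default min-het-likelihood of 0.5, so calcBAF would return 0 for that site. A minimal sketch, not part of the PR:

```java
import org.apache.commons.math3.distribution.ChiSquaredDistribution;

public class HetTestExample {
    public static void main( String[] args ) {
        final ChiSquaredDistribution chiSqDist = new ChiSquaredDistribution(1.); // df = 1
        final double expect = 15.;                       // totalDepth 30, expect 50/50 split
        final double refDiff = 20 - expect;              // ref depth 20
        final double altDiff = 10 - expect;              // alt depth 10
        final double chiSq = (refDiff * refDiff + altDiff * altDiff) / expect;   // ~3.33
        final double fitProb = 1. - chiSqDist.cumulativeProbability(chiSq);      // ~0.068
        System.out.println(fitProb < .5 ? "site rejected" : "site kept");        // rejected
    }
}
```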

    private boolean sameLocus( final BafEvidence bafEvidence ) {
        if ( sameLocusBuffer.isEmpty() ) {
            return true;
        }
        final BafEvidence curLocus = sameLocusBuffer.get(0);
        return curLocus.getContig().equals(bafEvidence.getContig()) &&
                curLocus.getStart() == bafEvidence.getStart();
    }

    private void processBuffer() {
        int nBafs = sameLocusBuffer.size();
        if ( nBafs == 1 ) {
            // a lone sample at a locus is re-centered to the expected value of .5
            writer.write(new BafEvidence(sameLocusBuffer.get(0), .5));
        } else {
            final RunningAverage ra = new RunningAverage();
            for ( final BafEvidence bafEvidence : sameLocusBuffer ) {
                ra.add(bafEvidence.getValue());
            }
            final double stddev = ra.stddev();
            if ( stddev <= maxStdDev ) {
                final double[] vals = new double[nBafs];
                for ( int idx = 0; idx != nBafs; ++idx ) {
                    vals[idx] = sameLocusBuffer.get(idx).getValue();
                }
                Arrays.sort(vals);
                int midIdx = nBafs / 2;
                double median =
                        (nBafs & 1) != 0 ? vals[midIdx] : (vals[midIdx] + vals[midIdx - 1])/2.;
                // shift all BAFs at this locus so that their median sits at .5
                final double adj = .5 - median;
                for ( final BafEvidence bafEvidence : sameLocusBuffer ) {
                    writer.write(new BafEvidence(bafEvidence, bafEvidence.getValue()+adj));
                }
            }
        }
        sameLocusBuffer.clear();
    }
}
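To trace the median re-centering with invented numbers: three samples with BAFs 0.42, 0.46, and 0.60 at one locus have a standard deviation of roughly 0.08–0.09 (depending on the variance convention RunningAverage uses), so the locus survives the max-std filter; the median is 0.46, the adjustment is +0.04, and the written values are 0.46, 0.50, and 0.64. A standalone sketch of just that arithmetic:

```java
import java.util.Arrays;

public class MedianRecenterExample {
    public static void main( String[] args ) {
        final double[] bafs = { 0.60, 0.42, 0.46 };  // invented per-sample BAFs at one locus
        Arrays.sort(bafs);                           // -> 0.42, 0.46, 0.60
        final int midIdx = bafs.length / 2;
        final double median = (bafs.length & 1) != 0
                ? bafs[midIdx] : (bafs[midIdx] + bafs[midIdx - 1]) / 2.;
        final double adj = .5 - median;              // 0.5 - 0.46 = 0.04
        for ( final double baf : bafs ) {
            System.out.printf("%.2f -> %.2f%n", baf, baf + adj);  // 0.46, 0.50, 0.64
        }
    }
}
```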

@@ -5,6 +5,7 @@
 import htsjdk.samtools.CigarOperator;
 import htsjdk.samtools.SAMSequenceDictionary;
 import htsjdk.samtools.util.Locatable;
+import htsjdk.variant.variantcontext.Allele;
 import htsjdk.variant.variantcontext.VariantContext;
 import org.broadinstitute.barclay.argparser.Argument;
 import org.broadinstitute.barclay.argparser.BetaFeature;

@@ -694,14 +695,16 @@ private boolean readNextLocus() {
             return false;
         }
         VariantContext snp = snpSourceItr.next();
-        while ( !snp.isSNP() ) {
+        while ( !snp.isSNP() || !snp.isBiallelic() ) {
Review comment: Let's future-proof this and allow multiallelic by default, but have a command line flag to restrict only to biallelic
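One shape the proposed flag could take, following the Barclay @Argument pattern used elsewhere in the PR; the argument name, field name, and holder class are hypothetical:

```java
import org.broadinstitute.barclay.argparser.Argument;

public class BiallelicFlagSketch {  // hypothetical; in practice the field lives in the tool class
    @Argument(
            doc = "Restrict processing to biallelic SNPs (multiallelic sites allowed by default)",
            fullName = "biallelic-only", optional = true )
    private boolean biallelicOnly = false;

    // The loop test above would then become:
    //     while ( !snp.isSNP() || (biallelicOnly && !snp.isBiallelic()) ) { ... }
}
```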
             if ( !snpSourceItr.hasNext() ) {
                 return false;
             }
             snp = snpSourceItr.next();
         }
-        final byte[] refSeq = snp.getReference().getBases();
-        final LocusDepth locusDepth = new LocusDepth(snp, sampleName, refSeq[0]);
+        final List<Allele> alleles = snp.getAlleles();
+        final int refIndex = Nucleotide.decode(alleles.get(0).getBases()[0]).ordinal();
+        final int altIndex = Nucleotide.decode(alleles.get(1).getBases()[0]).ordinal();
+        final LocusDepth locusDepth = new LocusDepth(snp, sampleName, refIndex, altIndex);
         locusDepthQueue.add(locusDepth);
         return true;
     }
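A quick sanity check of the ordinal() trick, assuming (as the 0..3 range checks in LocusDepth do) that GATK's Nucleotide enum declares A, C, G, T as its first four constants:

```java
import org.broadinstitute.hellbender.utils.Nucleotide;

public class NucleotideOrdinalCheck {
    public static void main( String[] args ) {
        // expected mapping into "ACGT": A->0, C->1, G->2, T->3
        for ( final byte base : new byte[] { 'A', 'C', 'G', 'T' } ) {
            System.out.println((char)base + " -> " + Nucleotide.decode(base).ordinal());
        }
    }
}
```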
Review comment: Can you please add a quick unit test for this?
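The request targets readNextLocus, which needs engine scaffolding to test directly, but the underlying LocusDepth changes can be exercised with a quick test like the one below. TestNG is assumed (it is used throughout GATK's test suite), and the class and method names are invented:

```java
import org.testng.Assert;
import org.testng.annotations.Test;

public class LocusDepthUnitTestSketch {
    @Test
    public void testRefAltIndexing() {
        // refIndex 0 = A, altIndex 1 = C; depths A=7, C=5, G=0, T=0
        final LocusDepth ld = new LocusDepth("chr1", 100, "sample1", 0, 1, 7, 5, 0, 0);
        Assert.assertEquals(ld.getRefDepth(), 7);
        Assert.assertEquals(ld.getAltDepth(), 5);
        Assert.assertEquals(ld.getTotalDepth(), 12);
    }

    @Test(expectedExceptions = IllegalArgumentException.class)
    public void testRejectsEqualRefAndAlt() {
        // Utils.validateArg throws IllegalArgumentException when ref == alt
        new LocusDepth("chr1", 100, "sample1", 2, 2, 0, 0, 0, 0);
    }
}
```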