Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

First draft for CEMBA type caller #5762

Merged
merged 18 commits into from
Apr 11, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
b9dd7cf
First draft for CEMBA type caller
benjamincarlin Mar 5, 2019
034b56b
Fixed PR comments
benjamincarlin Mar 12, 2019
4f4fe02
changed vcf constants, added default headerlines, updated outputfile
benjamincarlin Mar 15, 2019
a4d9f6f
added BetaFeature and DocumentedFeature
benjamincarlin Mar 15, 2019
d279150
changed class name and ensured ref context would not be null
benjamincarlin Mar 15, 2019
81436f6
made the methylation walker an experimental feature
benjamincarlin Mar 18, 2019
10613ce
fixed formating, updated variables as final, moved methylation vcf co…
benjamincarlin Mar 20, 2019
bb37fae
updated methylation walker class documentation
benjamincarlin Mar 20, 2019
4c76724
updated ref context name
benjamincarlin Mar 20, 2019
be065df
MethylationTypeCaller unit test first draft
benjamincarlin Apr 2, 2019
f70e1c3
changed test file location, addded data provider, assert by variant c…
benjamincarlin Apr 2, 2019
8b95d28
updated test file dir and changed named to ...IntegrationTest
benjamincarlin Apr 2, 2019
bf48595
fixed format/comments, changed file/var names, using getTestFile()/ge…
benjamincarlin Apr 3, 2019
08a5915
updated documentation
benjamincarlin Apr 3, 2019
dedec28
updated documentation
benjamincarlin Apr 3, 2019
5ec9ad3
changed ref test dir variable scope, removed comments/print lines
benjamincarlin Apr 9, 2019
5bdd470
removed ref test dir variable global
benjamincarlin Apr 9, 2019
81334de
specified type of input BAM (bisulfite sequenced, methylation-aware a…
benjamincarlin Apr 9, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package org.broadinstitute.hellbender.cmdline.programgroups;

import org.broadinstitute.barclay.argparser.CommandLineProgramGroup;
import org.broadinstitute.hellbender.utils.help.HelpConstants;

/**
* Tools that performs methylation calling and methylation-based coverage for bisulfite BAMs
*/
public class MethylationProgramGroup implements CommandLineProgramGroup {
@Override
public String getName() { return HelpConstants.DOC_CAT_METHYLATION_DISCOVERY; }

@Override
public String getDescription() { return HelpConstants.DOC_CAT_METHYLATION_DISCOVERY_SUMMARY; }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
package org.broadinstitute.hellbender.tools.walkers;

import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.*;
import org.broadinstitute.barclay.argparser.Argument;
import org.broadinstitute.barclay.argparser.ExperimentalFeature;
import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
import org.broadinstitute.barclay.help.DocumentedFeature;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.programgroups.MethylationProgramGroup;
import org.broadinstitute.hellbender.engine.*;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.pileup.ReadPileup;
import htsjdk.variant.variantcontext.writer.VariantContextWriter;
import htsjdk.samtools.SAMFileHeader;
import org.broadinstitute.hellbender.utils.BaseUtils;
import org.broadinstitute.hellbender.utils.variant.GATKVCFConstants;

import java.util.*;
import java.util.stream.Collectors;
/**
*
* <p>Identifies methylated bases from bisulfite sequencing data. Given a bisulfite sequenced, methylation-aware
* aligned BAM and a reference, it outputs methylation-site coverage to a specified output vcf file.</p>
*
* <h>Usage example</h>
* <pre>
* gatk MethylationTypeCaller \
* -R "GRCm38_primary_assembly_genome.fa \
* -I bisulfite_input.bam \
* -O output.vcf
* </pre>
*
* @author Benjamin Carlin
*/
@CommandLineProgramProperties(
summary = "Tool that prints methylation-based coverage from supplied bisulfite sequenced, methylation-aware aligned BAM to the specified output vcf file",
oneLineSummary = "Identify methylated bases from bisulfite sequenced, methylation-aware BAMs",
programGroup = MethylationProgramGroup.class
)
@DocumentedFeature
@ExperimentalFeature
public class MethylationTypeCaller extends LocusWalker {

@Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME, shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, doc = "Output VCF file")
private GATKPathSpecifier outputFile;

private VariantContextWriter vcfWriter;

private static final int REFERENCE_CONTEXT_LENGTH = 2;


@Override
public void onTraversalStart() {
vcfWriter = createVCFWriter(outputFile.toPath());
vcfWriter.writeHeader(createMethylationHeader(getHeaderForReads(), getDefaultToolVCFHeaderLines()));
}

private static VCFHeader createMethylationHeader(SAMFileHeader header, Set<VCFHeaderLine> headerLines) {
benjamincarlin marked this conversation as resolved.
Show resolved Hide resolved
if(header == null) {
throw new UserException.BadInput("Error writing header, getHeaderForReads() returns null");
}

final VCFInfoHeaderLine unconvertedCoverageLine = new VCFInfoHeaderLine(GATKVCFConstants.UNCONVERTED_BASE_COVERAGE_KEY, 1, VCFHeaderLineType.Integer, "Count of reads supporting methylation that are unconverted ");
final VCFInfoHeaderLine coverageLine = new VCFInfoHeaderLine(GATKVCFConstants.CONVERTED_BASE_COVERAGE_KEY, 1, VCFHeaderLineType.Integer, "Count of reads supporting methylation that are converted ");
final VCFInfoHeaderLine contextLine = new VCFInfoHeaderLine(GATKVCFConstants.METHYLATION_REFERENCE_CONTEXT_KEY, 1, VCFHeaderLineType.String, "Forward Strand Reference context");
final VCFInfoHeaderLine readDepthLine = VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY);
final VCFFormatHeaderLine gtLine = VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY);

headerLines.add(unconvertedCoverageLine);
headerLines.add(coverageLine);
headerLines.add(contextLine);
headerLines.add(readDepthLine);
headerLines.add(gtLine);

final List<String> samples = header.getReadGroups()
.stream()
.map(SAMReadGroupRecord::getSample)
.sorted()
.distinct()
.collect(Collectors.toList());

return new VCFHeader(headerLines, samples);
}

@Override
public void apply(AlignmentContext alignmentContext, ReferenceContext referenceContext, FeatureContext featureContext) {
final byte referenceBase = referenceContext.getBases()[0];
final int unconvertedBases;
final int convertedBases;
final byte alt;
byte[] context = null;

// check the forward strand for methylated coverage
if (referenceBase == (byte)'C') {
alt = (byte)'T';
final ReadPileup forwardBasePileup = alignmentContext.stratify(AlignmentContext.ReadOrientation.FORWARD).getBasePileup();
// unconverted: C, index=1; converted: T, index=3
final int[] forwardBaseCounts = forwardBasePileup.getBaseCounts();
unconvertedBases = forwardBaseCounts[BaseUtils.simpleBaseToBaseIndex((byte)'C')];
convertedBases = forwardBaseCounts[BaseUtils.simpleBaseToBaseIndex((byte)'T')];

// if there is methylated coverage
if (unconvertedBases + convertedBases > 0) {
context = referenceContext.getBases(0, REFERENCE_CONTEXT_LENGTH);
}
}
// check the reverse strand for methylated coverage
else if (referenceBase == (byte)'G') {
alt = (byte)'A';
final ReadPileup reverseBasePileup = alignmentContext.stratify(AlignmentContext.ReadOrientation.REVERSE).getBasePileup();
// unconverted: G, index=2; converted: A, index=0
final int[] reverseBaseCounts = reverseBasePileup.getBaseCounts();
unconvertedBases = reverseBaseCounts[BaseUtils.simpleBaseToBaseIndex((byte)'G')];
convertedBases = reverseBaseCounts[BaseUtils.simpleBaseToBaseIndex((byte)'A')];

// if there is methylated coverage
if (unconvertedBases + convertedBases > 0) {
// get the reverse complement for context b/c we are on the reverse strand
context = BaseUtils.simpleReverseComplement(referenceContext.getBases(REFERENCE_CONTEXT_LENGTH,0));
}
}
// if reference strand does not support methylation
else {
return;
}
benjamincarlin marked this conversation as resolved.
Show resolved Hide resolved

// if there are reads that have methylated coverage
if (context != null) {
final LinkedHashSet<Allele> alleles = new LinkedHashSet<>();
alleles.add(Allele.create(referenceBase, true));
alleles.add(Allele.create(alt, false));

final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(alignmentContext.getContig());
vcb.start(alignmentContext.getPosition());
vcb.stop(alignmentContext.getPosition());
vcb.noID();
vcb.alleles(alleles);
vcb.noGenotypes();
vcb.unfiltered();
vcb.attribute(GATKVCFConstants.UNCONVERTED_BASE_COVERAGE_KEY, unconvertedBases);
vcb.attribute(GATKVCFConstants.CONVERTED_BASE_COVERAGE_KEY, convertedBases);
vcb.attribute(GATKVCFConstants.METHYLATION_REFERENCE_CONTEXT_KEY, new String(context));
vcb.attribute(VCFConstants.DEPTH_KEY, alignmentContext.size());

// write to VCF
final VariantContext vc = vcb.make();
vcfWriter.add(vc);
}
}

@Override
public void closeTool() {
if (vcfWriter != null ) {
vcfWriter.close();
}
}

@Override
public boolean requiresReference() {
return true;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@ public static String forumPost(String post) {
public final static String DOC_CAT_RNA = "RNA-Specific Tools";
public final static String DOC_CAT_RNA_SUMMARY = "Tools intended to be used for processing RNA data.";

public final static String DOC_CAT_METHYLATION_DISCOVERY = "Methylation-Specific Tools";
public final static String DOC_CAT_METHYLATION_DISCOVERY_SUMMARY = "Tools that perform methylation calling, processing bisulfite sequenced, methylation-aware aligned BAM";

// End GATK Program groups

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,12 @@ public final class GATKVCFConstants {
public static final String MEDIAN_FRAGMENT_LENGTH_KEY = "MFRL";
public static final String MEDIAN_READ_POSITON_KEY = "MPOS";

// Methylation-specific INFO Keys
public static final String UNCONVERTED_BASE_COVERAGE_KEY = "UNCONVERTED_BASE_COV";
public static final String CONVERTED_BASE_COVERAGE_KEY = "CONVERTED_BASE_COV";
public static final String METHYLATION_REFERENCE_CONTEXT_KEY = "REFERENCE_CONTEXT";


// FORMAT keys
public static final String ALLELE_BALANCE_KEY = "AB";
public static final String PL_FOR_ALL_SNP_ALLELES_KEY = "APL";
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
package org.broadinstitute.hellbender.tools.walkers;

import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;

public class MethylationTypeCallerIntegrationTest extends CommandLineProgramTest {

@Test(dataProvider="getMethylationTypeCallerTestInput")
public void testBasicMethylationCoverage(final String inputFileName, final String referenceFileName) throws Exception {

final File outputVCF = createTempFile("testBasicMethylationCoverageVCF", ".vcf");
final File expectedVCF = getTestFile("chr14_subset.methylC_seq.expected.vcf");

final String[] inputArgs = {
"-I", inputFileName,
"-R", referenceFileName,
"-O", outputVCF.getAbsolutePath(),
"--" + StandardArgumentDefinitions.ADD_OUTPUT_VCF_COMMANDLINE, "false"
};

runCommandLine(inputArgs);

// Test for an exact match against past results
IntegrationTestSpec.assertEqualTextFiles(outputVCF, expectedVCF);
}

@DataProvider
public Object[][] getMethylationTypeCallerTestInput() {
final String inputFileName = getToolTestDataDir() + "chr14_subset.methylC_seq.bam";
final String referenceFileName = largeFileTestDir + "GRCm38_primary_assembly_genome/chr14.GRCm38.primary_assembly.genome.fa.gz";
return new Object[][] {
{inputFileName, referenceFileName}
};
}
}

Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Binary file not shown.
Binary file not shown.
Loading