Skip to content

Commit

Permalink
(SV) output a single VCF for new interpretation tool, also: (#4996)
Browse files Browse the repository at this point in the history
* refactor some utilities and GATKSVVCFConstants and GATKSVVCFHeaderLines
  * group methods in StructuralVariationDiscoveryPipelineSpark by functionality
  * bring MAX_ALIGN_LENGTH and MAPPING_QUALITIES annotations from CPX variants to re-interpreted simple variants
* add new CLI argument and filter assembly-based variants based on annotations MAPPING_QUALITIES and MAX_ALIGN_LENGTH
  * filter out variants of size < 50
  • Loading branch information
SHuang-Broad authored Aug 2, 2018
1 parent 9afdd45 commit 2810890
Show file tree
Hide file tree
Showing 32 changed files with 687 additions and 317 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -269,11 +269,15 @@ public void validate() {

public enum SvEvidenceFilterType {DENSITY, XGBOOST}

public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection implements Serializable {
public static class DiscoverVariantsFromContigAlignmentsSparkArgumentCollection implements Serializable {
private static final long serialVersionUID = 1L;

public static final int GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY = STRUCTURAL_VARIANT_SIZE_LOWER_BOUND; // alignment with gap of size >= 50 will be broken apart.

// TODO: 7/30/18 the following two values essentially perform the same filtering, except one (CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD) is used in a tool (ContigChimericAlignmentIterativeInterpreter) that is about to be phased out, so move it when the kill switch is flipped
public static final int CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD = 60;
public static final int ASSEMBLY_ALIGNMENT_MQ_FILTER_DEFAULT = 30;

public static final int DEFAULT_MIN_ALIGNMENT_LENGTH = 50; // Minimum flanking alignment length filters used when going through contig alignments.
public static final int DEFAULT_ASSEMBLED_IMPRECISE_EVIDENCE_OVERLAP_UNCERTAINTY = 100;
public static final int DEFAULT_IMPRECISE_VARIANT_EVIDENCE_THRESHOLD = 7;
Expand All @@ -283,6 +287,9 @@ public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection
@Argument(doc = "Minimum flanking alignment length", fullName = "min-align-length")
public Integer minAlignLength = DEFAULT_MIN_ALIGNMENT_LENGTH;

@Argument(doc = "Minimum mapping quality of evidence assembly contig", shortName = "mq", fullName = "min-mq")
public Integer minMQ = ASSEMBLY_ALIGNMENT_MQ_FILTER_DEFAULT;

@Hidden
@Argument(doc = "VCF containing the true breakpoints used only for evaluation (not generation) of calls",
fullName = "truth-vcf", optional = true)
Expand All @@ -308,6 +315,10 @@ public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection
fullName = "max-callable-imprecise-deletion-size", optional=true)
public int maxCallableImpreciseVariantDeletionSize = DEFAULT_MAX_CALLABLE_IMPRECISE_DELETION_SIZE;

@Advanced
@Argument(doc = "Run interpretation tool in debug mode (more information print to screen)", fullName = "debug-mode", optional = true)
public Boolean runInDebugMode = false;

/**
* Explicit call this method.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource;
import org.broadinstitute.hellbender.engine.spark.GATKSparkTool;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.FindBreakpointEvidenceSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoverFromLocalAssemblyContigAlignmentsSpark;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputMetaData;
Expand Down Expand Up @@ -122,8 +122,8 @@ public class StructuralVariationDiscoveryPipelineSpark extends GATKSparkTool {
private final FindBreakpointEvidenceSparkArgumentCollection evidenceAndAssemblyArgs
= new FindBreakpointEvidenceSparkArgumentCollection();
@ArgumentCollection
private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs
= new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection();
private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs
= new DiscoverVariantsFromContigAlignmentsSparkArgumentCollection();

@Argument(doc = "sam file for aligned contigs", fullName = "contig-sam-file")
private String outputAssemblyAlignments;
Expand Down Expand Up @@ -153,13 +153,6 @@ protected void runTool( final JavaSparkContext ctx ) {

validateParams();

Utils.validate(evidenceAndAssemblyArgs.externalEvidenceFile == null || discoverStageArgs.cnvCallsFile == null,
"Please only specify one of externalEvidenceFile or cnvCallsFile");

if (discoverStageArgs.cnvCallsFile != null) {
evidenceAndAssemblyArgs.externalEvidenceFile = discoverStageArgs.cnvCallsFile;
}

JavaRDD<GATKRead> unfilteredReads = getUnfilteredReads();
final SAMFileHeader headerForReads = getHeaderForReads();

Expand Down Expand Up @@ -204,9 +197,18 @@ protected void runTool( final JavaSparkContext ctx ) {
}
}

// init ============================================================================================================

private void validateParams() {
evidenceAndAssemblyArgs.validate();
discoverStageArgs.validate();

Utils.validate(evidenceAndAssemblyArgs.externalEvidenceFile == null || discoverStageArgs.cnvCallsFile == null,
"Please only specify one of externalEvidenceFile or cnvCallsFile");

if (discoverStageArgs.cnvCallsFile != null) {
evidenceAndAssemblyArgs.externalEvidenceFile = discoverStageArgs.cnvCallsFile;
}
}

private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext ctx,
Expand All @@ -232,49 +234,43 @@ private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext
cnvCallsBroadcast, getHeaderForReads(), getReference(), localLogger);
}

/**
* Uses the input EvidenceTargetLinks to
* <ul>
* <li>
* either annotate the variants called from assembly discovery with split read and read pair evidence, or
* </li>
* <li>
* to call new imprecise variants if the number of pieces of evidence exceeds a given threshold.
* </li>
* </ul>
*
*/
private static List<VariantContext> processEvidenceTargetLinks(List<VariantContext> assemblyBasedVariants,
final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {

final List<VariantContext> annotatedVariants;
if (svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks() != null) {
final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceTargetLinks = svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks();
final ReadMetadata readMetadata = svDiscoveryInputMetaData.getSampleSpecificData().getReadMetadata();
final ReferenceMultiSource reference = svDiscoveryInputMetaData.getReferenceData().getReferenceBroadcast().getValue();
final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs();
final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger();
public static Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls(final JavaSparkContext ctx,
final SAMFileHeader header,
final String cnvCallsFile) {
final SVIntervalTree<VariantContext> cnvCalls;
if (cnvCallsFile != null) {
cnvCalls = CNVInputReader.loadCNVCalls(cnvCallsFile, header);
} else {
cnvCalls = null;
}

// annotate with evidence links
annotatedVariants = AnnotatedVariantProducer.
annotateBreakpointBasedCallsWithImpreciseEvidenceLinks(assemblyBasedVariants,
evidenceTargetLinks, readMetadata, reference, discoverStageArgs, toolLogger);
final Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls;
if (cnvCalls != null) {
broadcastCNVCalls = ctx.broadcast(cnvCalls);
} else {
broadcastCNVCalls = null;
}
return broadcastCNVCalls;
}

// then also imprecise deletion
final List<VariantContext> impreciseVariants = ImpreciseVariantDetector.
callImpreciseDeletionFromEvidenceLinks(evidenceTargetLinks, readMetadata, reference,
discoverStageArgs.impreciseVariantEvidenceThreshold,
discoverStageArgs.maxCallableImpreciseVariantDeletionSize,
toolLogger);
/**
* Makes a PairedStrandedIntervalTree from a list of EvidenceTargetLinks. The value of each entry in the resulting tree
* will be the original EvidenceTargetLink. If the input list is null, returns a null tree.
*/
private PairedStrandedIntervalTree<EvidenceTargetLink> makeEvidenceLinkTree(final List<EvidenceTargetLink> evidenceTargetLinks) {
final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceLinkTree;

annotatedVariants.addAll(impreciseVariants);
if (evidenceTargetLinks != null) {
evidenceLinkTree = new PairedStrandedIntervalTree<>();
evidenceTargetLinks.forEach(l -> evidenceLinkTree.put(l.getPairedStrandedIntervals(), l));
} else {
annotatedVariants = assemblyBasedVariants;
evidenceLinkTree = null;
}

return annotatedVariants;
return evidenceLinkTree;
}

// interpretation: assembly-based ==================================================================================

private static void experimentalInterpretation(final JavaSparkContext ctx,
final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults,
final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {
Expand All @@ -287,8 +283,11 @@ private static void experimentalInterpretation(final JavaSparkContext ctx,
SvDiscoverFromLocalAssemblyContigAlignmentsSpark.AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes
= SvDiscoverFromLocalAssemblyContigAlignmentsSpark.preprocess(svDiscoveryInputMetaData, assemblyRawAlignments);

SvDiscoverFromLocalAssemblyContigAlignmentsSpark.dispatchJobs(ctx, contigsByPossibleRawTypes,
svDiscoveryInputMetaData, assemblyRawAlignments, true);
final List<VariantContext> variants = SvDiscoverFromLocalAssemblyContigAlignmentsSpark
.dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, true);
contigsByPossibleRawTypes.unpersist();

SvDiscoverFromLocalAssemblyContigAlignmentsSpark.filterAndWriteMergedVCF(updatedOutputPath, variants, svDiscoveryInputMetaData);
}

private static JavaRDD<GATKRead> getContigRawAlignments(final JavaSparkContext ctx,
Expand All @@ -312,21 +311,50 @@ private static JavaRDD<GATKRead> getContigRawAlignments(final JavaSparkContext c
}

/**
* Makes a PairedStrandedIntervalTree from a list of EvidenceTargetLinks. The value of each entry in the resulting tree
* will be the original EvidenceTargetLink. If the input list is null, returns a null tree.
* Uses the input EvidenceTargetLinks to
* <ul>
* <li>
* either annotate the variants called from assembly discovery with split read and read pair evidence, or
* </li>
* <li>
* to call new imprecise variants if the number of pieces of evidence exceeds a given threshold.
* </li>
* </ul>
*
*/
private PairedStrandedIntervalTree<EvidenceTargetLink> makeEvidenceLinkTree(final List<EvidenceTargetLink> evidenceTargetLinks) {
final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceLinkTree;
private static List<VariantContext> processEvidenceTargetLinks(List<VariantContext> assemblyBasedVariants,
final SvDiscoveryInputMetaData svDiscoveryInputMetaData) {

if (evidenceTargetLinks != null) {
evidenceLinkTree = new PairedStrandedIntervalTree<>();
evidenceTargetLinks.forEach(l -> evidenceLinkTree.put(l.getPairedStrandedIntervals(), l));
final List<VariantContext> annotatedVariants;
if (svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks() != null) {
final PairedStrandedIntervalTree<EvidenceTargetLink> evidenceTargetLinks = svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks();
final ReadMetadata readMetadata = svDiscoveryInputMetaData.getSampleSpecificData().getReadMetadata();
final ReferenceMultiSource reference = svDiscoveryInputMetaData.getReferenceData().getReferenceBroadcast().getValue();
final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs();
final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger();

// annotate with evidence links
annotatedVariants = AnnotatedVariantProducer.
annotateBreakpointBasedCallsWithImpreciseEvidenceLinks(assemblyBasedVariants,
evidenceTargetLinks, readMetadata, reference, discoverStageArgs, toolLogger);

// then also imprecise deletion
final List<VariantContext> impreciseVariants = ImpreciseVariantDetector.
callImpreciseDeletionFromEvidenceLinks(evidenceTargetLinks, readMetadata, reference,
discoverStageArgs.impreciseVariantEvidenceThreshold,
discoverStageArgs.maxCallableImpreciseVariantDeletionSize,
toolLogger);

annotatedVariants.addAll(impreciseVariants);
} else {
evidenceLinkTree = null;
annotatedVariants = assemblyBasedVariants;
}
return evidenceLinkTree;

return annotatedVariants;
}

// parser ==========================================================================================================

public static final class InMemoryAlignmentParser extends AlignedContigGenerator implements Serializable {
private static final long serialVersionUID = 1L;

Expand Down Expand Up @@ -384,8 +412,7 @@ public static JavaRDD<AlignedContig> filterAndConvertToAlignedContigViaSAM(final
SvDiscoverFromLocalAssemblyContigAlignmentsSpark.
SAMFormattedContigAlignmentParser.
parseReadsAndOptionallySplitGappedAlignments(forOneContig,
StructuralVariationDiscoveryArgumentCollection
.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection
DiscoverVariantsFromContigAlignmentsSparkArgumentCollection
.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY,
true));
}
Expand Down Expand Up @@ -447,8 +474,7 @@ private static List<AlignmentInterval> getAlignmentsForOneContig(final String co
.map(bwaMemAlignment -> BwaMemAlignmentUtils.applyAlignment(contigName, contigSequence, null,
null, bwaMemAlignment, refNames, header, false, false))
.map(AlignmentInterval::new)
.map(ar -> ContigAlignmentsModifier.splitGappedAlignment(ar, StructuralVariationDiscoveryArgumentCollection
.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection
.map(ar -> ContigAlignmentsModifier.splitGappedAlignment(ar, DiscoverVariantsFromContigAlignmentsSparkArgumentCollection
.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, contigSequence.length))
.flatMap(Utils::stream).collect(Collectors.toList());
}
Expand All @@ -464,23 +490,4 @@ public JavaRDD<AlignedContig> getAlignedContigs() {
}
}

public static Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls(final JavaSparkContext ctx,
final SAMFileHeader header,
final String cnvCallsFile) {
final SVIntervalTree<VariantContext> cnvCalls;
if (cnvCallsFile != null) {
cnvCalls = CNVInputReader.loadCNVCalls(cnvCallsFile, header);
} else {
cnvCalls = null;
}

final Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls;
if (cnvCalls != null) {
broadcastCNVCalls = ctx.broadcast(cnvCalls);
} else {
broadcastCNVCalls = null;
}
return broadcastCNVCalls;
}

}
Loading

0 comments on commit 2810890

Please sign in to comment.