diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryArgumentCollection.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryArgumentCollection.java index 50b5fd3ac5b..f82537fa22a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryArgumentCollection.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryArgumentCollection.java @@ -269,11 +269,15 @@ public void validate() { public enum SvEvidenceFilterType {DENSITY, XGBOOST} - public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection implements Serializable { + public static class DiscoverVariantsFromContigAlignmentsSparkArgumentCollection implements Serializable { private static final long serialVersionUID = 1L; public static final int GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY = STRUCTURAL_VARIANT_SIZE_LOWER_BOUND; // alignment with gap of size >= 50 will be broken apart. + + // TODO: 7/30/18 the following two values essentially perform the same filtering, except one (CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD) is used in a tool (ContigChimericAlignmentIterativeInterpreter) that is about to be phased out, so move it when the kill switch is flipped public static final int CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD = 60; + public static final int ASSEMBLY_ALIGNMENT_MQ_FILTER_DEFAULT = 30; + public static final int DEFAULT_MIN_ALIGNMENT_LENGTH = 50; // Minimum flanking alignment length filters used when going through contig alignments. 
public static final int DEFAULT_ASSEMBLED_IMPRECISE_EVIDENCE_OVERLAP_UNCERTAINTY = 100; public static final int DEFAULT_IMPRECISE_VARIANT_EVIDENCE_THRESHOLD = 7; @@ -283,6 +287,9 @@ public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection @Argument(doc = "Minimum flanking alignment length", fullName = "min-align-length") public Integer minAlignLength = DEFAULT_MIN_ALIGNMENT_LENGTH; + @Argument(doc = "Minimum mapping quality of evidence assembly contig", shortName = "mq", fullName = "min-mq") + public Integer minMQ = ASSEMBLY_ALIGNMENT_MQ_FILTER_DEFAULT; + @Hidden @Argument(doc = "VCF containing the true breakpoints used only for evaluation (not generation) of calls", fullName = "truth-vcf", optional = true) @@ -308,6 +315,10 @@ public static class DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection fullName = "max-callable-imprecise-deletion-size", optional=true) public int maxCallableImpreciseVariantDeletionSize = DEFAULT_MAX_CALLABLE_IMPRECISE_DELETION_SIZE; + @Advanced + @Argument(doc = "Run interpretation tool in debug mode (more information print to screen)", fullName = "debug-mode", optional = true) + public Boolean runInDebugMode = false; + /** * Explicit call this method. 
*/ diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java index 5dd37523e4e..3811e1ac5da 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/StructuralVariationDiscoveryPipelineSpark.java @@ -18,8 +18,8 @@ import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import org.broadinstitute.hellbender.exceptions.GATKException; -import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection; import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.FindBreakpointEvidenceSparkArgumentCollection; +import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection; import org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer; import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoverFromLocalAssemblyContigAlignmentsSpark; import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputMetaData; @@ -122,8 +122,8 @@ public class StructuralVariationDiscoveryPipelineSpark extends GATKSparkTool { private final FindBreakpointEvidenceSparkArgumentCollection evidenceAndAssemblyArgs = new FindBreakpointEvidenceSparkArgumentCollection(); @ArgumentCollection - private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs - = new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection(); + private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs + = new 
DiscoverVariantsFromContigAlignmentsSparkArgumentCollection(); @Argument(doc = "sam file for aligned contigs", fullName = "contig-sam-file") private String outputAssemblyAlignments; @@ -153,13 +153,6 @@ protected void runTool( final JavaSparkContext ctx ) { validateParams(); - Utils.validate(evidenceAndAssemblyArgs.externalEvidenceFile == null || discoverStageArgs.cnvCallsFile == null, - "Please only specify one of externalEvidenceFile or cnvCallsFile"); - - if (discoverStageArgs.cnvCallsFile != null) { - evidenceAndAssemblyArgs.externalEvidenceFile = discoverStageArgs.cnvCallsFile; - } - JavaRDD unfilteredReads = getUnfilteredReads(); final SAMFileHeader headerForReads = getHeaderForReads(); @@ -204,9 +197,18 @@ protected void runTool( final JavaSparkContext ctx ) { } } + // init ============================================================================================================ + private void validateParams() { evidenceAndAssemblyArgs.validate(); discoverStageArgs.validate(); + + Utils.validate(evidenceAndAssemblyArgs.externalEvidenceFile == null || discoverStageArgs.cnvCallsFile == null, + "Please only specify one of externalEvidenceFile or cnvCallsFile"); + + if (discoverStageArgs.cnvCallsFile != null) { + evidenceAndAssemblyArgs.externalEvidenceFile = discoverStageArgs.cnvCallsFile; + } } private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext ctx, @@ -232,49 +234,43 @@ private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext cnvCallsBroadcast, getHeaderForReads(), getReference(), localLogger); } - /** - * Uses the input EvidenceTargetLinks to - * - * - */ - private static List processEvidenceTargetLinks(List assemblyBasedVariants, - final SvDiscoveryInputMetaData svDiscoveryInputMetaData) { - - final List annotatedVariants; - if (svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks() != null) { - final PairedStrandedIntervalTree evidenceTargetLinks = 
svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks(); - final ReadMetadata readMetadata = svDiscoveryInputMetaData.getSampleSpecificData().getReadMetadata(); - final ReferenceMultiSource reference = svDiscoveryInputMetaData.getReferenceData().getReferenceBroadcast().getValue(); - final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs(); - final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger(); + public static Broadcast> broadcastCNVCalls(final JavaSparkContext ctx, + final SAMFileHeader header, + final String cnvCallsFile) { + final SVIntervalTree cnvCalls; + if (cnvCallsFile != null) { + cnvCalls = CNVInputReader.loadCNVCalls(cnvCallsFile, header); + } else { + cnvCalls = null; + } - // annotate with evidence links - annotatedVariants = AnnotatedVariantProducer. - annotateBreakpointBasedCallsWithImpreciseEvidenceLinks(assemblyBasedVariants, - evidenceTargetLinks, readMetadata, reference, discoverStageArgs, toolLogger); + final Broadcast> broadcastCNVCalls; + if (cnvCalls != null) { + broadcastCNVCalls = ctx.broadcast(cnvCalls); + } else { + broadcastCNVCalls = null; + } + return broadcastCNVCalls; + } - // then also imprecise deletion - final List impreciseVariants = ImpreciseVariantDetector. - callImpreciseDeletionFromEvidenceLinks(evidenceTargetLinks, readMetadata, reference, - discoverStageArgs.impreciseVariantEvidenceThreshold, - discoverStageArgs.maxCallableImpreciseVariantDeletionSize, - toolLogger); + /** + * Makes a PairedStrandedIntervalTree from a list of EvidenceTargetLinks. The value of each entry in the resulting tree + * will be the original EvidenceTargetLink. If the input list is null, returns a null tree. 
+ */ + private PairedStrandedIntervalTree makeEvidenceLinkTree(final List evidenceTargetLinks) { + final PairedStrandedIntervalTree evidenceLinkTree; - annotatedVariants.addAll(impreciseVariants); + if (evidenceTargetLinks != null) { + evidenceLinkTree = new PairedStrandedIntervalTree<>(); + evidenceTargetLinks.forEach(l -> evidenceLinkTree.put(l.getPairedStrandedIntervals(), l)); } else { - annotatedVariants = assemblyBasedVariants; + evidenceLinkTree = null; } - - return annotatedVariants; + return evidenceLinkTree; } + // interpretation: assembly-based ================================================================================== + private static void experimentalInterpretation(final JavaSparkContext ctx, final FindBreakpointEvidenceSpark.AssembledEvidenceResults assembledEvidenceResults, final SvDiscoveryInputMetaData svDiscoveryInputMetaData) { @@ -287,8 +283,11 @@ private static void experimentalInterpretation(final JavaSparkContext ctx, SvDiscoverFromLocalAssemblyContigAlignmentsSpark.AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes = SvDiscoverFromLocalAssemblyContigAlignmentsSpark.preprocess(svDiscoveryInputMetaData, assemblyRawAlignments); - SvDiscoverFromLocalAssemblyContigAlignmentsSpark.dispatchJobs(ctx, contigsByPossibleRawTypes, - svDiscoveryInputMetaData, assemblyRawAlignments, true); + final List variants = SvDiscoverFromLocalAssemblyContigAlignmentsSpark + .dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, true); + contigsByPossibleRawTypes.unpersist(); + + SvDiscoverFromLocalAssemblyContigAlignmentsSpark.filterAndWriteMergedVCF(updatedOutputPath, variants, svDiscoveryInputMetaData); } private static JavaRDD getContigRawAlignments(final JavaSparkContext ctx, @@ -312,21 +311,50 @@ private static JavaRDD getContigRawAlignments(final JavaSparkContext c } /** - * Makes a PairedStrandedIntervalTree from a list of EvidenceTargetLinks. 
The value of each entry in the resulting tree - * will be the original EvidenceTargetLink. If the input list is null, returns a null tree. + * Uses the input EvidenceTargetLinks to + *
 <ul>
+     *      <li>
+     *          either annotate the variants called from assembly discovery with split read and read pair evidence, or
+     *      </li>
+     *      <li>
+     *          to call new imprecise variants if the number of pieces of evidence exceeds a given threshold.
+     *      </li>
+     *  </ul>
+ * */ - private PairedStrandedIntervalTree makeEvidenceLinkTree(final List evidenceTargetLinks) { - final PairedStrandedIntervalTree evidenceLinkTree; + private static List processEvidenceTargetLinks(List assemblyBasedVariants, + final SvDiscoveryInputMetaData svDiscoveryInputMetaData) { - if (evidenceTargetLinks != null) { - evidenceLinkTree = new PairedStrandedIntervalTree<>(); - evidenceTargetLinks.forEach(l -> evidenceLinkTree.put(l.getPairedStrandedIntervals(), l)); + final List annotatedVariants; + if (svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks() != null) { + final PairedStrandedIntervalTree evidenceTargetLinks = svDiscoveryInputMetaData.getSampleSpecificData().getEvidenceTargetLinks(); + final ReadMetadata readMetadata = svDiscoveryInputMetaData.getSampleSpecificData().getReadMetadata(); + final ReferenceMultiSource reference = svDiscoveryInputMetaData.getReferenceData().getReferenceBroadcast().getValue(); + final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs(); + final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger(); + + // annotate with evidence links + annotatedVariants = AnnotatedVariantProducer. + annotateBreakpointBasedCallsWithImpreciseEvidenceLinks(assemblyBasedVariants, + evidenceTargetLinks, readMetadata, reference, discoverStageArgs, toolLogger); + + // then also imprecise deletion + final List impreciseVariants = ImpreciseVariantDetector. 
+ callImpreciseDeletionFromEvidenceLinks(evidenceTargetLinks, readMetadata, reference, + discoverStageArgs.impreciseVariantEvidenceThreshold, + discoverStageArgs.maxCallableImpreciseVariantDeletionSize, + toolLogger); + + annotatedVariants.addAll(impreciseVariants); } else { - evidenceLinkTree = null; + annotatedVariants = assemblyBasedVariants; } - return evidenceLinkTree; + + return annotatedVariants; } + // parser ========================================================================================================== + public static final class InMemoryAlignmentParser extends AlignedContigGenerator implements Serializable { private static final long serialVersionUID = 1L; @@ -384,8 +412,7 @@ public static JavaRDD filterAndConvertToAlignedContigViaSAM(final SvDiscoverFromLocalAssemblyContigAlignmentsSpark. SAMFormattedContigAlignmentParser. parseReadsAndOptionallySplitGappedAlignments(forOneContig, - StructuralVariationDiscoveryArgumentCollection - .DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection + DiscoverVariantsFromContigAlignmentsSparkArgumentCollection .GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, true)); } @@ -447,8 +474,7 @@ private static List getAlignmentsForOneContig(final String co .map(bwaMemAlignment -> BwaMemAlignmentUtils.applyAlignment(contigName, contigSequence, null, null, bwaMemAlignment, refNames, header, false, false)) .map(AlignmentInterval::new) - .map(ar -> ContigAlignmentsModifier.splitGappedAlignment(ar, StructuralVariationDiscoveryArgumentCollection - .DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection + .map(ar -> ContigAlignmentsModifier.splitGappedAlignment(ar, DiscoverVariantsFromContigAlignmentsSparkArgumentCollection .GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, contigSequence.length)) .flatMap(Utils::stream).collect(Collectors.toList()); } @@ -464,23 +490,4 @@ public JavaRDD getAlignedContigs() { } } - public static Broadcast> broadcastCNVCalls(final JavaSparkContext ctx, - final SAMFileHeader header, - 
final String cnvCallsFile) { - final SVIntervalTree cnvCalls; - if (cnvCallsFile != null) { - cnvCalls = CNVInputReader.loadCNVCalls(cnvCallsFile, header); - } else { - cnvCalls = null; - } - - final Broadcast> broadcastCNVCalls; - if (cnvCalls != null) { - broadcastCNVCalls = ctx.broadcast(cnvCalls); - } else { - broadcastCNVCalls = null; - } - return broadcastCNVCalls; - } - } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducer.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducer.java index e2f0ab8337d..dbe95d1d1b9 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducer.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducer.java @@ -10,7 +10,6 @@ import org.apache.logging.log4j.Logger; import org.apache.spark.broadcast.Broadcast; import org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource; -import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignmentInterval; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments; import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype; @@ -26,7 +25,7 @@ import java.util.*; import java.util.stream.Collectors; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection; /** * Given identified pair of breakpoints for a simple SV and its supportive evidence, i.e. 
chimeric alignments, @@ -137,7 +136,7 @@ public static List annotateBreakpointBasedCallsWithImpreciseEvid final PairedStrandedIntervalTree evidenceTargetLinks, final ReadMetadata metadata, final ReferenceMultiSource reference, - final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection parameters, + final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection parameters, final Logger localLogger) { final int originalEvidenceLinkSize = evidenceTargetLinks.size(); @@ -192,7 +191,7 @@ static Map getAssemblyEvidenceRelatedAnnotations(final Iterable< final Map attributeMap = new HashMap<>(); attributeMap.put(GATKSVVCFConstants.TOTAL_MAPPINGS, annotations.size()); - attributeMap.put(GATKSVVCFConstants.HQ_MAPPINGS, annotations.stream().filter(annotation -> annotation.minMQ == StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD).count()); + attributeMap.put(GATKSVVCFConstants.HQ_MAPPINGS, annotations.stream().filter(annotation -> annotation.minMQ == DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD).count()); attributeMap.put(GATKSVVCFConstants.MAPPING_QUALITIES, annotations.stream().map(annotation -> String.valueOf(annotation.minMQ)).collect(Collectors.joining(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR))); attributeMap.put(GATKSVVCFConstants.ALIGN_LENGTHS, annotations.stream().map(annotation -> String.valueOf(annotation.minAL)).collect(Collectors.joining(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR))); attributeMap.put(GATKSVVCFConstants.MAX_ALIGN_LENGTH, annotations.stream().map(annotation -> annotation.minAL).max(Comparator.naturalOrder()).orElse(0)); diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java index 
21a9cb6374d..b1b7a59deec 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/DiscoverVariantsFromContigAlignmentsSAMSpark.java @@ -30,7 +30,7 @@ import java.util.Collections; import java.util.List; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection; /** * (Internal) Examines aligned contigs from local assemblies and calls structural variants @@ -93,8 +93,8 @@ public final class DiscoverVariantsFromContigAlignmentsSAMSpark extends GATKSpar private final Logger localLogger = LogManager.getLogger(DiscoverVariantsFromContigAlignmentsSAMSpark.class); @ArgumentCollection - private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs = - new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection(); + private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs = + new DiscoverVariantsFromContigAlignmentsSparkArgumentCollection(); @Argument(doc = "prefix for discovery (non-genotyped) VCF; sample name will be appended after the provided argument, followed by \"_inv_del_ins.vcf\"", shortName = StandardArgumentDefinitions.OUTPUT_SHORT_NAME, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java index 09cc062a380..f7a481eada5 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoverFromLocalAssemblyContigAlignmentsSpark.java @@ -4,6 +4,7 @@ import htsjdk.samtools.*; import htsjdk.samtools.util.SequenceUtil; import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.variantcontext.VariantContextBuilder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.spark.api.java.JavaRDD; @@ -18,14 +19,17 @@ import org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariantDiscoveryProgramGroup; import org.broadinstitute.hellbender.engine.filters.ReadFilter; import org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary; +import org.broadinstitute.hellbender.engine.filters.VariantFilter; import org.broadinstitute.hellbender.engine.spark.GATKSparkTool; import org.broadinstitute.hellbender.exceptions.GATKException; import org.broadinstitute.hellbender.exceptions.UserException; +import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection; import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryPipelineSpark; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.*; import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.CpxVariantInterpreter; import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SegmentedCpxVariantSimpleVariantExtractor; import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.SimpleNovelAdjacencyInterpreter; +import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVUtils; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFWriter; @@ -41,8 +45,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static 
org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments.AlignmentSignatureBasicType.*; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments.ReasonForAlignmentClassificationFailure; @@ -103,9 +107,9 @@ public final class SvDiscoverFromLocalAssemblyContigAlignmentsSpark extends GATK private final Logger localLogger = LogManager.getLogger(SvDiscoverFromLocalAssemblyContigAlignmentsSpark.class); @ArgumentCollection - private DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection + private DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs - = new DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection(); + = new DiscoverVariantsFromContigAlignmentsSparkArgumentCollection(); @Argument(doc = "file containing non-canonical chromosome names (e.g chrUn_KI270588v1) in the reference, human reference (hg19 or hg38) assumed when omitted", shortName = "alt-tigs", @@ -121,6 +125,12 @@ public final class SvDiscoverFromLocalAssemblyContigAlignmentsSpark extends GATK fullName = "write-sam", optional = true) private boolean writeSAMFiles; + static final String SIMPLE_CHIMERA_VCF_FILE_NAME = 
"NonComplex.vcf"; + static final String COMPLEX_CHIMERA_VCF_FILE_NAME = "Complex.vcf"; + static final String REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_1_seg.vcf"; + static final String REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_multi_seg.vcf"; + static final String MERGED_VCF_FILE_NAME = "merged_simple.vcf"; + @Override public boolean requiresReference() { return true; @@ -155,7 +165,11 @@ protected void runTool(final JavaSparkContext ctx) { final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes = preprocess(svDiscoveryInputMetaData, assemblyRawAlignments); - dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, writeSAMFiles); + final List variants = + dispatchJobs(ctx, contigsByPossibleRawTypes, svDiscoveryInputMetaData, assemblyRawAlignments, writeSAMFiles); + contigsByPossibleRawTypes.unpersist(); + + filterAndWriteMergedVCF(outputPrefixWithSampleName, variants, svDiscoveryInputMetaData); } @@ -202,6 +216,12 @@ public JavaRDD getContigsWithSignatureCla return complex; } + public void unpersist() { + simple.unpersist(false); + complex.unpersist(false); + unknown.unpersist(false); + } + /** * Write SAM file, if requested, for original alignments of contigs recognized as "Ambiguous", "Incomplete", and "MisAssemblySuspect" * TODO: 11/17/17 salvation on assembly contigs that 1) has ambiguous "best" configuration, and 2) has incomplete picture; and flag accordingly @@ -266,50 +286,207 @@ public static AssemblyContigsClassifiedByAlignmentSignatures preprocess(final Sv * Sends assembly contigs classified based on their alignment signature to * a corresponding breakpoint location inference unit. * - * Two VCF files will be output: {@link #outputPrefix}"NonComplex.vcf" and {@link #outputPrefix}"Complex.vcf". 
- * * Note that contigs with alignment signature classified as * {@link AssemblyContigWithFineTunedAlignments.AlignmentSignatureBasicType#UNKNOWN} * currently DO NOT generate any VCF yet. */ - public static void dispatchJobs(final JavaSparkContext ctx, - final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes, - final SvDiscoveryInputMetaData svDiscoveryInputMetaData, - final JavaRDD assemblyRawAlignments, - final boolean writeSAMFiles) { + public static List dispatchJobs(final JavaSparkContext ctx, + final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes, + final SvDiscoveryInputMetaData svDiscoveryInputMetaData, + final JavaRDD assemblyRawAlignments, + final boolean writeSAMFiles) { final String outputPrefixWithSampleName = svDiscoveryInputMetaData.getOutputPath(); - // TODO: 1/10/18 bring back read annotation, see ticket 4228 + final List simpleChimeraVariants = + extractSimpleVariants(contigsByPossibleRawTypes.simple, svDiscoveryInputMetaData, outputPrefixWithSampleName); - final List simpleVariants = - SimpleNovelAdjacencyInterpreter.makeInterpretation(contigsByPossibleRawTypes.simple, svDiscoveryInputMetaData); - contigsByPossibleRawTypes.simple.unpersist(); - SVVCFWriter.writeVCF(simpleVariants, outputPrefixWithSampleName + "NonComplex.vcf", - svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(), - svDiscoveryInputMetaData.getToolLogger()); - - final List complexVariants = - CpxVariantInterpreter.makeInterpretation(contigsByPossibleRawTypes.complex, svDiscoveryInputMetaData); - contigsByPossibleRawTypes.complex.unpersist(); - SVVCFWriter.writeVCF(complexVariants, outputPrefixWithSampleName + "Complex.vcf", - svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(), - svDiscoveryInputMetaData.getToolLogger()); + final CpxAndReInterpretedSimpleVariants complexChimeraVariants = + extractCpxVariants(ctx, 
contigsByPossibleRawTypes.complex, svDiscoveryInputMetaData, assemblyRawAlignments, outputPrefixWithSampleName); if (writeSAMFiles) { contigsByPossibleRawTypes.writeSAMfilesForUnknown(outputPrefixWithSampleName, assemblyRawAlignments, svDiscoveryInputMetaData.getSampleSpecificData().getHeaderBroadcast().getValue()); } + final List inversions = extractInversions();// TODO: 6/29/18 placeholder + + // merged output + final List merged = new ArrayList<>(simpleChimeraVariants.size() + complexChimeraVariants.reInterpretedSimpleVariants.size() + inversions.size()); + merged.addAll(simpleChimeraVariants); + merged.addAll(complexChimeraVariants.reInterpretedSimpleVariants); + merged.addAll(inversions); + return merged; + } + + // return simple variants, including BND's + private static List extractSimpleVariants(final JavaRDD contigsWithSimpleChimera, + final SvDiscoveryInputMetaData svDiscoveryInputMetaData, + final String outputPrefixWithSampleName) { + final List simpleVariants = + SimpleNovelAdjacencyInterpreter.makeInterpretation(contigsWithSimpleChimera, svDiscoveryInputMetaData); + final Logger logger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? 
svDiscoveryInputMetaData.getToolLogger() : null; + SVVCFWriter.writeVCF(simpleVariants, outputPrefixWithSampleName + SIMPLE_CHIMERA_VCF_FILE_NAME, + svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(), + logger); + return simpleVariants; + } + + private static final class CpxAndReInterpretedSimpleVariants { + private final List cpxVariants; + private final List reInterpretedSimpleVariants; + + CpxAndReInterpretedSimpleVariants(final List cpxVariants, final List reInterpretedSimpleVariants) { + this.cpxVariants = cpxVariants; + this.reInterpretedSimpleVariants = reInterpretedSimpleVariants; + } + } + + private static CpxAndReInterpretedSimpleVariants extractCpxVariants(final JavaSparkContext ctx, + final JavaRDD contigsWithCpxAln, + final SvDiscoveryInputMetaData svDiscoveryInputMetaData, + final JavaRDD assemblyRawAlignments, + final String outputPrefixWithSampleName) { + final Logger toolLogger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? 
svDiscoveryInputMetaData.getToolLogger() : null; + final List complexVariants = + CpxVariantInterpreter.makeInterpretation(contigsWithCpxAln, svDiscoveryInputMetaData); + SVVCFWriter.writeVCF(complexVariants, outputPrefixWithSampleName + COMPLEX_CHIMERA_VCF_FILE_NAME, + svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(), + toolLogger); + final JavaRDD complexVariantsRDD = ctx.parallelize(complexVariants); final SegmentedCpxVariantSimpleVariantExtractor.ExtractedSimpleVariants reInterpretedSimple = SegmentedCpxVariantSimpleVariantExtractor.extract(complexVariantsRDD, svDiscoveryInputMetaData, assemblyRawAlignments); final SAMSequenceDictionary refSeqDict = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(); - final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger(); - final String derivedOneSegmentSimpleVCF = outputPrefixWithSampleName + "cpx_reinterpreted_simple_1_seg.vcf"; - final String derivedMultiSegmentSimpleVCF = outputPrefixWithSampleName + "cpx_reinterpreted_simple_multi_seg.vcf"; + final String derivedOneSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME; + final String derivedMultiSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME; SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, toolLogger); SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, toolLogger); + + return new CpxAndReInterpretedSimpleVariants(complexVariants, reInterpretedSimple.getMergedReinterpretedCalls()); + } + + // TODO: 6/29/18 when BND variants are interpreted using short read evidence (e.g. 
EvidenceTargetLinks, resolved inversions), put it here + private static List extractInversions() { + return Collections.emptyList(); + } + + //================================================================================================================== + + /** + * Apply filters (that implements {@link StructuralVariantFilter}) given list of variants, + * and write the variants to a single VCF file. + * @param outputPrefixWithSampleName prefix with sample name + * @param variants variants to which filters are to be applied and written to file + * @param svDiscoveryInputMetaData metadata for use in filtering and file output + */ + public static void filterAndWriteMergedVCF(final String outputPrefixWithSampleName, + final List variants, + final SvDiscoveryInputMetaData svDiscoveryInputMetaData) { + final List variantsWithFilterApplied = new ArrayList<>(variants.size()); + final List filters = Arrays.asList( + new SVMappingQualityFilter(svDiscoveryInputMetaData.getDiscoverStageArgs().minMQ), + new SVAlignmentLengthFilter(svDiscoveryInputMetaData.getDiscoverStageArgs().minAlignLength)); + for (final VariantContext variant : variants) { + String svType = variant.getAttributeAsString(GATKSVVCFConstants.SVTYPE, ""); + if (svType.equals(GATKSVVCFConstants.SYMB_ALT_ALLELE_DEL) || svType.equals(GATKSVVCFConstants.SYMB_ALT_ALLELE_INS) || svType.equals(GATKSVVCFConstants.SYMB_ALT_ALLELE_DUP)) { + if (Math.abs(variant.getAttributeAsInt(GATKSVVCFConstants.SVLEN, 0)) < StructuralVariationDiscoveryArgumentCollection.STRUCTURAL_VARIANT_SIZE_LOWER_BOUND ) + continue; + } + variantsWithFilterApplied.add(applyFilters(variant, filters)); + } + + final String out = outputPrefixWithSampleName + MERGED_VCF_FILE_NAME; + SVVCFWriter.writeVCF(variantsWithFilterApplied, out, + svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(), + svDiscoveryInputMetaData.getToolLogger()); + } + + /** + * Filters out variants by testing against provided + * 
filter key, threshold. + * + * Variants with value below specified threshold (or null value) + * are filtered out citing given reason. + * + * @throws ClassCastException if the value corresponding to provided key cannot be casted as a {@link Double} + */ + private static VariantContext applyFilters(final VariantContext variantContext, + final List filters) { + + final Set appliedFilters = new HashSet<>(); + for (final StructuralVariantFilter filter : filters) { + if ( !filter.test(variantContext) ) + appliedFilters.add(filter.getName()); + } + + if (appliedFilters.isEmpty()) + return variantContext; + else { + return new VariantContextBuilder(variantContext).filters(appliedFilters).make(); + } + } + + public interface StructuralVariantFilter extends VariantFilter { + + /** + * @return name of filter for use in filtered record + */ + String getName(); + } + public static final class SVMappingQualityFilter implements StructuralVariantFilter { + static final long serialVersionUID = 1L; + + private static final String attributeKey = GATKSVVCFConstants.MAPPING_QUALITIES; + private final int threshold; + + public SVMappingQualityFilter(final int threshold) { + this.threshold = threshold; + } + + @Override + public String getName() { + return GATKSVVCFConstants.ASSEMBLY_BASED_VARIANT_MQ_FILTER_KEY; + } + + @Override + public boolean test(final VariantContext variantContext) { + if ( !variantContext.hasAttribute(GATKSVVCFConstants.CONTIG_NAMES) ) + return true; + + final List mapQuals = SvDiscoveryUtils.getAttributeAsStringList(variantContext, attributeKey); + int maxMQ = 0; + for (final String mapQual : mapQuals) { + Integer integer = Integer.valueOf(mapQual); + maxMQ = maxMQ < integer ? 
integer : maxMQ; + } + return maxMQ >= threshold; + } + } + + public static final class SVAlignmentLengthFilter implements StructuralVariantFilter { + static final long serialVersionUID = 1L; + + private static final String attributeKey = GATKSVVCFConstants.MAX_ALIGN_LENGTH; + private final int threshold; + + public SVAlignmentLengthFilter(final int threshold) { + this.threshold = threshold; + } + + @Override + public String getName() { + return GATKSVVCFConstants.ASSEMBLY_BASED_VARIANT_ALN_LENGTH_FILTER_KEY; + } + + @Override + public boolean test(final VariantContext variantContext) { + if ( !variantContext.hasAttribute(GATKSVVCFConstants.CONTIG_NAMES) ) + return true; + + final int alnLength = variantContext.getAttributeAsInt(attributeKey, 0); + return alnLength >= threshold; + } } //================================================================================================================== diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryInputMetaData.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryInputMetaData.java index 85dae419f98..6284a7e228f 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryInputMetaData.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryInputMetaData.java @@ -17,7 +17,7 @@ import java.util.List; import java.util.Set; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection; public final class SvDiscoveryInputMetaData { @@ -30,7 +30,7 @@ public SampleSpecificData getSampleSpecificData() { return sampleSpecificData; } - public DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection 
getDiscoverStageArgs() { + public DiscoverVariantsFromContigAlignmentsSparkArgumentCollection getDiscoverStageArgs() { return discoverStageArgs; } @@ -119,14 +119,14 @@ public List getAssembledIntervals() { private final SampleSpecificData sampleSpecificData; - private final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs; + private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs; private final Logger toolLogger; private String outputPath; public SvDiscoveryInputMetaData(final JavaSparkContext ctx, - final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs, + final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs, final String nonCanonicalChromosomeNamesFile, final String outputPath, final ReadMetadata readMetadata, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java index 97e9a7b1534..a26766bfb7a 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvDiscoveryUtils.java @@ -1,10 +1,12 @@ package org.broadinstitute.hellbender.tools.spark.sv.discovery; import htsjdk.samtools.*; +import htsjdk.variant.variantcontext.VariantContext; +import htsjdk.variant.vcf.VCFConstants; import org.apache.logging.log4j.Logger; import org.apache.spark.api.java.JavaRDD; import org.broadinstitute.hellbender.exceptions.UserException; -import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection; +import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection; import 
org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications; import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils; @@ -19,10 +21,7 @@ import javax.annotation.Nonnull; import java.io.IOException; import java.nio.file.Files; -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; import java.util.stream.Stream; @@ -33,7 +32,7 @@ public class SvDiscoveryUtils { public static void evaluateIntervalsAndNarls(final List assembledIntervals, final List narls, final SAMSequenceDictionary referenceSequenceDictionary, - final DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection parameters, + final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection parameters, final Logger toolLogger) { if ( parameters.truthVCF != null ) { final SVIntervalTree trueBreakpoints = @@ -154,4 +153,28 @@ public static void writeSAMRecords(final List reads, final Set samRecords.sort(localComparator); SVFileUtils.writeSAMFile( outputPath, samRecords.iterator(), cloneHeader, true); } + + /** + * todo: this should be fixed in a new major version of htsjdk + * this exist because for whatever reason, + * VC.getAttributeAsStringList() sometimes returns a giant single string, while using + * VC.getAttributeAsString() gives back an array..... 
+ */ + public static List getAttributeAsStringList(final VariantContext vc, final String attributeKey) { + if (vc.getAttribute(attributeKey) == null) return Collections.emptyList(); + return vc.getAttributeAsStringList(attributeKey, "").stream() + .flatMap(s -> { + if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) { + final String[] split = s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR); + return Arrays.stream(split); + } else { + return Stream.of(s); + } + }) + .collect(Collectors.toList()); + } + + public static SimpleInterval makeOneBpInterval(final String chr, final int pos) { + return new SimpleInterval(chr, pos, pos); + } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvType.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvType.java index 2d04b699a26..b7347ed12c6 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvType.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/SvType.java @@ -136,10 +136,7 @@ public static SortedSet getKnownTypes() { final SortedSet knownTypes = new TreeSet<>( EnumUtils.getEnumMap(SimpleSVType.SupportedType.class).keySet() ); knownTypes.add(GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR); - - for (final BreakEndVariantType.SupportedType supportedType : BreakEndVariantType.SupportedType.values()) { - knownTypes.add(supportedType.name()); - } + knownTypes.add(GATKSVVCFConstants.BREAKEND_STR); return Collections.unmodifiableSortedSet(knownTypes); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java index ba926050cfd..85023893328 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPicker.java @@ -21,7 +21,7 @@ import java.util.stream.Collectors; import java.util.stream.IntStream; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; /** * A simple heuristic optimizer based on extensive manual review of alignments diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreter.java index 50f64644500..b6aca5e04f3 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreter.java @@ -24,8 +24,8 @@ import java.util.List; import java.util.stream.Stream; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD; +import static 
org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH; /** * This class scans the chimeric alignments of input {@link AlignedContig}, @@ -56,7 +56,7 @@ public static List discoverVariantsFromChimeras(final SvDiscover final List assembledIntervals = svDiscoveryInputMetaData.getSampleSpecificData().getAssembledIntervals(); final Broadcast> cnvCallsBroadcast = svDiscoveryInputMetaData.getSampleSpecificData().getCnvCallsBroadcast(); final String sampleId = svDiscoveryInputMetaData.getSampleSpecificData().getSampleId(); - final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs(); + final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs(); final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger(); // step 2: extract novel adjacency diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java index 40d3685b23f..1ad70e47c73 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantInterpreter.java @@ -24,7 +24,7 @@ import java.util.*; import java.util.stream.Collectors; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD; +import static 
org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD; import static org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants.*; /** diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantReInterpreterSpark.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantReInterpreterSpark.java index 047a9d7e0f4..e6350161184 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantReInterpreterSpark.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/CpxVariantReInterpreterSpark.java @@ -60,9 +60,9 @@ public List getDefaultReadFilters() { } @ArgumentCollection - private StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection + private StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs - = new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection(); + = new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection(); @Argument(doc = "file containing non-canonical chromosome names (e.g chrUn_KI270588v1) in the reference, human reference (hg19 or hg38) assumed when omitted", shortName = "alt-tigs", diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractor.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractor.java index 3685dbe2004..790c167f5b2 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractor.java +++ 
b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractor.java @@ -13,9 +13,9 @@ import org.broadinstitute.hellbender.tools.spark.sv.discovery.SimpleSVType; import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoverFromLocalAssemblyContigAlignmentsSpark; import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryInputMetaData; +import org.broadinstitute.hellbender.tools.spark.sv.discovery.SvDiscoveryUtils; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AlignedContig; import org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigWithFineTunedAlignments; -import org.broadinstitute.hellbender.tools.spark.sv.utils.GATKSVVCFConstants; import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval; import org.broadinstitute.hellbender.utils.SimpleInterval; import org.broadinstitute.hellbender.utils.read.GATKRead; @@ -61,10 +61,8 @@ public abstract class SegmentedCpxVariantSimpleVariantExtractor implements Serializable { private static final long serialVersionUID = 1L; - // TODO: 5/2/18 for use in output VCF to link to original CPX variant, to be moved to GATKSVVCFConstants - static String EVENT_KEY = "CPX_EVENT"; private static int EVENT_SIZE_THRESHOLD = STRUCTURAL_VARIANT_SIZE_LOWER_BOUND - 1; - static final String CPX_DERIVED_POSTFIX_STRING = "CPX_DERIVED"; + private static final String CPX_DERIVED_POSTFIX_STRING = "CPX_DERIVED"; private static String makeID(final String typeName, final String chr, final int start, final int stop) { return typeName + INTERVAL_VARIANT_ID_FIELD_SEPARATOR + chr + INTERVAL_VARIANT_ID_FIELD_SEPARATOR @@ -73,29 +71,6 @@ private static String makeID(final String typeName, final String chr, final int CPX_DERIVED_POSTFIX_STRING; } - // TODO: 5/2/18 move to a utility class - /** - * this exist because for whatever reason, - * VC.getAttributeAsStringList() sometimes returns a giant single string, 
while using - * VC.getAttributeAsString() gives back an array..... - */ - static List getAttributeAsStringList(final VariantContext vc, final String attributeKey) { - if (vc.getAttribute(attributeKey) == null) return Collections.emptyList(); - return vc.getAttributeAsStringList(attributeKey, "").stream() - .flatMap(s -> { - if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) { - final String[] split = s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR); - return Arrays.stream(split); - } else { - return Stream.of(s); - } - }) - .collect(Collectors.toList()); - } - static SimpleInterval makeOneBpInterval(final String chr, final int pos) { - return new SimpleInterval(chr, pos, pos); - } - public static final class ExtractedSimpleVariants { private final List reInterpretZeroOrOneSegmentCalls; private final List reInterpretMultiSegmentsCalls; @@ -113,6 +88,12 @@ public List getReInterpretZeroOrOneSegmentCalls() { public List getReInterpretMultiSegmentsCalls() { return reInterpretMultiSegmentsCalls; } + + public List getMergedReinterpretedCalls() { + final ArrayList merged = new ArrayList<>(reInterpretZeroOrOneSegmentCalls); + merged.addAll(reInterpretMultiSegmentsCalls); + return merged; + } } // main interface to user code @@ -126,7 +107,7 @@ public static ExtractedSimpleVariants extract(final JavaRDD comp // that was due to restriction from how multi-segment calls are to be re-interpreted final ZeroAndOneSegmentCpxVariantExtractor zeroAndOneSegmentCpxVariantExtractor = new ZeroAndOneSegmentCpxVariantExtractor(); final JavaRDD zeroOrOneSegmentComplexVariants = complexVariants - .filter(vc -> getAttributeAsStringList(vc, CPX_SV_REF_SEGMENTS).size() < 2) + .filter(vc -> SvDiscoveryUtils.getAttributeAsStringList(vc, CPX_SV_REF_SEGMENTS).size() < 2) .cache(); final List reInterpretedZeroAndOneSegmentCalls = zeroOrOneSegmentComplexVariants @@ -135,7 +116,7 @@ public static ExtractedSimpleVariants extract(final JavaRDD comp 
zeroOrOneSegmentComplexVariants.unpersist(false); final JavaRDD multiSegmentCalls = - complexVariants.filter(vc -> getAttributeAsStringList(vc, CPX_SV_REF_SEGMENTS).size() > 1) + complexVariants.filter(vc -> SvDiscoveryUtils.getAttributeAsStringList(vc, CPX_SV_REF_SEGMENTS).size() > 1) .cache(); final MultiSegmentsCpxVariantExtractor multiSegmentsCpxVariantExtractor = new MultiSegmentsCpxVariantExtractor(); @@ -165,9 +146,9 @@ static final class RelevantAttributes implements Serializable { @VisibleForTesting RelevantAttributes(final VariantContext multiSegmentComplexVar) { id = multiSegmentComplexVar.getID(); - referenceSegments = getAttributeAsStringList(multiSegmentComplexVar, CPX_SV_REF_SEGMENTS) + referenceSegments = SvDiscoveryUtils.getAttributeAsStringList(multiSegmentComplexVar, CPX_SV_REF_SEGMENTS) .stream().map(SimpleInterval::new).collect(Collectors.toList()); - altArrangements = getAttributeAsStringList(multiSegmentComplexVar, CPX_EVENT_ALT_ARRANGEMENTS); + altArrangements = SvDiscoveryUtils.getAttributeAsStringList(multiSegmentComplexVar, CPX_EVENT_ALT_ARRANGEMENTS); } } @@ -193,7 +174,7 @@ private static List reInterpretMultiSegmentComplexVarThroughAlig multiSegmentCalls .flatMapToPair(complex -> { final RelevantAttributes relevantAttributes = new RelevantAttributes(complex); - return getAttributeAsStringList(complex, CONTIG_NAMES).stream() + return SvDiscoveryUtils.getAttributeAsStringList(complex, CONTIG_NAMES).stream() .map(name -> new Tuple2<>(name, relevantAttributes)) .iterator(); }) @@ -215,7 +196,7 @@ private static List reInterpretMultiSegmentComplexVarThroughAlig return pairIterationReInterpreted.stream() .map(vc -> { final List consistentComplexVariantIDs = - getAttributeAsStringList(vc, CONTIG_NAMES).stream() + SvDiscoveryUtils.getAttributeAsStringList(vc, CONTIG_NAMES).stream() .map(contigNameToCpxVariantAttributes::get) .filter(attributes -> isConsistentWithCPX(vc, attributes)) .map(attributes -> attributes.id) @@ -226,7 +207,7 @@ private 
static List reInterpretMultiSegmentComplexVarThroughAlig return new VariantContextBuilder(vc) .id(vc.getID() + INTERVAL_VARIANT_ID_FIELD_SEPARATOR + CPX_DERIVED_POSTFIX_STRING) - .attribute(EVENT_KEY, + .attribute(CPX_EVENT_KEY, String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, consistentComplexVariantIDs)) .make(); } @@ -425,8 +406,8 @@ public int hashCode() { .stream().map(ai -> ai.sourceVC) .collect(Collectors.toMap(AnnotatedInterval::new, simpleVC -> { - final TreeSet complexEvents = new TreeSet<>(getAttributeAsStringList(simpleVC, EVENT_KEY)); - final TreeSet sourceCtgNames = new TreeSet<>(getAttributeAsStringList(simpleVC, CONTIG_NAMES)); + final TreeSet complexEvents = new TreeSet<>(SvDiscoveryUtils.getAttributeAsStringList(simpleVC, CPX_EVENT_KEY)); + final TreeSet sourceCtgNames = new TreeSet<>(SvDiscoveryUtils.getAttributeAsStringList(simpleVC, CONTIG_NAMES)); return new Tuple2<>(complexEvents, sourceCtgNames); }) ); // hashMap is good enough for us @@ -461,8 +442,8 @@ static List removeDuplicates(final List sourceWi sourceCpxIDs.addAll(anotherSourceAttributes._1); sourceCtgNames.addAll(anotherSourceAttributes._2); final VariantContextBuilder variant = new VariantContextBuilder(interval.sourceVC) - .rmAttribute(EVENT_KEY) - .attribute(EVENT_KEY, String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, sourceCpxIDs)) + .rmAttribute(CPX_EVENT_KEY) + .attribute(CPX_EVENT_KEY, String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, sourceCpxIDs)) .rmAttribute(CONTIG_NAMES) .attribute(CONTIG_NAMES, String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, sourceCtgNames)); result.add( variant.make()); @@ -519,11 +500,11 @@ static final class ZeroAndOneSegmentCpxVariantExtractor extends SegmentedCpxVari @Override List extract(final VariantContext complexVC, final ReferenceMultiSource reference) { - final List segments = getAttributeAsStringList(complexVC, CPX_SV_REF_SEGMENTS); + final List segments = SvDiscoveryUtils.getAttributeAsStringList(complexVC, 
CPX_SV_REF_SEGMENTS); if (segments.isEmpty()) return whenZeroSegments(complexVC, reference); final SimpleInterval refSegment = new SimpleInterval(segments.get(0)); - final List altArrangement = getAttributeAsStringList(complexVC, CPX_EVENT_ALT_ARRANGEMENTS); + final List altArrangement = SvDiscoveryUtils.getAttributeAsStringList(complexVC, CPX_EVENT_ALT_ARRANGEMENTS); final int altSeqLength = complexVC.getAttributeAsString(SEQ_ALT_HAPLOTYPE, "").length(); final List result = new ArrayList<>(); @@ -539,18 +520,26 @@ List extract(final VariantContext complexVC, final ReferenceMult } final String sourceID = complexVC.getID(); - final List evidenceContigs = getAttributeAsStringList(complexVC, CONTIG_NAMES); + final List evidenceContigs = SvDiscoveryUtils.getAttributeAsStringList(complexVC, CONTIG_NAMES); + final List mappingQualities = SvDiscoveryUtils.getAttributeAsStringList(complexVC, MAPPING_QUALITIES); + final int maxAlignLength = complexVC.getAttributeAsInt(MAX_ALIGN_LENGTH, 0); return result.stream() - .map(vc -> vc.attribute(EVENT_KEY, sourceID).attribute(CONTIG_NAMES, evidenceContigs).make()) + .map(vc -> vc.attribute(CPX_EVENT_KEY, sourceID).attribute(CONTIG_NAMES, evidenceContigs) + .attribute(MAPPING_QUALITIES, mappingQualities) + .attribute(MAX_ALIGN_LENGTH, maxAlignLength).make()) .collect(Collectors.toList()); } private List whenZeroSegments(final VariantContext complexVC, final ReferenceMultiSource reference) { final Allele anchorBaseRefAllele = getAnchorBaseRefAllele(complexVC.getContig(), complexVC.getStart(), reference); final int altSeqLength = complexVC.getAttributeAsString(SEQ_ALT_HAPLOTYPE, "").length() - 2; + final List mappingQualities = SvDiscoveryUtils.getAttributeAsStringList(complexVC, MAPPING_QUALITIES); + final int maxAlignLength = complexVC.getAttributeAsInt(MAX_ALIGN_LENGTH, 0); final VariantContext insertion = makeInsertion(complexVC.getContig(), complexVC.getStart(), complexVC.getStart(), altSeqLength, anchorBaseRefAllele) - 
.attribute(EVENT_KEY, complexVC.getID()) + .attribute(CPX_EVENT_KEY, complexVC.getID()) .attribute(CONTIG_NAMES, complexVC.getAttribute(CONTIG_NAMES)) + .attribute(MAPPING_QUALITIES, mappingQualities) + .attribute(MAX_ALIGN_LENGTH, maxAlignLength) .make(); return Collections.singletonList(insertion); } @@ -628,13 +617,13 @@ private static void extractFrontAndRearInsertions(final SimpleInterval refSegmen final List segmentLen = Collections.singletonList(refSegment.size()); - final SimpleInterval frontInsPos = makeOneBpInterval(refSegment.getContig(), refSegment.getStart() - 1); + final SimpleInterval frontInsPos = SvDiscoveryUtils.makeOneBpInterval(refSegment.getContig(), refSegment.getStart() - 1); final VariantContextBuilder frontIns = getInsFromOneEnd(true, segmentIdx, frontInsPos, anchorBaseRefAlleleFront, segmentLen, altArrangement, true); if (frontIns != null) result.add(frontIns); - final SimpleInterval rearInsPos = makeOneBpInterval(refSegment.getContig(), refSegment.getEnd()); + final SimpleInterval rearInsPos = SvDiscoveryUtils.makeOneBpInterval(refSegment.getContig(), refSegment.getEnd()); final VariantContextBuilder rearIns = getInsFromOneEnd(false, segmentIdx, rearInsPos, anchorBaseRefAlleleRear, segmentLen, altArrangement, true); if (rearIns != null) @@ -650,11 +639,11 @@ static final class MultiSegmentsCpxVariantExtractor extends SegmentedCpxVariantS List extract(final VariantContext complexVC, final ReferenceMultiSource reference) { final List refSegments = - getAttributeAsStringList(complexVC, CPX_SV_REF_SEGMENTS).stream() + SvDiscoveryUtils.getAttributeAsStringList(complexVC, CPX_SV_REF_SEGMENTS).stream() .map(SimpleInterval::new) .collect(Collectors.toList()); - final List altArrangement = getAttributeAsStringList(complexVC, CPX_EVENT_ALT_ARRANGEMENTS); + final List altArrangement = SvDiscoveryUtils.getAttributeAsStringList(complexVC, CPX_EVENT_ALT_ARRANGEMENTS); final Tuple3, Set, List> missingAndPresentAndInvertedSegments = 
getMissingAndPresentAndInvertedSegments(refSegments, altArrangement); final Set missingSegments = missingAndPresentAndInvertedSegments._1(); @@ -684,10 +673,14 @@ List extract(final VariantContext complexVC, final ReferenceMult } final String sourceID = complexVC.getID(); - final List evidenceContigs = getAttributeAsStringList(complexVC, CONTIG_NAMES); + final List evidenceContigs = SvDiscoveryUtils.getAttributeAsStringList(complexVC, CONTIG_NAMES); + final List mappingQualities = SvDiscoveryUtils.getAttributeAsStringList(complexVC, MAPPING_QUALITIES); + final int maxAlignLength = complexVC.getAttributeAsInt(MAX_ALIGN_LENGTH, 0); return result.stream() - .map(vc -> vc.attribute(EVENT_KEY, sourceID).attribute(CONTIG_NAMES, evidenceContigs).make()) + .map(vc -> vc.attribute(CPX_EVENT_KEY, sourceID).attribute(CONTIG_NAMES, evidenceContigs) + .attribute(MAPPING_QUALITIES, mappingQualities) + .attribute(MAX_ALIGN_LENGTH, maxAlignLength).make()) .collect(Collectors.toList()); } @@ -755,7 +748,7 @@ private void extractInversions(final ReferenceMultiSource reference, final List< .filter(i -> refSegmentIntervals.get(i - 1).size() > EVENT_SIZE_THRESHOLD && (!presentSegments.contains(i))) .map(i -> { final SimpleInterval invertedSegment = refSegmentIntervals.get(i - 1); - final byte[] ref = getReferenceBases(makeOneBpInterval(invertedSegment.getContig(), invertedSegment.getStart()), reference); + final byte[] ref = getReferenceBases(SvDiscoveryUtils.makeOneBpInterval(invertedSegment.getContig(), invertedSegment.getStart()), reference); final Allele refAllele = Allele.create(ref, true); return makeInversion(invertedSegment, refAllele); }) @@ -768,7 +761,7 @@ private void extractDeletions(final ReferenceMultiSource reference, final Set deletions = compactifyMissingSegments(missingSegments).stream() .filter(gone -> gone.size() > EVENT_SIZE_THRESHOLD) // large enough .map(gone -> { - final byte[] ref = getReferenceBases(makeOneBpInterval(gone.getContig(), gone.getStart()), 
reference); + final byte[] ref = getReferenceBases(SvDiscoveryUtils.makeOneBpInterval(gone.getContig(), gone.getStart()), reference); final Allele refAllele = Allele.create(ref, true); return makeDeletion(new SimpleInterval(gone.getContig(), gone.getStart(), gone.getEnd() - 1), refAllele); }) @@ -819,7 +812,7 @@ private void extractFrontAndRearInsertions(final VariantContext complexVC, final } } if (firstRefSegmentIdx > 0) { - final SimpleInterval startAndStop = makeOneBpInterval(complexVC.getContig(), complexVC.getStart()); + final SimpleInterval startAndStop = SvDiscoveryUtils.makeOneBpInterval(complexVC.getContig(), complexVC.getStart()); final Allele anchorBaseRefAlleleFront = Allele.create(getReferenceBases(startAndStop, reference), true); final VariantContextBuilder frontIns = getInsFromOneEnd(true, firstRefSegmentIdx, startAndStop, anchorBaseRefAlleleFront, refSegmentLengths, altArrangement, true); if (frontIns != null) result.add( frontIns ); @@ -836,7 +829,7 @@ private void extractFrontAndRearInsertions(final VariantContext complexVC, final if (firstRefSegmentIdx != altArrangement.size() - 1) { final int pos = complexVC.getEnd(); - final SimpleInterval insertionPos = makeOneBpInterval(complexVC.getContig(), pos); + final SimpleInterval insertionPos = SvDiscoveryUtils.makeOneBpInterval(complexVC.getContig(), pos); final Allele anchorBaseRefAlleleRear = Allele.create(getReferenceBases(insertionPos, reference), true); final VariantContextBuilder rearIns = getInsFromOneEnd(false, firstRefSegmentIdx, insertionPos, anchorBaseRefAlleleRear, refSegmentLengths, altArrangement, true); if (rearIns != null) result.add( rearIns ); @@ -934,7 +927,7 @@ static Tuple3, Set, List> getMissingAndPre // boiler-plate code block ========================================================================================= private static Allele getAnchorBaseRefAllele(final String chr, final int pos, final ReferenceMultiSource reference) { - return 
Allele.create(getReferenceBases(makeOneBpInterval(chr, pos), reference), true); + return Allele.create(getReferenceBases(SvDiscoveryUtils.makeOneBpInterval(chr, pos), reference), true); } // try not to have many try's diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SimpleNovelAdjacencyInterpreter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SimpleNovelAdjacencyInterpreter.java index b09820c78f0..97a85683a56 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SimpleNovelAdjacencyInterpreter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SimpleNovelAdjacencyInterpreter.java @@ -74,7 +74,7 @@ private static void evaluateNarls(final SvDiscoveryInputMetaData svDiscoveryInpu final Broadcast referenceSequenceDictionaryBroadcast = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast(); final List assembledIntervals = svDiscoveryInputMetaData.getSampleSpecificData().getAssembledIntervals(); - final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection + final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs = svDiscoveryInputMetaData.getDiscoverStageArgs(); final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger(); SvDiscoveryUtils.evaluateIntervalsAndNarls(assembledIntervals, narls, diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java index 61574f91120..e86710e25db 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFConstants.java @@ -1,9 +1,5 @@ package 
org.broadinstitute.hellbender.tools.spark.sv.utils; -import java.util.List; -import java.util.stream.Collectors; -import java.util.stream.Stream; - public final class GATKSVVCFConstants { // todo: add these and the other standard SV info fields from the VCF spec to htsjdk VCFStandardHeaderLines @@ -46,6 +42,8 @@ public final class GATKSVVCFConstants { public static final String HOMOLOGY = "HOMSEQ"; public static final String HOMOLOGY_LENGTH = "HOMLEN"; public static final String LINK = "LINK"; + public static final String EXTERNAL_CNV_CALLS = "EXTERNAL_CNV_CALLS"; + public static final String CTG_GOOD_NONCANONICAL_MAPPING = "CTG_GOOD_NONCANONICAL_MAPPING"; // type specific: tandem duplication public static final String DUP_REPEAT_UNIT_REF_SPAN = "DUP_REPEAT_UNIT_REF_SPAN"; @@ -64,30 +62,27 @@ public final class GATKSVVCFConstants { public static final String INV33 = "INV33"; public static final String INV55 = "INV55"; + // type specific: CPX + public static final String CPX_SV_SYB_ALT_ALLELE_STR = "CPX"; + public static final String CPX_EVENT_ALT_ARRANGEMENTS = "ALT_ARRANGEMENT"; + public static final String CPX_SV_REF_SEGMENTS = "SEGMENTS"; + public static final String CPX_EVENT_KEY = "CPX_EVENT"; + // not defined in output vcf header but used in internal id that is currently output in the ID column public static final String INTERVAL_VARIANT_ID_FIELD_SEPARATOR = "_"; - public static final String DUP_TAN_CONTRACTION_INTERNAL_ID_START_STRING = "DEL-DUPLICATION-TANDEM-CONTRACTION"; public static final String DUP_TAN_EXPANSION_INTERNAL_ID_START_STRING = "INS-DUPLICATION-TANDEM-EXPANSION"; public static final String DUP_INV_INTERNAL_ID_START_STRING = "INS-DUPLICATION-INVERTED-EXPANSION"; - public static final String EXTERNAL_CNV_CALLS = "EXTERNAL_CNV_CALLS"; + // format block public static final String COPY_NUMBER_FORMAT = "CN"; public static final String COPY_NUMBER_QUALITY_FORMAT = "CNQ"; - public static final String CPX_SV_SYB_ALT_ALLELE_STR = "CPX"; - public 
static final String CPX_EVENT_ALT_ARRANGEMENTS = "ALT_ARRANGEMENT"; - public static final String CPX_SV_REF_SEGMENTS = "SEGMENTS"; + // filter block + public static final String ASSEMBLY_BASED_VARIANT_MQ_FILTER_KEY = "LOW_MQ"; + public static final String ASSEMBLY_BASED_VARIANT_ALN_LENGTH_FILTER_KEY = "SHORT_ALN"; - public static final String CTG_GOOD_NONCANONICAL_MAPPING = "CTG_GOOD_NONCANONICAL_MAPPING"; + /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - public static final List expectedHeaderLinesInVCF - = Stream.of("SVTYPE", "SVLEN", "MATEID", "INV", "DEL", "INS", "DUP", "DUP:INV", - "CIPOS", "CIEND", "IMPRECISE", "READ_PAIR_SUPPORT", "SPLIT_READ_SUPPORT", "LINK", - "CTG_NAMES", "TOTAL_MAPPINGS", "MAPPING_QUALITIES", "HQ_MAPPINGS", "ALIGN_LENGTHS", "MAX_ALIGN_LENGTH", - "SEQ_ALT_HAPLOTYPE", "INSSEQ", "INSLEN", "INSSEQ_MAP", "HOMSEQ", "HOMLEN", "DUP_REPEAT_UNIT_REF_SPAN", - "DUP_SEQ_CIGARS", "DUP_NUM", "DUP_ANNOTATIONS_IMPRECISE", "CONTRACTION", "EXPANSION", "DUP_ORIENTATIONS", - "INV33", "INV55", "EXTERNAL_CNV_CALLS", "DUP_IMPRECISE_AFFECTED_RANGE", "CTG_GOOD_NONCANONICAL_MAPPING") - .sorted().collect(Collectors.toList()); } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java index 91f655c6f7b..3c0c8283c07 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFHeaderLines.java @@ -19,7 +19,7 @@ public final class GATKSVVCFHeaderLines { public static VCFFilterHeaderLine getFilterLine(final String id) { return filterLines.get(id); } public static Set getFilterLines() { return new HashSet<>(filterLines.values()); } - private static final Map infoLines = new LinkedHashMap<>(20); + private static final Map infoLines = new 
LinkedHashMap<>(30); private static final Map formatLines = new LinkedHashMap<>(5); private static final Map filterLines = new LinkedHashMap<>(2); @@ -60,6 +60,8 @@ private static void addSymbAltAlleleLine(final VCFSimpleHeaderLine line) { GATKSVVCFConstants.SYMB_ALT_ALLELE_DUP, "Region of elevated copy number relative to the reference")); addSymbAltAlleleLine(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, GATKSVVCFConstants.SYMB_ALT_ALLELE_INVDUP, "Region of elevated copy number relative to the reference, with some copies inverted")); + addSymbAltAlleleLine(new VCFSimpleHeaderLine(GATKVCFConstants.SYMBOLIC_ALLELE_DEFINITION_HEADER_TAG, + GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR, "Complex rearrangement of reference sequence")); // descriptions on INFO fields that are available for each record addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.SVTYPE, @@ -114,11 +116,15 @@ private static void addSymbAltAlleleLine(final VCFSimpleHeaderLine line) { 1, VCFHeaderLineType.String, "Comma-delimited list of external copy number calls that overlap with this variant in format ID:CN:CNQ")); + + addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.LINK, + VCFHeaderLineCount.UNBOUNDED, + VCFHeaderLineType.String, + "ID(s) of other record(s) linked to current record")); } // for variants-detected from assembly - // todo: create an alternate assembly file and link to it with breakpoint IDs according to the VCF spec - { + {// todo: create an alternate assembly file and link to it with breakpoint IDs according to the VCF spec addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CONTIG_NAMES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, @@ -221,6 +227,41 @@ private static void addSymbAltAlleleLine(final VCFSimpleHeaderLine line) { 0, VCFHeaderLineType.Flag, "Tandem repeats expansion compared to reference")); + + addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CPX_EVENT_ALT_ARRANGEMENTS, + VCFHeaderLineCount.UNBOUNDED, + 
VCFHeaderLineType.String, + "For CPX variants only; specifies how reference segments given in " + GATKSVVCFConstants.CPX_SV_REF_SEGMENTS + " are re-arranged")); + addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CPX_SV_REF_SEGMENTS, + VCFHeaderLineCount.UNBOUNDED, + VCFHeaderLineType.String, + "For CPX variants only; segments of reference that are rearranged")); + addInfoLine(new VCFInfoHeaderLine(GATKSVVCFConstants.CPX_EVENT_KEY, + VCFHeaderLineCount.UNBOUNDED, + VCFHeaderLineType.String, + "ID(s) of " + GATKSVVCFConstants.CPX_SV_SYB_ALT_ALLELE_STR + "(s) events from which current simple variant record is extracted")); + } + + // format lines + { + addFormatLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_FORMAT, + 1, // TODO: 7/3/18 Spec 4.3 has this example value at bottom of page 12, but what about multi-allelic sites? + VCFHeaderLineType.Integer, + "Copy number genotype for imprecise events")); + + addFormatLine(new VCFFormatHeaderLine(GATKSVVCFConstants.COPY_NUMBER_QUALITY_FORMAT, + 1, // TODO: 7/3/18 Spec 4.3 has this example value at bottom of page 12, but what about multi-allelic sites? 
+ VCFHeaderLineType.Float, + "Copy number genotype quality for imprecise events")); + } + + // filter lines + { + addFilterLine(new VCFFilterHeaderLine(GATKSVVCFConstants.ASSEMBLY_BASED_VARIANT_MQ_FILTER_KEY, + "Assembly evidence based record that whose maximum value specified in " + GATKSVVCFConstants.MAPPING_QUALITIES + " is lower than user specified threshold")); + + addFilterLine(new VCFFilterHeaderLine(GATKSVVCFConstants.ASSEMBLY_BASED_VARIANT_ALN_LENGTH_FILTER_KEY, + "Assembly evidence based record that whose " + GATKSVVCFConstants.MAX_ALIGN_LENGTH + " value is lower than user specified threshold")); } } } diff --git a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriter.java b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriter.java index 0e9d183c811..9a937c36aba 100644 --- a/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriter.java +++ b/src/main/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriter.java @@ -41,7 +41,8 @@ public static void writeVCF(final List localVariants, final Stri final List sortedVariantsList = sortVariantsByCoordinate(localVariants, referenceSequenceDictionary); - logNumOfVarByTypes(sortedVariantsList, logger); + if (logger != null) + logNumOfVarByTypes(sortedVariantsList, logger); writeVariants(vcfFileName, sortedVariantsList, referenceSequenceDictionary); } @@ -97,6 +98,8 @@ static VCFHeader getVcfHeader(final SAMSequenceDictionary referenceSequenceDicti final Set headerLines = new HashSet<>(GATKSVVCFHeaderLines.getSymbAltAlleleLines()); headerLines.addAll(GATKSVVCFHeaderLines.getInfoLines()); headerLines.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY)); + headerLines.addAll(GATKSVVCFHeaderLines.getFormatLines()); + headerLines.addAll(GATKSVVCFHeaderLines.getFilterLines()); final VCFHeader header = new VCFHeader(new VCFHeader( headerLines )); header.setSequenceDictionary(referenceSequenceDictionary); return header; diff --git 
a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java index 3bfd4d6a66b..e906d3ef859 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/AnnotatedVariantProducerUnitTest.java @@ -34,7 +34,6 @@ import org.testng.annotations.Test; import scala.Tuple2; -import java.io.IOException; import java.util.*; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.AnnotatedVariantProducer.produceAnnotatedVcFromAssemblyEvidence; @@ -258,8 +257,8 @@ public void testProcessEvidenceTargetLinks(final List etls, final List expectedVariants) { final Logger localLogger = LogManager.getLogger(AnnotatedVariantProducer.class); - final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection params = - new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection(); + final StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection params = + new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection(); ReadMetadata metadata = Mockito.mock(ReadMetadata.class); when(metadata.getMaxMedianFragmentSize()).thenReturn(300); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java index fd44c9f42cb..376e2b569d2 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AlignedContigGeneratorUnitTest.java @@ -24,7 +24,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.AssemblyBasedSVDiscoveryTestDataProviderForInversionBreakpoints.LONG_CONTIG1; import static org.testng.Assert.assertEquals; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java index 846e212e7db..3ccbb84c03d 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/AssemblyContigAlignmentsConfigPickerUnitTest.java @@ -15,7 +15,7 @@ import java.util.*; import java.util.stream.Collectors; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY; import static 
org.broadinstitute.hellbender.tools.spark.sv.discovery.TestUtilsForAssemblyBasedSVDiscovery.*; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.alignment.AssemblyContigAlignmentsConfigPicker.*; import static org.testng.Assert.assertEquals; diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifierUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifierUnitTest.java index d944f5cbf13..fc45fc9d9e8 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifierUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/alignment/ContigAlignmentsModifierUnitTest.java @@ -111,7 +111,7 @@ public void testGappedAlignmentBreaker_GapSizeSensitivity() { 1, 120, cigar, true, 60, 0, 100, ContigAlignmentsModifier.AlnModType.NONE); final List generatedARList = Utils.stream(ContigAlignmentsModifier.splitGappedAlignment(alignmentInterval, - StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, + StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.GAPPED_ALIGNMENT_BREAK_DEFAULT_SENSITIVITY, cigar.getReadLength())).collect(Collectors.toList()); Assert.assertEquals(generatedARList.size(), 3); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreterUnitTest.java index 43c87e1725d..c2d24f529c4 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreterUnitTest.java +++ 
b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ContigChimericAlignmentIterativeInterpreterUnitTest.java @@ -17,7 +17,7 @@ import java.util.*; -import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD; +import static org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.SimpleSVType.SupportedType.*; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter.firstAlignmentIsTooShort; import static org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.ContigChimericAlignmentIterativeInterpreter.nextAlignmentMayBeInsertion; @@ -37,8 +37,8 @@ public void testFilterByRegionTooSmall() { final AlignmentInterval region1 = new AlignmentInterval(new SimpleInterval(AssemblyBasedSVDiscoveryTestDataProviderForInversionBreakpoints.chrForLongContig1, 20138007, 20142231), 1, contigSequence.length - 1986, TextCigarCodec.decode("1986S236M2D1572M1I798M5D730M1I347M4I535M"), false, 60, 36, 100, ContigAlignmentsModifier.AlnModType.NONE); final AlignmentInterval region2 = new AlignmentInterval(new SimpleInterval(AssemblyBasedSVDiscoveryTestDataProviderForInversionBreakpoints.chrForLongContig1, 20152030, 20154634), 3604, contigSequence.length, TextCigarCodec.decode("3603H24M1I611M1I1970M"), true, 60, 36, 100, ContigAlignmentsModifier.AlnModType.NONE); - Assert.assertFalse( firstAlignmentIsTooShort(region1, region2, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH) ); - Assert.assertFalse( firstAlignmentIsTooShort(region2, region1, 
StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH) ); + Assert.assertFalse( firstAlignmentIsTooShort(region1, region2, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH) ); + Assert.assertFalse( firstAlignmentIsTooShort(region2, region1, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH) ); Assert.assertFalse( firstAlignmentIsTooShort(region1, region2, 3000) ); Assert.assertTrue( firstAlignmentIsTooShort(region2, region1, 3000) ); @@ -123,7 +123,7 @@ public void testGetAssembledBreakpointsFromAlignmentIntervalsWithOverlappingAlig final AlignmentInterval region3 = new AlignmentInterval(new SimpleInterval("20", 23103633, 23104602), 556, 1525, TextCigarCodec.decode("555S970M"), true, 60, 3, 100, ContigAlignmentsModifier.AlnModType.NONE); final AlignedContig alignedContig = new AlignedContig("asm00001:tig0001", contigSequence, Arrays.asList(region1, region2, region3)); - final List assembledBreakpointsFromAlignmentIntervals = ContigChimericAlignmentIterativeInterpreter.parseOneContig(alignedContig, TestUtilsForAssemblyBasedSVDiscovery.b37_seqDict, true, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD, true); + final List assembledBreakpointsFromAlignmentIntervals = ContigChimericAlignmentIterativeInterpreter.parseOneContig(alignedContig, TestUtilsForAssemblyBasedSVDiscovery.b37_seqDict, true, StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.DEFAULT_MIN_ALIGNMENT_LENGTH, 
StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection.CHIMERIC_ALIGNMENTS_HIGHMQ_THRESHOLD, true); Assert.assertEquals(assembledBreakpointsFromAlignmentIntervals.size(), 1); final SimpleChimera simpleChimera = assembledBreakpointsFromAlignmentIntervals.get(0); Assert.assertEquals(simpleChimera.sourceContigName, "asm00001:tig0001"); diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ImpreciseVariantDetectorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ImpreciseVariantDetectorUnitTest.java index 3b80bc9383b..c3486a61a69 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ImpreciseVariantDetectorUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/ImpreciseVariantDetectorUnitTest.java @@ -68,10 +68,10 @@ public Object[][] getEvidenceTargetLinksAndVariants() { public void testProcessEvidenceTargetLinks(final List etls, final List expectedVariants) { final int impreciseEvidenceVariantCallingThreshold = - new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection().impreciseVariantEvidenceThreshold; + new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection().impreciseVariantEvidenceThreshold; final int maxCallableImpreciseVariantDeletionSize = - new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigsAlignmentsSparkArgumentCollection().maxCallableImpreciseVariantDeletionSize; + new StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection().maxCallableImpreciseVariantDeletionSize; final ReferenceMultiSource referenceMultiSource = new ReferenceMultiSource(twoBitRefURL, ReferenceWindowFunctions.IDENTITY_FUNCTION); diff --git 
a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractorUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractorUnitTest.java index 3b08393ce2d..ac915c3dad6 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractorUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/discovery/inference/SegmentedCpxVariantSimpleVariantExtractorUnitTest.java @@ -32,6 +32,8 @@ public class SegmentedCpxVariantSimpleVariantExtractorUnitTest extends GATKBaseTest { + private static final Random random = new Random(1); + private static final ZeroAndOneSegmentCpxVariantExtractor zeroAndOneSegmentCpxVariantExtractor = new ZeroAndOneSegmentCpxVariantExtractor(); private static final MultiSegmentsCpxVariantExtractor multiSegmentsCpxVariantExtractor = new MultiSegmentsCpxVariantExtractor(); @@ -47,8 +49,10 @@ private List caseForZeroAndOneSegmentCalls() { Arrays.asList("-chr18:11642876-11642927","UINS-496")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, Collections.singletonList(makeInsertion("chr20",51740560, 51740560, 549, Allele.create("A", true)) - .attribute(EVENT_KEY, "CPX_chr20:51740560-51740561") - .attribute(CONTIG_NAMES, "asm028558:tig00002,asm028558:tig00003").make()) + .attribute(CPX_EVENT_KEY, "CPX_chr20:51740560-51740561") + .attribute(CONTIG_NAMES, "asm028558:tig00002,asm028558:tig00003") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 2. 
one segment -> with inversion @@ -58,8 +62,10 @@ private List caseForZeroAndOneSegmentCalls() { Collections.singletonList("-1")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, Collections.singletonList(makeInversion(new SimpleInterval("chr21:402807-402904"), Allele.create("N", true)) // THE REF ALLELE N HERE IS BECAUSE OF COORDINATE MESSING on TEST DATA MENTIONED ABOVE - .attribute(EVENT_KEY, "CPX_chr21:402806-402905") - .attribute(CONTIG_NAMES, "asm002252:tig00003").make()) + .attribute(CPX_EVENT_KEY, "CPX_chr21:402806-402905") + .attribute(CONTIG_NAMES, "asm002252:tig00003") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 3. one segment -> when deletion is not allowed @@ -80,7 +86,9 @@ private List caseForZeroAndOneSegmentCalls() { Arrays.asList("1","UINS-28","1","UINS-64","1")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, Collections.singletonList(makeInsertion("chr20", 18675720, 18675720, 408, Allele.create("A", true)) - .attribute(EVENT_KEY, "CPX_chr20:18675721-18675877").attribute(CONTIG_NAMES, "asm028012:tig00004").make()) + .attribute(CPX_EVENT_KEY, "CPX_chr20:18675721-18675877").attribute(CONTIG_NAMES, "asm028012:tig00004") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 3.2 -> 1, ..... 
@@ -89,7 +97,10 @@ private List caseForZeroAndOneSegmentCalls() { Collections.singletonList("asm028821:tig00001"), Collections.singletonList(new SimpleInterval("chr20:64096905-64097041")), Arrays.asList("1","1","UINS-166")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, - Collections.singletonList(makeInsertion("chr20", 64097041, 64097041, 318, Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr20:64096905-64097041").attribute(CONTIG_NAMES, "asm028821:tig00001").make()) + Collections.singletonList(makeInsertion("chr20", 64097041, 64097041, 318, Allele.create("A", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:64096905-64097041").attribute(CONTIG_NAMES, "asm028821:tig00001") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 3.3 -> ...., 1, .... @@ -98,8 +109,15 @@ private List caseForZeroAndOneSegmentCalls() { Arrays.asList("asm028558:tig00000", "asm028558:tig00001"), Collections.singletonList(new SimpleInterval("chr20:51740561-51741034")), Arrays.asList("-chr18:11642876-11642927","UINS-496","1","UINS-49")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, - Arrays.asList(makeInsertion("chr20", 51740560, 51740560, 549, Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr20:51740560-51741035").attribute(CONTIG_NAMES, "asm028558:tig00000,asm028558:tig00001").make(), - makeInsertion("chr20", 51741034, 51741034, 50, Allele.create("T", true)).attribute(EVENT_KEY, "CPX_chr20:51740560-51741035").attribute(CONTIG_NAMES, "asm028558:tig00000,asm028558:tig00001").make()) + Arrays.asList( + makeInsertion("chr20", 51740560, 51740560, 549, Allele.create("A", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:51740560-51741035").attribute(CONTIG_NAMES, "asm028558:tig00000,asm028558:tig00001") + .attribute(MAX_ALIGN_LENGTH, 
complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make(), + makeInsertion("chr20", 51741034, 51741034, 50, Allele.create("T", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:51740560-51741035").attribute(CONTIG_NAMES, "asm028558:tig00000,asm028558:tig00001") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 4. one segment -> whenNoInvAndNoAsIsAppearance @@ -110,7 +128,10 @@ private List caseForZeroAndOneSegmentCalls() { Collections.singletonList("asm028026:tig00000"), Collections.singletonList(new SimpleInterval("chr20:20269131-20269199")), Collections.singletonList("-chrX:137700299-137700331")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, - Collections.singletonList(makeDeletion(new SimpleInterval("chr20:20269131-20269198"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr20:20269131-20269199").attribute(CONTIG_NAMES, "asm028026:tig00000").make()) + Collections.singletonList(makeDeletion(new SimpleInterval("chr20:20269131-20269198"), Allele.create("A", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:20269131-20269199").attribute(CONTIG_NAMES, "asm028026:tig00000") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 4.2 -> deletion and insertion complex = makeTestComplexVariant(new SimpleInterval("chr20:54849491-54849615"), 15, @@ -118,8 +139,15 @@ private List caseForZeroAndOneSegmentCalls() { Collections.singletonList("asm028586:tig00000"), Collections.singletonList(new SimpleInterval("chr20:54849491-54849615")), Arrays.asList("UINS-36","-chr14:58474127-58474172","UINS-54")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, - 
Arrays.asList(makeDeletion(new SimpleInterval("chr20:54849491-54849614"), Allele.create("C", true)).attribute(EVENT_KEY, "CPX_chr20:54849491-54849615").attribute(CONTIG_NAMES, "asm028586:tig00000").make(), - makeInsertion("chr20", 54849491, 54849491, 140, Allele.create("c", true)).attribute(EVENT_KEY, "CPX_chr20:54849491-54849615").attribute(CONTIG_NAMES, "asm028586:tig00000").make()) + Arrays.asList( + makeDeletion(new SimpleInterval("chr20:54849491-54849614"), Allele.create("C", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:54849491-54849615").attribute(CONTIG_NAMES, "asm028586:tig00000") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make(), + makeInsertion("chr20", 54849491, 54849491, 140, Allele.create("c", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:54849491-54849615").attribute(CONTIG_NAMES, "asm028586:tig00000") + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); // 4.3 -> fat insertion @@ -128,7 +156,10 @@ private List caseForZeroAndOneSegmentCalls() { Collections.singletonList("asm027960:tig00003"), Collections.singletonList(new SimpleInterval("chr20:12558793-12558810")), Arrays.asList("-chrX:99014092-99014129","UINS-101")); data.add(new Object[]{complex, b38_reference_chr20_chr21, zeroAndOneSegmentCpxVariantExtractor, - Collections.singletonList(makeInsertion("chr20", 12558793, 12558809, 133, Allele.create("AAAAAAAAAAAAAAAAA", true)).attribute(EVENT_KEY, "CPX_chr20:12558793-12558810").attribute(CONTIG_NAMES, "asm027960:tig00003").make()) + Collections.singletonList(makeInsertion("chr20", 12558793, 12558809, 133, Allele.create("AAAAAAAAAAAAAAAAA", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:12558793-12558810").attribute(CONTIG_NAMES, "asm027960:tig00003") + .attribute(MAX_ALIGN_LENGTH, 
complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()) }); return data; @@ -147,7 +178,9 @@ private List caseForMultiSegmentsCalls() { List refSegments = Arrays.asList(new SimpleInterval("chr21:21264944-21264988"), new SimpleInterval("chr21:21264988-21265052"), new SimpleInterval("chr21:21265052-21265096")); List altArrangements = Arrays.asList("1","2","3","2","1","2","3"); VariantContext complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - List expectedSimple = Collections.singletonList(makeInsertion("chr21", 21264943, 21264943, 221, Allele.create("G", true)).attribute(EVENT_KEY, "CPX_chr21:21264944-21265096").attribute(CONTIG_NAMES, ctgNames).make()); + List expectedSimple = Collections.singletonList(makeInsertion("chr21", 21264943, 21264943, 221, Allele.create("G", true)).attribute(CPX_EVENT_KEY, "CPX_chr21:21264944-21265096").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 1.2: rear insertion only @@ -159,7 +192,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr20:61919906-61919908"), new SimpleInterval("chr20:61919908-61920054"), new SimpleInterval("chr20:61920054-61920109")); altArrangements = Arrays.asList("1","2","3","UINS-177","1","2","2","3"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Collections.singletonList(makeInsertion("chr20", 61920109, 61920109, 531, Allele.create("G", true)).attribute(EVENT_KEY, "CPX_chr20:61919906-61920109").attribute(CONTIG_NAMES, ctgNames).make()); + 
expectedSimple = Collections.singletonList(makeInsertion("chr20", 61920109, 61920109, 531, Allele.create("G", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:61919906-61920109").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 1.3: front and rear insertion @@ -171,8 +207,15 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr20:38653054-38653113"), new SimpleInterval("chr20:38653113-38653145"), new SimpleInterval("chr20:38653145-38653179"), new SimpleInterval("chr20:38653179-38653273"), new SimpleInterval("chr20:38653273-38653283")); altArrangements = Arrays.asList("1","2","3","4","3","1","2","3","4","5","2","3","4","5"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Arrays.asList(makeInsertion("chr20", 38653053, 38653053, 259, Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr20:38653054-38653283").attribute(CONTIG_NAMES, ctgNames).make(), - makeInsertion("chr20", 38653283, 38653283, 175, Allele.create("G", true)).attribute(EVENT_KEY, "CPX_chr20:38653054-38653283").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Arrays.asList( + makeInsertion("chr20", 38653053, 38653053, 259, Allele.create("A", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:38653054-38653283").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make(), + makeInsertion("chr20", 38653283, 38653283, 175, Allele.create("G", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:38653054-38653283").attribute(CONTIG_NAMES, ctgNames) + 
.attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 2: possibly inversion @@ -186,7 +229,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr20:23122561-23122596"), new SimpleInterval("chr20:23122596-23122666"), new SimpleInterval("chr20:23122666-23122996")); altArrangements = Arrays.asList("1","2","-1"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Collections.singletonList(makeDeletion(new SimpleInterval("chr20:23122666-23122995"), Allele.create("C", true)).attribute(EVENT_KEY, "CPX_chr20:23122561-23122996").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Collections.singletonList(makeDeletion(new SimpleInterval("chr20:23122666-23122995"), Allele.create("C", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:23122561-23122996").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 2.2: no as-is but with inverted, but too short (coordinate and allele massage, original event CPX_chr20:34732145-34733344) @@ -198,7 +244,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr20:34732145-34733303"), new SimpleInterval("chr20:34733303-34733342"), new SimpleInterval("chr20:34733342-34733344")); altArrangements = Arrays.asList("-3","-2","UINS-14","3"); // segment 1 deleted, segment 2 appear inverted but length too short complex = makeTestComplexVariant(affectedInterval, svLen, 
refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Collections.singletonList(makeDeletion(new SimpleInterval("chr20:34732145-34733302"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr20:34732145-34733344").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Collections.singletonList(makeDeletion(new SimpleInterval("chr20:34732145-34733302"), Allele.create("A", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr20:34732145-34733344").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 2.3: no as-is but with inverted, and inverted sequence long enough @@ -210,7 +259,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr21:26001844-26002384"), new SimpleInterval("chr21:26002384-26002386")); altArrangements = Arrays.asList("-1"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Collections.singletonList(makeInversion(new SimpleInterval("chr21:26001844-26002384"), Allele.create("T", true)).attribute(EVENT_KEY, "CPX_chr21:26001843-26002386").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Collections.singletonList(makeInversion(new SimpleInterval("chr21:26001844-26002384"), Allele.create("T", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr21:26001843-26002386").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 3: possibly deletion @@ -225,7 +277,10 @@ 
private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr21:23428920-23428968"), new SimpleInterval("chr21:23428968-23428998"), new SimpleInterval("chr21:23428998-23429023")); altArrangements = Arrays.asList("UINS-84","2","3","UINS-5","2","2","3"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Collections.singletonList(makeInsertion("chr21", 23428920, 23428920, 85, Allele.create("T", true)).attribute(EVENT_KEY, "CPX_chr21:23428920-23429023").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Collections.singletonList(makeInsertion("chr21", 23428920, 23428920, 85, Allele.create("T", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr21:23428920-23429023").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 3.2: deleted range long enough (tested together with 2.1) @@ -241,7 +296,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr20:13062977-13063037"), new SimpleInterval("chr20:13063037-13063272"), new SimpleInterval("chr20:13063272-13063278")); altArrangements = Arrays.asList("1","-2","-1","UINS-14"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Arrays.asList(makeInversion(new SimpleInterval("chr20:13063037-13063272"), Allele.create("G", true)).attribute(EVENT_KEY, "CPX_chr20:13062977-13063278").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Arrays.asList(makeInversion(new SimpleInterval("chr20:13063037-13063272"), Allele.create("G", true)) + .attribute(CPX_EVENT_KEY, 
"CPX_chr20:13062977-13063278").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 4.2: front insertion only (tested together with 3.1) @@ -255,7 +313,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr21:36680290-36680331"), new SimpleInterval("chr21:36680331-36680659"), new SimpleInterval("chr21:36680659-36680686")); altArrangements = Arrays.asList("1","2","1","UINS-249"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, refSegments, altArrangements); - expectedSimple = Arrays.asList(makeInsertion("chr21", 36680686, 36680686, 250, Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr21:36680290-36680686").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Arrays.asList(makeInsertion("chr21", 36680686, 36680686, 250, Allele.create("A", true)) + .attribute(CPX_EVENT_KEY, "CPX_chr21:36680290-36680686").attribute(CONTIG_NAMES, ctgNames) + .attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)) + .attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); // case 4.4: front and rear insertion (coordinate and allele massage, original event CPX_chr19:8888822-8895655) @@ -267,10 +328,10 @@ private List caseForMultiSegmentsCalls() { refSegments = Arrays.asList(new SimpleInterval("chr20:8888822-8895288"), new SimpleInterval("chr20:8895288-8895361"), new SimpleInterval("chr20:8895361-8895655")); altArrangements = Arrays.asList("UINS-297","2","UINS-280"); complex = makeTestComplexVariant(affectedInterval, svLen, refAllele, altSeq, ctgNames, 
refSegments, altArrangements); - expectedSimple = Arrays.asList(makeDeletion(new SimpleInterval("chr20:8888822-8895287"), Allele.create("G", true)).attribute(EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).make(), - makeDeletion(new SimpleInterval("chr20:8895361-8895654"), Allele.create("T", true)).attribute(EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).make(), - makeInsertion("chr20", 8888822, 8888822, 298, Allele.create("G", true)).attribute(EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).make(), - makeInsertion("chr20", 8895655, 8895655, 281, Allele.create("C", true)).attribute(EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).make()); + expectedSimple = Arrays.asList(makeDeletion(new SimpleInterval("chr20:8888822-8895287"), Allele.create("G", true)).attribute(CPX_EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)).attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make(), + makeDeletion(new SimpleInterval("chr20:8895361-8895654"), Allele.create("T", true)).attribute(CPX_EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)).attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make(), + makeInsertion("chr20", 8888822, 8888822, 298, Allele.create("G", true)).attribute(CPX_EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).attribute(MAX_ALIGN_LENGTH, complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)).attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make(), + makeInsertion("chr20", 8895655, 8895655, 281, Allele.create("C", true)).attribute(CPX_EVENT_KEY, "CPX_chr20:8888822-8895655").attribute(CONTIG_NAMES, ctgNames).attribute(MAX_ALIGN_LENGTH, 
complex.getAttributeAsInt(MAX_ALIGN_LENGTH, 0)).attribute(MAPPING_QUALITIES, complex.getAttributeAsString(MAPPING_QUALITIES, "")).make()); data.add(new Object[]{complex, b38_reference_chr20_chr21, multiSegmentsCpxVariantExtractor, expectedSimple}); return data; @@ -418,7 +479,7 @@ private Object[][] forPostProcessConvertShortDupToIns() { .attribute(SEQ_ALT_HAPLOTYPE, "GTTGGGTGTACTCGGAGATCCGGTCGATGGCGTTGGGTGTACTCGGAGATCCAGTCGGTGGCGTTGGGTGTACTCGGAGATCCAGTCGGTGGCGTTGGGTGTACTCGGAGATCCAGTCGGTGGCGTTGGGTGTACTCGGAGATCCAGTGGATGGCGTTGGGTGTACTCGGAGATCCAGTCGGTGGCGTTGGGTGTACTCGGAGATCCAGTGGATGGCGTTGGGTGTACTCGGAGATCCAGTCGGTGGCGTTGGGTGTACTCAGAGATCCAGCTGATGGCATTCAGCGTACTCGGAGATCCAGTTGATGGTGTTGGGTGTTCTCGGAGATCCAGTCGGTGGCGTTGGGTGTACTCAGAGATCCAGTTGATGGCATTCAGTGTACTCGGAGATCTAGTCGATGGCGTTGGGTGTACTCGGAGATCCAGTTGATGGCATTCAGCGTACTCGGAGATCCAGTTGATGGTGTTGGGTGTTCTCGGAGATCCAGTCGGTGGTGTTGGGTGTACTCAGAGATCCAGTTGATGGCATTCATTGTACTCGGAGATCCAGTCGATGGCGTTGGGTGTACTTGGAGATCCAGTCGGTGGCGTTGGGTGTACTTGGAGATCC") .attribute(SVLEN, 403) .attribute(SVTYPE, "DUP") - .attribute(EVENT_KEY, "CPX_chr2:241987323-241987529") + .attribute(CPX_EVENT_KEY, "CPX_chr2:241987323-241987529") .attribute(DUPLICATION_NUMBERS, "1,2") .attribute(DUP_ORIENTATIONS, "++") .attribute(DUP_REPEAT_UNIT_REF_SPAN, "chr2:241987323-241987529") @@ -444,7 +505,7 @@ private Object[][] forPostProcessConvertShortDupToIns() { .attribute(SEQ_ALT_HAPLOTYPE, "TATATTATAATATATTTTAATATTATATTATATTATAATATATTTTAATATTATATTATATTATAATATATTTTAATATTATATTATATTATAATATATTTTAATATATTATAATATATTTTAATATTATATTATATTATAATATATT") .attribute(SVLEN, 104) .attribute(SVTYPE, "DUP") - .attribute(EVENT_KEY, "CPX_chr2:83340902-83340950") + .attribute(CPX_EVENT_KEY, "CPX_chr2:83340902-83340950") .attribute(DUPLICATION_NUMBERS, "1,2") .attribute(DUP_ORIENTATIONS, "+-") .attribute(DUP_REPEAT_UNIT_REF_SPAN, "chr2:83340907-83340950") @@ -467,7 +528,7 @@ private Object[][] forPostProcessConvertShortDupToIns() { .attribute(SEQ_ALT_HAPLOTYPE, 
"TATATTATAATATATTTTAATATTATATTATATTATAATATATTTTAATATTATATTATATTATAATATATTTTAATATTATATTATATTATAATATATTTTAATATATTATAATATATTTTAATATTATATTATATTATAATATATT") .attribute(SVLEN, 104) .attribute(SVTYPE, "INS") - .attribute(EVENT_KEY, "CPX_chr2:83340902-83340950") + .attribute(CPX_EVENT_KEY, "CPX_chr2:83340902-83340950") .attribute(DUPLICATION_NUMBERS, "1,2") .attribute(DUP_ORIENTATIONS, "+-") .attribute(DUP_REPEAT_UNIT_REF_SPAN, "chr2:83340907-83340950") @@ -494,22 +555,22 @@ private Object[][] forPostProcessConvertReplacementToFatInsOrInsAndDel() { }); // fat insertion - VariantContext deletion = makeDeletion(new SimpleInterval("chr21:23428920-23428967"), Allele.create("T", true)).attribute(ALIGN_LENGTHS, "56,56").attribute(EVENT_KEY, "CPX_chr21:23428920-23429023").attribute(CONTIG_NAMES, "asm029052:tig00000,asm029052:tig00001").attribute(HQ_MAPPINGS, 2).attribute(INSERTED_SEQUENCE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(INSERTED_SEQUENCE_LENGTH, 85).attribute(INSERTED_SEQUENCE_MAPPINGS, "1330_1385_chr21:23428968-23429023_+_1329H56M1200H_49_4_36_O,1330_1385_chr21:23428968-23429023_+_1329H56M1200H_49_4_36_O").attribute(MAPPING_QUALITIES, "60,60").attribute(MAX_ALIGN_LENGTH, 56).attribute(SEQ_ALT_HAPLOTYPE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(TOTAL_MAPPINGS, 2).make(); - VariantContext fatInsertion = makeInsertion("chr21", 23428920, 23428967, 85, Allele.create("TTTATATAAATATATATAAATATATAATATATAATAATATAATATAAT", true)).attribute(ALIGN_LENGTHS, "56,56").attribute(EVENT_KEY, "CPX_chr21:23428920-23429023").attribute(CONTIG_NAMES, "asm029052:tig00000,asm029052:tig00001").attribute(HQ_MAPPINGS, 2).attribute(INSERTED_SEQUENCE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(INSERTED_SEQUENCE_LENGTH, 85).attribute(MAPPING_QUALITIES, "60,60").attribute(MAX_ALIGN_LENGTH, 56).attribute(SEQ_ALT_HAPLOTYPE, 
"ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(TOTAL_MAPPINGS, 2).make(); + VariantContext deletion = makeDeletion(new SimpleInterval("chr21:23428920-23428967"), Allele.create("T", true)).attribute(ALIGN_LENGTHS, "56,56").attribute(CPX_EVENT_KEY, "CPX_chr21:23428920-23429023").attribute(CONTIG_NAMES, "asm029052:tig00000,asm029052:tig00001").attribute(HQ_MAPPINGS, 2).attribute(INSERTED_SEQUENCE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(INSERTED_SEQUENCE_LENGTH, 85).attribute(INSERTED_SEQUENCE_MAPPINGS, "1330_1385_chr21:23428968-23429023_+_1329H56M1200H_49_4_36_O,1330_1385_chr21:23428968-23429023_+_1329H56M1200H_49_4_36_O").attribute(MAPPING_QUALITIES, "60,60").attribute(MAX_ALIGN_LENGTH, 56).attribute(SEQ_ALT_HAPLOTYPE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(TOTAL_MAPPINGS, 2).make(); + VariantContext fatInsertion = makeInsertion("chr21", 23428920, 23428967, 85, Allele.create("TTTATATAAATATATATAAATATATAATATATAATAATATAATATAAT", true)).attribute(ALIGN_LENGTHS, "56,56").attribute(CPX_EVENT_KEY, "CPX_chr21:23428920-23429023").attribute(CONTIG_NAMES, "asm029052:tig00000,asm029052:tig00001").attribute(HQ_MAPPINGS, 2).attribute(INSERTED_SEQUENCE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(INSERTED_SEQUENCE_LENGTH, 85).attribute(MAPPING_QUALITIES, "60,60").attribute(MAX_ALIGN_LENGTH, 56).attribute(SEQ_ALT_HAPLOTYPE, "ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA").attribute(TOTAL_MAPPINGS, 2).make(); data.add(new Object[]{deletion, Collections.singletonList(fatInsertion) }); // deletion with small insertion, i.e. 
no modification - VariantContext deletionWithMicroInsertion = new VariantContextBuilder().chr("chr20").start(63093346).stop(63094245).alleles(Arrays.asList(Allele.create("G", true), Allele.create(""))).attribute(VCFConstants.END_KEY, 63094245).attribute(SVTYPE, "DEL").attribute(SVLEN, -899).attribute(INSERTED_SEQUENCE_LENGTH, 1).attribute(INSERTED_SEQUENCE, "T").attribute(ALIGN_LENGTHS,942).attribute(EVENT_KEY, "CPX_chr20:63092255-63094246").attribute(CONTIG_NAMES, "asm028762:tig00002").attribute(HQ_MAPPINGS, 1).attribute(MAPPING_QUALITIES, "60").attribute(MAX_ALIGN_LENGTH,942).attribute(SEQ_ALT_HAPLOTYPE, "T").attribute(TOTAL_MAPPINGS, 1).make(); + VariantContext deletionWithMicroInsertion = new VariantContextBuilder().chr("chr20").start(63093346).stop(63094245).alleles(Arrays.asList(Allele.create("G", true), Allele.create(""))).attribute(VCFConstants.END_KEY, 63094245).attribute(SVTYPE, "DEL").attribute(SVLEN, -899).attribute(INSERTED_SEQUENCE_LENGTH, 1).attribute(INSERTED_SEQUENCE, "T").attribute(ALIGN_LENGTHS,942).attribute(CPX_EVENT_KEY, "CPX_chr20:63092255-63094246").attribute(CONTIG_NAMES, "asm028762:tig00002").attribute(HQ_MAPPINGS, 1).attribute(MAPPING_QUALITIES, "60").attribute(MAX_ALIGN_LENGTH,942).attribute(SEQ_ALT_HAPLOTYPE, "T").attribute(TOTAL_MAPPINGS, 1).make(); data.add(new Object[]{deletionWithMicroInsertion, Collections.singletonList(deletionWithMicroInsertion) }); // deletion and insertion at the same time (location massaging) - VariantContext sourceDeletion = makeDeletion(new SimpleInterval("chr20:440444-440697"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").attribute(TOTAL_MAPPINGS, 1).attribute(MAPPING_QUALITIES, 60).attribute(HQ_MAPPINGS, 1).attribute(ALIGN_LENGTHS, 170).attribute(MAX_ALIGN_LENGTH, 170).attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(INSERTED_SEQUENCE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").attribute(SEQ_ALT_HAPLOTYPE, 
"TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").make(); - VariantContext linkedDel = makeDeletion(new SimpleInterval("chr20:440444-440697"), Allele.create("A", true)).attribute(LINK, "INS_chr20_440444_440444_CPX_DERIVED").attribute(EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").attribute(TOTAL_MAPPINGS, 1).attribute(MAPPING_QUALITIES, 60).attribute(HQ_MAPPINGS, 1).attribute(ALIGN_LENGTHS, 170).attribute(MAX_ALIGN_LENGTH, 170).make(); - VariantContext linkedIns = makeInsertion("chr20", 440444, 440444, 60, Allele.create("A", true)).attribute(LINK, "DEL_chr20_440444_440697_CPX_DERIVED").attribute(EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").attribute(TOTAL_MAPPINGS, 1).attribute(MAPPING_QUALITIES, 60).attribute(HQ_MAPPINGS, 1).attribute(ALIGN_LENGTHS, 170).attribute(MAX_ALIGN_LENGTH, 170).attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(INSERTED_SEQUENCE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").attribute(SEQ_ALT_HAPLOTYPE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").make(); + VariantContext sourceDeletion = makeDeletion(new SimpleInterval("chr20:440444-440697"), Allele.create("A", true)).attribute(CPX_EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").attribute(TOTAL_MAPPINGS, 1).attribute(MAPPING_QUALITIES, 60).attribute(HQ_MAPPINGS, 1).attribute(ALIGN_LENGTHS, 170).attribute(MAX_ALIGN_LENGTH, 170).attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(INSERTED_SEQUENCE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").attribute(SEQ_ALT_HAPLOTYPE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").make(); + VariantContext linkedDel = makeDeletion(new SimpleInterval("chr20:440444-440697"), Allele.create("A", true)).attribute(LINK, "INS_chr20_440444_440444_CPX_DERIVED").attribute(CPX_EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, 
"asm030101:tig00001").attribute(TOTAL_MAPPINGS, 1).attribute(MAPPING_QUALITIES, 60).attribute(HQ_MAPPINGS, 1).attribute(ALIGN_LENGTHS, 170).attribute(MAX_ALIGN_LENGTH, 170).make(); + VariantContext linkedIns = makeInsertion("chr20", 440444, 440444, 60, Allele.create("A", true)).attribute(LINK, "DEL_chr20_440444_440697_CPX_DERIVED").attribute(CPX_EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").attribute(TOTAL_MAPPINGS, 1).attribute(MAPPING_QUALITIES, 60).attribute(HQ_MAPPINGS, 1).attribute(ALIGN_LENGTHS, 170).attribute(MAX_ALIGN_LENGTH, 170).attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(INSERTED_SEQUENCE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").attribute(SEQ_ALT_HAPLOTYPE, "TTCATACACACACAGATACACACCCGCGCACACACAGATGCACACACACACCCGTACACT").make(); data.add(new Object[]{sourceDeletion, Arrays.asList(linkedIns, linkedDel) }); @@ -531,29 +592,29 @@ private Object[][] forTestRemoveDuplicates() { final List sourceWithMoreAnnotations = new ArrayList<>(); final List expected = new ArrayList<>(); - final VariantContext firstInsertion = makeInsertion("chr21", 46069065, 46069065, 60, Allele.create("C", true)).attribute(EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002").make(); - final VariantContext firstInsertionWithMoreAnnotations = makeInsertion("chr21", 46069065, 46069065, 60, Allele.create("C", true)).attribute(EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002") + final VariantContext firstInsertion = makeInsertion("chr21", 46069065, 46069065, 60, Allele.create("C", true)).attribute(CPX_EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002").make(); + final VariantContext firstInsertionWithMoreAnnotations = makeInsertion("chr21", 46069065, 46069065, 60, Allele.create("C", true)).attribute(CPX_EVENT_KEY, 
"CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002") .attribute(TOTAL_MAPPINGS, 2).attribute(MAPPING_QUALITIES, "60,60").attribute(HQ_MAPPINGS, "2").attribute(ALIGN_LENGTHS, "91,91").attribute(MAX_ALIGN_LENGTH, "91").attribute(INSERTED_SEQUENCE, "CTAGGTGTGTGCATGTGTGCACACGTGTGTGCATGTGTGTGCATGTGTGCACACGTGTGT").attribute(INSERTED_SEQUENCE_LENGTH, 60).attribute(SEQ_ALT_HAPLOTYPE, "CTAGGTGTGTGCATGTGTGCACACGTGTGTGCATGTGTGTGCATGTGTGCACACGTGTGT").make(); sourceWithLessAnnotations.add(firstInsertion); sourceWithMoreAnnotations.add(firstInsertionWithMoreAnnotations); expected.add(firstInsertionWithMoreAnnotations); - final VariantContext firstDeletion = makeDeletion(new SimpleInterval("chr21:46069156-46069208"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002").make(); - final VariantContext firstDeletionWitMoreAnnotations = makeDeletion(new SimpleInterval("chr21:46069156-46069208"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002") + final VariantContext firstDeletion = makeDeletion(new SimpleInterval("chr21:46069156-46069208"), Allele.create("A", true)).attribute(CPX_EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002").make(); + final VariantContext firstDeletionWitMoreAnnotations = makeDeletion(new SimpleInterval("chr21:46069156-46069208"), Allele.create("A", true)).attribute(CPX_EVENT_KEY, "CPX_chr21:46069065-46069209").attribute(CONTIG_NAMES, "asm029362:tig00001,asm029362:tig00002") .attribute(TOTAL_MAPPINGS, 2).attribute(MAPPING_QUALITIES, "60,60").attribute(HQ_MAPPINGS, "2").attribute(ALIGN_LENGTHS, "91,91").attribute(MAX_ALIGN_LENGTH, "91").make(); sourceWithLessAnnotations.add(firstDeletion); sourceWithMoreAnnotations.add(firstDeletionWitMoreAnnotations); 
expected.add(firstDeletionWitMoreAnnotations); // locations below seems inconsistent with the annotations, but that's because we artificially put data on chr20 and 21 - final VariantContext insertionFromStringParsing = makeInsertion("chr20", 439692, 439692, 130, Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").make(); - final VariantContext deletionFromStringParsing = makeDeletion(new SimpleInterval("chr20:439692-440161"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").make(); + final VariantContext insertionFromStringParsing = makeInsertion("chr20", 439692, 439692, 130, Allele.create("A", true)).attribute(CPX_EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").make(); + final VariantContext deletionFromStringParsing = makeDeletion(new SimpleInterval("chr20:439692-440161"), Allele.create("A", true)).attribute(CPX_EVENT_KEY, "CPX_chrX:439692-440698").attribute(CONTIG_NAMES, "asm030101:tig00001").make(); sourceWithLessAnnotations.add(insertionFromStringParsing); expected.add(insertionFromStringParsing); sourceWithLessAnnotations.add(deletionFromStringParsing); expected.add(deletionFromStringParsing); - final VariantContext inversionFromStringParsing = makeInversion(new SimpleInterval("chr21:187497346-187497595"), Allele.create("A", true)).attribute(EVENT_KEY, "CPX_chr1:187495696-187497598").attribute(CONTIG_NAMES, "asm001762:tig00000,asm001762:tig00001,asm001763:tig00000").make(); + final VariantContext inversionFromStringParsing = makeInversion(new SimpleInterval("chr21:187497346-187497595"), Allele.create("A", true)).attribute(CPX_EVENT_KEY, "CPX_chr1:187495696-187497598").attribute(CONTIG_NAMES, "asm001762:tig00000,asm001762:tig00001,asm001763:tig00000").make(); sourceWithLessAnnotations.add(inversionFromStringParsing); expected.add(inversionFromStringParsing); @@ -575,7 +636,7 @@ private 
Object[][] forTestRemoveDuplicates() { .attribute(DUP_SEQ_CIGARS, "109M,109M") .attribute(DUP_TAN_EXPANSION_STRING, "") .attribute(SEQ_ALT_HAPLOTYPE, "CATTGTGACTTATCTCTGCACTGATCACCCAGGTGATGTAACTCTTGTCTAGGCTCTGGCCACAGGGACATAGTGACATATATCTGCACTGATCACACAGGTAATGTAACTGTTGACTAGTCTTTGCCTACAGAGGGCGTTGTGACATATCTCTGCACTGATCTCTCAGGTGAGGTAACTTCTCTAGTCTCTGCCTACAGAGGGCATTGTGACATCACTCTGCAATGATCACCCAGGTGATGTAACTCTTGTCTAGGCTCTGCCTACATGGACATTGTGACATGTCTCTGCACTGATCACCCAGGTGATGTAA") - .attribute(EVENT_KEY, "CPX_chrY:56838784-56839794") + .attribute(CPX_EVENT_KEY, "CPX_chrY:56838784-56839794") .attribute(CONTIG_NAMES, "asm031346:tig00004") .make(); sourceWithMoreAnnotations.add(tandemDuplicationFromPairIteration); @@ -706,10 +767,16 @@ public void testGetMissingAndPresentAndInvertedSegments(final List contigNames, - final List referenceSegments, final List altArrangement) { + private VariantContext makeTestComplexVariant(final SimpleInterval affectedRefRegion, final int svLen, + final String referenceBases, final String altSeqBases, + final List contigNames, + final List referenceSegments, final List altArrangement) { + final int maxAlignmentLength = random.nextInt(4000) + 1; + int evidenceContigCount = contigNames.size(); + StringBuilder mqs = new StringBuilder(); + for (int i = 0; i <= evidenceContigCount; ++i) mqs.append(random.nextInt(61)).append(","); + String mq = mqs.toString(); + final VariantContextBuilder builder = new VariantContextBuilder() .chr(affectedRefRegion.getContig()).start(affectedRefRegion.getStart()).stop(affectedRefRegion.getEnd()) .alleles(Arrays.asList(Allele.create(referenceBases, true), @@ -720,7 +787,9 @@ private static VariantContext makeTestComplexVariant(final SimpleInterval affect .attribute(SVTYPE, CPX_SV_SYB_ALT_ALLELE_STR) .attribute(CPX_EVENT_ALT_ARRANGEMENTS, String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, altArrangement)) .attribute(CONTIG_NAMES, String.join(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR, contigNames)) - .attribute(SEQ_ALT_HAPLOTYPE, 
altSeqBases); + .attribute(SEQ_ALT_HAPLOTYPE, altSeqBases) + .attribute(MAX_ALIGN_LENGTH, maxAlignmentLength) + .attribute(MAPPING_QUALITIES, mq.substring(0, mq.length() - 1)); // drop last coma if (referenceSegments.isEmpty()) return builder.make(); else diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFUtilsUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFUtilsUnitTest.java index dad9b6a53da..18c69eadcf8 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFUtilsUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/GATKSVVCFUtilsUnitTest.java @@ -20,27 +20,51 @@ public class GATKSVVCFUtilsUnitTest extends GATKBaseTest { + private static final Stream expectedAltAlleleHeaderKeysInVCF + = Stream.of("INV", "DEL", "INS", "DUP", "DUP:INV", "CPX"); + private static final Stream expectedInfoHeaderKeysInVCF + = Stream.of("SVTYPE", "SVLEN", "MATEID", + "CIPOS", "CIEND", "IMPRECISE", "READ_PAIR_SUPPORT", "SPLIT_READ_SUPPORT", "LINK", + "CTG_NAMES", "TOTAL_MAPPINGS", "MAPPING_QUALITIES", "HQ_MAPPINGS", "ALIGN_LENGTHS", "MAX_ALIGN_LENGTH", + "SEQ_ALT_HAPLOTYPE", "INSSEQ", "INSLEN", "INSSEQ_MAP", "HOMSEQ", "HOMLEN", "DUP_REPEAT_UNIT_REF_SPAN", + "DUP_SEQ_CIGARS", "DUP_NUM", "DUP_ANNOTATIONS_IMPRECISE", "CONTRACTION", "EXPANSION", "DUP_ORIENTATIONS", + "INV33", "INV55", "EXTERNAL_CNV_CALLS", "DUP_IMPRECISE_AFFECTED_RANGE", "CTG_GOOD_NONCANONICAL_MAPPING", + "ALT_ARRANGEMENT", "SEGMENTS", "CPX_EVENT"); + private static final Stream expectedFormatHeaderKeysInVCF + = Stream.of("CN", "CNQ"); + private static final Stream expectedFilterHeaderKeysInVCF + = Stream.of("LOW_MQ", "SHORT_ALN"); + static final List expectedHeaderKeysInVCF + = Stream.of(expectedAltAlleleHeaderKeysInVCF, expectedInfoHeaderKeysInVCF, expectedFormatHeaderKeysInVCF, + expectedFilterHeaderKeysInVCF) + .flatMap(i->i).sorted().collect(Collectors.toList()); + @Test(groups = 
"sv") public void testVCFConstants() { - Assert.assertEquals(GATKSVVCFConstants.expectedHeaderLinesInVCF, + Assert.assertEquals(expectedHeaderKeysInVCF, Stream.of(SVTYPE, SVLEN, IMPRECISE, CIPOS, CIEND, BND_MATEID_STR, SYMB_ALT_ALLELE_INV, READ_PAIR_SUPPORT, SPLIT_READ_SUPPORT, SYMB_ALT_ALLELE_DEL, SYMB_ALT_ALLELE_INS, SYMB_ALT_ALLELE_DUP, SYMB_ALT_ALLELE_INVDUP, CONTIG_NAMES, TOTAL_MAPPINGS, MAPPING_QUALITIES, HQ_MAPPINGS, ALIGN_LENGTHS, MAX_ALIGN_LENGTH, SEQ_ALT_HAPLOTYPE, INSERTED_SEQUENCE, INSERTED_SEQUENCE_LENGTH, INSERTED_SEQUENCE_MAPPINGS, HOMOLOGY, HOMOLOGY_LENGTH, DUP_REPEAT_UNIT_REF_SPAN, DUP_SEQ_CIGARS, DUPLICATION_NUMBERS, DUP_ANNOTATIONS_IMPRECISE, DUP_IMPRECISE_AFFECTED_RANGE, DUP_TAN_CONTRACTION_STRING, DUP_TAN_EXPANSION_STRING, DUP_ORIENTATIONS, INV33, INV55, EXTERNAL_CNV_CALLS, - CTG_GOOD_NONCANONICAL_MAPPING, LINK) + CTG_GOOD_NONCANONICAL_MAPPING, LINK, COPY_NUMBER_FORMAT, COPY_NUMBER_QUALITY_FORMAT, + CPX_SV_SYB_ALT_ALLELE_STR, CPX_EVENT_ALT_ARRANGEMENTS, CPX_SV_REF_SEGMENTS, CPX_EVENT_KEY, + ASSEMBLY_BASED_VARIANT_MQ_FILTER_KEY, ASSEMBLY_BASED_VARIANT_ALN_LENGTH_FILTER_KEY) .sorted().collect(Collectors.toList())); } @Test(groups = "sv") public void testHeaderLines() { - Assert.assertTrue(GATKSVVCFHeaderLines.getFilterLines().isEmpty()); - Assert.assertTrue(GATKSVVCFHeaderLines.getFormatLines().isEmpty()); + final Stream infoHeaders = GATKSVVCFHeaderLines.getInfoLines().stream().map(VCFInfoHeaderLine::getID); final Stream altAlleleHeaders = GATKSVVCFHeaderLines.getSymbAltAlleleLines().stream().map(VCFSimpleHeaderLine::getID); - Assert.assertEquals(Stream.concat(infoHeaders, altAlleleHeaders).sorted().collect(Collectors.toList()), - GATKSVVCFConstants.expectedHeaderLinesInVCF); + final Stream formatHeaders = GATKSVVCFHeaderLines.getFormatLines().stream().map((VCFCompoundHeaderLine::getID)); + final Stream filterHeaders = GATKSVVCFHeaderLines.getFilterLines().stream().map((VCFSimpleHeaderLine::getID)); + + Assert.assertEquals( + 
Stream.of(infoHeaders, altAlleleHeaders, formatHeaders, filterHeaders).flatMap(i -> i).sorted().collect(Collectors.toList()), + expectedHeaderKeysInVCF); } @DataProvider(name = "svVcfFiles") @@ -61,14 +85,14 @@ public void checkTestVcfFiles(final Path svVCFFilePath) { try (final VCFFileReader reader = new VCFFileReader(svVCFFilePath.toFile(), false)) { final VCFHeader fileHeader = reader.getFileHeader(); Assert.assertNotNull(fileHeader.getSequenceDictionary()); - Assert.assertTrue(fileHeader.getFilterLines().isEmpty()); + final List refContigs = fileHeader.getContigLines().stream().map(VCFContigHeaderLine::getID).collect(Collectors.toList()); Assert.assertFalse(refContigs.isEmpty()); - Assert.assertTrue(fileHeader.getFormatHeaderLines().isEmpty()); + final List headerKeys = fileHeader.getIDHeaderLines().stream().map(VCFIDHeaderLine::getID).sorted().collect(Collectors.toList()); Assert.assertTrue(headerKeys.remove(VCFConstants.END_KEY)); Assert.assertTrue(headerKeys.removeAll(refContigs)); - Assert.assertEquals(headerKeys, GATKSVVCFConstants.expectedHeaderLinesInVCF); + Assert.assertEquals(headerKeys, expectedHeaderKeysInVCF); } } } diff --git a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriterUnitTest.java b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriterUnitTest.java index 35b473abab1..849b1cc937f 100644 --- a/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriterUnitTest.java +++ b/src/test/java/org/broadinstitute/hellbender/tools/spark/sv/utils/SVVCFWriterUnitTest.java @@ -74,13 +74,13 @@ public void testSetHeader() { b37_2bit_reference_20_21 , ReferenceWindowFunctions.IDENTITY_FUNCTION).getReferenceSequenceDictionary(null); final VCFHeader vcfHeader = SVVCFWriter.getVcfHeader(referenceSequenceDictionary); Assert.assertNotNull(vcfHeader.getSequenceDictionary()); - Assert.assertTrue(vcfHeader.getFilterLines().isEmpty()); + final List refContigs = 
vcfHeader.getContigLines().stream().map(VCFContigHeaderLine::getID).collect(Collectors.toList()); Assert.assertTrue(refContigs.size()==2); - Assert.assertTrue(vcfHeader.getFormatHeaderLines().isEmpty()); + final List headerKeys = vcfHeader.getIDHeaderLines().stream().map(VCFIDHeaderLine::getID).sorted().collect(Collectors.toList()); Assert.assertTrue(headerKeys.remove(VCFConstants.END_KEY)); Assert.assertTrue(headerKeys.removeAll(refContigs)); - Assert.assertEquals(headerKeys, GATKSVVCFConstants.expectedHeaderLinesInVCF); + Assert.assertEquals(headerKeys, GATKSVVCFUtilsUnitTest.expectedHeaderKeysInVCF); } } diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_1_seg.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_1_seg.vcf index 8eeb1cd1d31..4523cbf067c 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_1_seg.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_1_seg.vcf @@ -1,13 +1,16 @@ ##fileformat=VCFv4.2 +##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= @@ -33,23 +36,28 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= ##contig= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO -chr20 12558793 INS_chr20_12558793_12558809_CPX_DERIVED AAAAAAAAAAAAAAAAA . . CPX_EVENT=CPX_chr20:12558793-12558810;CTG_NAMES=asm027960:tig00003;END=12558809;SVLEN=133;SVTYPE=INS -chr20 18675720 INS_chr20_18675720_18675720_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:18675721-18675877;CTG_NAMES=asm028012:tig00004;END=18675720;SVLEN=408;SVTYPE=INS -chr20 20269131 DEL_chr20_20269131_20269198_CPX_DERIVED A . . 
CPX_EVENT=CPX_chr20:20269131-20269199;CTG_NAMES=asm028026:tig00000;END=20269198;SVLEN=-67;SVTYPE=DEL -chr20 28561412 INS_chr20_28561412_28561412_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:28561412-28561576;CTG_NAMES=asm008294:tig00014;END=28561412;SVLEN=75;SVTYPE=INS -chr20 28561575 INS_chr20_28561575_28561575_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:28561412-28561576;CTG_NAMES=asm008294:tig00014;END=28561575;SVLEN=75;SVTYPE=INS -chr20 51740560 INS_chr20_51740560_51740560_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:51740560-51740561;CTG_NAMES=asm028558:tig00002,asm028558:tig00003;END=51740560;SVLEN=549;SVTYPE=INS -chr20 51740560 INS_chr20_51740560_51740560_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:51740560-51741035;CTG_NAMES=asm028558:tig00000,asm028558:tig00001;END=51740560;SVLEN=549;SVTYPE=INS -chr20 51741034 INS_chr20_51741034_51741034_CPX_DERIVED T . . CPX_EVENT=CPX_chr20:51740560-51741035;CTG_NAMES=asm028558:tig00000,asm028558:tig00001;END=51741034;SVLEN=50;SVTYPE=INS -chr20 54849491 INS_chr20_54849491_54849491_CPX_DERIVED C . . CPX_EVENT=CPX_chr20:54849491-54849615;CTG_NAMES=asm028586:tig00000;END=54849491;SVLEN=140;SVTYPE=INS -chr20 54849491 DEL_chr20_54849491_54849614_CPX_DERIVED C . . CPX_EVENT=CPX_chr20:54849491-54849615;CTG_NAMES=asm028586:tig00000;END=54849614;SVLEN=-123;SVTYPE=DEL -chr20 58695019 INS_chr20_58695019_58695019_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:58695019-58695020;CTG_NAMES=asm028638:tig00002;END=58695019;SVLEN=549;SVTYPE=INS -chr20 64097041 INS_chr20_64097041_64097041_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:64096905-64097041;CTG_NAMES=asm028821:tig00001;END=64097041;SVLEN=318;SVTYPE=INS +chr20 12558793 INS_chr20_12558793_12558809_CPX_DERIVED AAAAAAAAAAAAAAAAA . . CPX_EVENT=CPX_chr20:12558793-12558810;CTG_NAMES=asm027960:tig00003;END=12558809;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=597;SVLEN=133;SVTYPE=INS +chr20 18675720 INS_chr20_18675720_18675720_CPX_DERIVED A . . 
CPX_EVENT=CPX_chr20:18675721-18675877;CTG_NAMES=asm028012:tig00004;END=18675720;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=564;SVLEN=408;SVTYPE=INS +chr20 20269131 DEL_chr20_20269131_20269198_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:20269131-20269199;CTG_NAMES=asm028026:tig00000;END=20269198;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=40;SVLEN=-67;SVTYPE=DEL +chr20 28561412 INS_chr20_28561412_28561412_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:28561412-28561576;CTG_NAMES=asm008294:tig00014;END=28561412;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=416;SVLEN=75;SVTYPE=INS +chr20 28561575 INS_chr20_28561575_28561575_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:28561412-28561576;CTG_NAMES=asm008294:tig00014;END=28561575;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=416;SVLEN=75;SVTYPE=INS +chr20 51740560 INS_chr20_51740560_51740560_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:51740560-51740561;CTG_NAMES=asm028558:tig00002,asm028558:tig00003;END=51740560;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=1104;SVLEN=549;SVTYPE=INS +chr20 51740560 INS_chr20_51740560_51740560_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:51740560-51741035;CTG_NAMES=asm028558:tig00000,asm028558:tig00001;END=51740560;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=895;SVLEN=549;SVTYPE=INS +chr20 51741034 INS_chr20_51741034_51741034_CPX_DERIVED T . . CPX_EVENT=CPX_chr20:51740560-51741035;CTG_NAMES=asm028558:tig00000,asm028558:tig00001;END=51741034;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=895;SVLEN=50;SVTYPE=INS +chr20 54849491 INS_chr20_54849491_54849491_CPX_DERIVED C . . CPX_EVENT=CPX_chr20:54849491-54849615;CTG_NAMES=asm028586:tig00000;END=54849491;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=88;SVLEN=140;SVTYPE=INS +chr20 54849491 DEL_chr20_54849491_54849614_CPX_DERIVED C . . CPX_EVENT=CPX_chr20:54849491-54849615;CTG_NAMES=asm028586:tig00000;END=54849614;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=88;SVLEN=-123;SVTYPE=DEL +chr20 58695019 INS_chr20_58695019_58695019_CPX_DERIVED G . . 
CPX_EVENT=CPX_chr20:58695019-58695020;CTG_NAMES=asm028638:tig00002;END=58695019;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=82;SVLEN=549;SVTYPE=INS +chr20 64097041 INS_chr20_64097041_64097041_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:64096905-64097041;CTG_NAMES=asm028821:tig00001;END=64097041;MAPPING_QUALITIES=54;MAX_ALIGN_LENGTH=166;SVLEN=318;SVTYPE=INS diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_multi_seg.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_multi_seg.vcf index ca4b0c47d58..8c6f4754ec4 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_multi_seg.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/cpx_reinterpreted_simple_multi_seg.vcf @@ -1,13 +1,16 @@ ##fileformat=VCFv4.2 +##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= @@ -33,37 +36,42 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= ##contig= ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO -chr20 2379968 INS_chr20_2379968_2379968_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:2379056-2379968;CTG_NAMES=asm027854:tig00003;END=2379968;SVLEN=213;SVTYPE=INS +chr20 2379968 INS_chr20_2379968_2379968_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:2379056-2379968;CTG_NAMES=asm027854:tig00003;END=2379968;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=1401;SVLEN=213;SVTYPE=INS chr20 23122666 DEL_chr20_23122666_23122995_CPX_DERIVED C . . 
ALIGN_LENGTHS=876,876;CPX_EVENT=CPX_chr20:23122561-23122996;CTG_NAMES=asm028059:tig00000,asm028059:tig00001;END=23122995;HQ_MAPPINGS=2;INSLEN=36;INSSEQ=TTCACCTGCAAGCCCTCCCACACGGTGACATGACAG;INSSEQ_MAP=1254_1295_chr20:23122559-23122600_-_1253H42M874H_60_0_42_O,1254_1295_chr20:23122559-23122600_-_1253H42M874H_60_0_42_O;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=876;SEQ_ALT_HAPLOTYPE=TTCACCTGCAAGCCCTCCCACACGGTGACATGACAG;SVLEN=-329;SVTYPE=DEL;TOTAL_MAPPINGS=2 chr20 38124003 INS-DUPLICATION-TANDEM-EXPANSION_chr20_38124004_38124014_CPX_DERIVED C . . ALIGN_LENGTHS=231,231;CPX_EVENT=CPX_chr20:38123803-38124181,CPX_chr20:38123803-38124181;CTG_NAMES=asm028411:tig00000,asm028411:tig00001;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr20:38124004-38124014;DUP_SEQ_CIGARS=11M,11M;END=38124003;EXPANSION;HQ_MAPPINGS=2;INSLEN=606;INSSEQ=TAATATACCTATTATATATAATATACCTATTATATATAAAATATACCTATTATATATAATATACCTATTATATACATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATAATATATTATATATAATATGCATATTATATGTATTATATATTATATATTATATATATAATATACATATTATATGTATTATATATTATATATTATATATATAATATACATATTATATGTATTATATATTATATATAATATACATATTATATATTATATATTATATACATATTATATATTATATATTATATACATATTATATATTATATACATATTATATATTATATATTATATACATATTATACATTATATATATCTAAAATATATAATACACATTATATATTATATAATACACATTATATATAATATATAATACACATTATATATTATATATAATACACATTATATATTATATATAATACACATTATATATTATATAATACACATTATA;INSSEQ_MAP=1523_1590_chr20:38123803-38123868_+_1522H31M2I35M2301H_60_3_43_O,1523_1590_chr20:38123803-38123868_+_1522H31M2I35M2301H_60_3_43_O;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=231;SEQ_ALT_HAPLOTYPE=TATTATATATATAATATACCTATTATATATAATATACCTATTATATATAAAATATACCTATTATATATAATATACCTATTATATACATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATATAATATACCTATTATATGCATTATATAATATATTATATATAATATGCATATTATATGTATTATATATTATATATTATATATATAATATACATATTATATGTATTATATATTATATATTATATATATAATATACATATTATATGTA
TTATATATTATATATAATATACATATTATATATTATATATTATATACATATTATATATTATATATTATATACATATTATATATTATATACATATTATATATTATATATTATATACATATTATACATTATATATATCTAAAATATATAATACACATTATATATTATATAATACACATTATATATAATATATAATACACATTATATATTATATATAATACACATTATATATTATATATAATACACATTATATATTATATAATACACATTATATATTATATATA;SVLEN=617;SVTYPE=INS;TOTAL_MAPPINGS=2 -chr20 38653053 INS_chr20_38653053_38653053_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:38653054-38653283;CTG_NAMES=asm028418:tig00000;END=38653053;SVLEN=259;SVTYPE=INS +chr20 38653053 INS_chr20_38653053_38653053_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:38653054-38653283;CTG_NAMES=asm028418:tig00000;END=38653053;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=1534;SVLEN=259;SVTYPE=INS chr20 38653112 INS-DUPLICATION-TANDEM-EXPANSION_chr20_38653113_38653268_CPX_DERIVED A . . ALIGN_LENGTHS=224;CPX_EVENT=CPX_chr20:38653054-38653283;CTG_NAMES=asm028418:tig00000;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr20:38653113-38653268;DUP_SEQ_CIGARS=67M44I89M,57M21I99M;END=38653112;EXPANSION;HQ_MAPPINGS=1;INSLEN=135;INSSEQ=CTGGTGATGATAATGGTGGTGGTGGTGGTGATGGTGATGATGATTATGATGGTGGTGGTGGTGGTGGTGGTGCTGGTGATAGTGGTGGTGGTGGTGCTGGTGATGATAATGGTGGTGGTGGTGATGATGGTGATG;INSSEQ_MAP=1546_1871_chr20:38653054-38653385_-_1545H128M15D64M6I105M3I20M1613H_59_53_100_O;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=224;SEQ_ALT_HAPLOTYPE=ATGATTGTGATGGTGGTGTTGGTGGTGGTGATAATGATGGTAGTGGTGGTGGTGATGATGGTGATGATGATTATGATGGTGTGTTGGTGGTGCTGGTGATGATAATGGTGGTGGTGGTGGTGGTGATGGAAATGATGATGATGTTAATTGTGGTGTTGATGATGGTAATGATAATGATTGTGATGGTGGTGTTGGTGGTGCTGGTGATGATAATGGTGGTGGTGGTGGTGATGGTGATGATGATTATGATGGTGGTGGTGGTGGTGGTGGTGCTGGTGATAGTGGTGGTGGTGGTGCTGGTGATGATAATGGTGGTGGTGGTGATGATGGTGATGATGATTATGATGGTGGTGTTGGTGGTGCTGGTGATGATAATCATGCTGGTGGTGGTGGCGTTGATGATGGTGACAGTAGTGGTGATGATGGTGGTGGTGGTGATGGAAATGATGATGATGTTAGTTGTGGTGTTGATGATGGTAATGATAATGATTGTGATGATGGTGGTGGTGGTG;SVLEN=291;SVTYPE=DUP;TOTAL_MAPPINGS=1 -chr20 38653283 INS_chr20_38653283_38653283_CPX_DERIVED G . . 
CPX_EVENT=CPX_chr20:38653054-38653283;CTG_NAMES=asm028418:tig00000;END=38653283;SVLEN=175;SVTYPE=INS -chr20 47895195 DEL_chr20_47895195_47895293_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:47895195-47895581;CTG_NAMES=asm028508:tig00005;END=47895293;SVLEN=-98;SVTYPE=DEL +chr20 38653283 INS_chr20_38653283_38653283_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:38653054-38653283;CTG_NAMES=asm028418:tig00000;END=38653283;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=1534;SVLEN=175;SVTYPE=INS +chr20 47895195 DEL_chr20_47895195_47895293_CPX_DERIVED A . . CPX_EVENT=CPX_chr20:47895195-47895581;CTG_NAMES=asm028508:tig00005;END=47895293;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=289;SVLEN=-98;SVTYPE=DEL chr20 47895482 DEL_chr20_47895482_47895580_CPX_DERIVED G . . ALIGN_LENGTHS=189;CPX_EVENT=CPX_chr20:47895195-47895581;CTG_NAMES=asm028508:tig00005;END=47895580;HQ_MAPPINGS=1;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=189;SVLEN=-98;SVTYPE=DEL;TOTAL_MAPPINGS=1 chr20 61375462 INS-DUPLICATION-TANDEM-EXPANSION_chr20_61375463_61375819_CPX_DERIVED A . . 
ALIGN_LENGTHS=358;CPX_EVENT=CPX_chr20:61375650-61376102;CTG_NAMES=asm028687:tig00001;DUP_ANNOTATIONS_IMPRECISE;DUP_IMPRECISE_AFFECTED_RANGE=chr20:61375463-61376102;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr20:61375463-61375819;END=61375462;EXPANSION;HOMLEN=283;HOMSEQ=GTGATGGTGTGGTTTGTTGATGGTAGTGTGATGCTCTTGGTGCTGGTGGTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGATGATGGTGGTGCTGGTGGTGGTCATAGCACTGGTGGTGATGGTATGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGATGATAGTGGGGTTTATTGATGGTAGTGTGATGGTCTTGGTGGTGCTGATAATGGTGTGGTTTGTTGATGATAGTGTGATGGTCTTGGTGGTGGTGG;HQ_MAPPINGS=1;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=358;SEQ_ALT_HAPLOTYPE=GTGATGGTGTGGTTTATTGATGGCAGTGTGATTGTCTTGGTGGTGGTGATGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGATGATGGTGGCGCTGGTGGTGGTCATAGCACTGGTGGTGGTCATAGCACTGGTGGTGATGGTATGGTTTGTTGATGGTAGTGTGATGATCTTGGTGGTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGATGGTGTAGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGATGATGGTGGTGCTGGTGGTGGTCATAGCACTGGTGGTGATGCTATGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGCTCTTGGTGCTGGTGGTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGATGATGGTGGTGCTGGTGGTGGTCATAGCACTGGTGGTGATGGTATGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGATGATAGTGGGGTTTATTGATGGTAGTGTGATGGTCTTGGTGGTGCTGATAATGGTGTGGTTTGTTGATGATAGTGTGATGGTCTTGGTGGTGGTGGCGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTATGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGGTGGTGATGATGTAGTTTGTTGATGGTAGCGTGATGTTCTTGGTGCTGGTGGTGGTGATGATGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGGTGGTGGTGATAGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGATGCTGGTGATGGTATTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTCGTGGTGGTGGTGTGGTTTGTCGATGGTAGTGTGGTGGTCTTGGTGCTGGTGG;SVLEN=357;SVTYPE=DUP;TOTAL_MAPPINGS=1 chr20 61375649 INS-DUPLICATION-TANDEM-EXPANSION_chr20_61375650_61375892_CPX_DERIVED T . . 
ALIGN_LENGTHS=430;CPX_EVENT=CPX_chr20:61375650-61376102;CTG_NAMES=asm028687:tig00001;DUP_ANNOTATIONS_IMPRECISE;DUP_IMPRECISE_AFFECTED_RANGE=chr20:61375650-61376103;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr20:61375650-61375892;END=61375649;EXPANSION;HOMLEN=211;HOMSEQ=TGGTGGTGGTGATGATGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGGTGGTGGTGATAGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGATGCTGGTGATGGTATTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTCGTGGTGGTGGTGTGGTTTGTCGATGGTAGTGTGGTGGTCTTGGTGCTGGTGGT;HQ_MAPPINGS=1;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=430;SEQ_ALT_HAPLOTYPE=TGGTGGTGATGATAGTGGGGTTTATTGATGGTAGTGTGATGGTCTTGGTGGTGCTGATAATGGTGTGGTTTGTTGATGATAGTGTGATGGTCTTGGTGGTGGTGGCGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTATGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGGTGGTGATGATGTAGTTTGTTGATGGTAGCGTGATGTTCTTGGTGCTGGTGGTGGTGATGATGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGGTGGTGGTGGTGGTGATAGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGATGCTGGTGATGGTATTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTCGTGGTGGTGGTGTGGTTTGTCGATGGTAGTGTGGTGGTCTTGGTGCTGGTGGTTGTCGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGTGGTGATGTGGTTTGTTGATGATAGTGTGATGGTCTTGGTGGTGGTAGTGATGGTGTGGTTTGCTGATGGTAGTGTGATGGTCTTGGTGGTGGTGATACTGTGGTTTGTGGATGGTAGTGTGATGGTCTTGGTGGTGATGGTGTGGTTTGTTGATGGTAGTGTGATGGTCTTGGTGCTGGTGGT;SVLEN=243;SVTYPE=DUP;TOTAL_MAPPINGS=1 -chr20 61375649 INS_chr20_61375649_61375649_CPX_DERIVED T . . CPX_EVENT=CPX_chr20:61375650-61376102;CTG_NAMES=asm028687:tig00001;END=61375649;SVLEN=572;SVTYPE=INS +chr20 61375649 INS_chr20_61375649_61375649_CPX_DERIVED T . . CPX_EVENT=CPX_chr20:61375650-61376102;CTG_NAMES=asm028687:tig00001;END=61375649;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=1559;SVLEN=572;SVTYPE=INS chr20 61919907 INS-DUPLICATION-TANDEM-EXPANSION_chr20_61919908_61920054_CPX_DERIVED C . . 
ALIGN_LENGTHS=149;CPX_EVENT=CPX_chr20:61919906-61920109;CTG_NAMES=asm028707:tig00000;DUP_ANNOTATIONS_IMPRECISE;DUP_IMPRECISE_AFFECTED_RANGE=chr20:61919908-61920109;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr20:61919908-61920054;END=61919907;EXPANSION;HOMLEN=55;HOMSEQ=GTGATTGTGTGGAAGCGTGGTGTCACGGTGATTGCGTGGAAGCGTGTTGTGATTG;HQ_MAPPINGS=1;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=149;SEQ_ALT_HAPLOTYPE=GTGATTGTGTGGAAGTGTGATGTCACGGTGATTGCGTGGAAGCGTGTTGTGATTGTGTGGAAGCGTGGTATCGTGATTGGAAGTGTGGTGTCACGCTGATTGCATGGAAGTGTGTTGTGATTGTGTGGAAGCGTGATATCGCAGTGATTGTGTGGAAGCGTGGTGTCACGGTGATTGTGTGGAAGCGTGGTGTCACGGTGATTGCGTGGAAGCGTGTTGTGATTGTGTGGAAGCGTGGTATCGTGATCGGAAGCGTGGTGTTGCGGTGATTGCATGGAAGCATGTTGTGATTGTGTGGAAGCATGGTATCGTGATTGTCTGGAAGCATGGTGTCATGGTGATTGGAAGTGTGTCGTGATTG;SVLEN=147;SVTYPE=DUP;TOTAL_MAPPINGS=1 -chr20 61920109 INS_chr20_61920109_61920109_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:61919906-61920109;CTG_NAMES=asm028707:tig00000;END=61920109;SVLEN=531;SVTYPE=INS +chr20 61920109 INS_chr20_61920109_61920109_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:61919906-61920109;CTG_NAMES=asm028707:tig00000;END=61920109;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=947;SVLEN=531;SVTYPE=INS chr20 62452037 INS-DUPLICATION-TANDEM-EXPANSION_chr20_62452038_62452041_CPX_DERIVED A . . ALIGN_LENGTHS=59;CPX_EVENT=CPX_chr20:62452038-62452236;CTG_NAMES=asm028732:tig00001;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr20:62452038-62452041;DUP_SEQ_CIGARS=4M,4M;END=62452037;EXPANSION;HQ_MAPPINGS=1;INSLEN=149;INSSEQ=GGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGAGGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=59;SEQ_ALT_HAPLOTYPE=GGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGAGGGCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGAGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG;SVLEN=153;SVTYPE=INS;TOTAL_MAPPINGS=1 chr20 62452087 DEL_chr20_62452087_62452235_CPX_DERIVED A . . 
ALIGN_LENGTHS=50;CPX_EVENT=CPX_chr20:62452038-62452236;CTG_NAMES=asm028732:tig00001;END=62452235;HOMLEN=9;HOMSEQ=GAAGGAGAG;HQ_MAPPINGS=1;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=50;SVLEN=-148;SVTYPE=DEL;TOTAL_MAPPINGS=1 chr20 63092255 INS_chr20_63092255_63092255_CPX_DERIVED G . . ALIGN_LENGTHS=1091;CPX_EVENT=CPX_chr20:63092255-63094246;CTG_NAMES=asm028762:tig00002;END=63092255;HQ_MAPPINGS=1;INSLEN=71;INSSEQ=GCCCAGGTTCCCGGGGCTGCGTGGGAGACACAGAAGTGGGGGCACCTCCGCAGCACCCACATCCTGCCGAT;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=1091;SEQ_ALT_HAPLOTYPE=GCCCAGGTTCCCGGGGCTGCGTGGGAGACACAGAAGTGGGGGCACCTCCGCAGCACCCACATCCTGCCGAT;SVLEN=71;SVTYPE=INS;TOTAL_MAPPINGS=1 chr20 63093346 DEL_chr20_63093346_63094245_CPX_DERIVED G . . ALIGN_LENGTHS=942;CPX_EVENT=CPX_chr20:63092255-63094246;CTG_NAMES=asm028762:tig00002;END=63094245;HQ_MAPPINGS=1;INSLEN=1;INSSEQ=T;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=942;SEQ_ALT_HAPLOTYPE=T;SVLEN=-899;SVTYPE=DEL;TOTAL_MAPPINGS=1 chr20 63353949 DEL_chr20_63353949_63354001_CPX_DERIVED T . . ALIGN_LENGTHS=80;CPX_EVENT=CPX_chr20:63353949-63354272;CTG_NAMES=asm028777:tig00001;END=63354001;HQ_MAPPINGS=1;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=80;SVLEN=-52;SVTYPE=DEL;TOTAL_MAPPINGS=1 -chr20 63354205 DEL_chr20_63354205_63354271_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:63353949-63354272;CTG_NAMES=asm028777:tig00001;END=63354271;SVLEN=-66;SVTYPE=DEL -chr21 21264943 INS_chr21_21264943_21264943_CPX_DERIVED G . . CPX_EVENT=CPX_chr21:21264944-21265096;CTG_NAMES=asm029034:tig00000,asm029034:tig00001;END=21264943;SVLEN=221;SVTYPE=INS +chr20 63354205 DEL_chr20_63354205_63354271_CPX_DERIVED G . . CPX_EVENT=CPX_chr20:63353949-63354272;CTG_NAMES=asm028777:tig00001;END=63354271;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=81;SVLEN=-66;SVTYPE=DEL +chr21 21264943 INS_chr21_21264943_21264943_CPX_DERIVED G . . 
CPX_EVENT=CPX_chr21:21264944-21265096;CTG_NAMES=asm029034:tig00000,asm029034:tig00001;END=21264943;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=316;SVLEN=221;SVTYPE=INS chr21 21264961 INS-DUPLICATION-TANDEM-EXPANSION_chr21_21264962_21265070_CPX_DERIVED G . . ALIGN_LENGTHS=110,110;CPX_EVENT=CPX_chr21:21264944-21265096,CPX_chr21:21264944-21265096;CTG_NAMES=asm029034:tig00000,asm029034:tig00001;DUP_ANNOTATIONS_IMPRECISE;DUP_IMPRECISE_AFFECTED_RANGE=chr21:21264962-21265096;DUP_NUM=1,2;DUP_ORIENTATIONS=++;DUP_REPEAT_UNIT_REF_SPAN=chr21:21264962-21265070;END=21264961;EXPANSION;HOMLEN=26;HOMSEQ=TATATATACACATATATATTATATAT;HQ_MAPPINGS=2;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=110;SEQ_ALT_HAPLOTYPE=TATATATACACATATATATTATATATGTGTATGTGTATATATACACATATATATTATATATGTGTATGTGTATATATACACATATATATTATATATGTGTATGTGTATATATATACACATATATATTATATATGTGTATGTGTATATATATACACATATATATTATATATGTGTATATGTATATATACACATATATATTATATATATATGTGTCTGTATATATATACACATATATATTATATAT;SVLEN=109;SVTYPE=DUP;TOTAL_MAPPINGS=2 -chr21 23428920 INS_chr21_23428920_23428920_CPX_DERIVED T . . CPX_EVENT=CPX_chr21:23428920-23429023;CTG_NAMES=asm029052:tig00000,asm029052:tig00001;END=23428920;SVLEN=85;SVTYPE=INS +chr21 23428920 INS_chr21_23428920_23428920_CPX_DERIVED T . . CPX_EVENT=CPX_chr21:23428920-23429023;CTG_NAMES=asm029052:tig00000,asm029052:tig00001;END=23428920;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=1182;SVLEN=85;SVTYPE=INS chr21 23428920 INS_chr21_23428920_23428967_CPX_DERIVED TTTATATAAATATATATAAATATATAATATATAATAATATAATATAAT . . 
ALIGN_LENGTHS=56,56;CPX_EVENT=CPX_chr21:23428920-23429023,CPX_chr21:23428920-23429023;CTG_NAMES=asm029052:tig00000,asm029052:tig00001;END=23428967;HQ_MAPPINGS=2;INSLEN=85;INSSEQ=ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=56;SEQ_ALT_HAPLOTYPE=ATAAATATATATAATAATATATAATTATATATTATATTATATAATATAATAATATATATTATAATACATTATATAATATATTATA;SVLEN=85;SVTYPE=INS;TOTAL_MAPPINGS=2 -chr21 26001843 INV_chr21_26001843_26002384_CPX_DERIVED T . . CPX_EVENT=CPX_chr21:26001843-26002386;CTG_NAMES=asm029075:tig00000;END=26002384;SVLEN=0;SVTYPE=INV +chr21 26001843 INV_chr21_26001843_26002384_CPX_DERIVED T . . CPX_EVENT=CPX_chr21:26001843-26002386;CTG_NAMES=asm029075:tig00000;END=26002384;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=1370;SVLEN=0;SVTYPE=INV chr21 46069065 INS_chr21_46069065_46069065_CPX_DERIVED C . . ALIGN_LENGTHS=91,91;CPX_EVENT=CPX_chr21:46069065-46069209;CTG_NAMES=asm029362:tig00001,asm029362:tig00002;END=46069065;HQ_MAPPINGS=2;INSLEN=60;INSSEQ=CTAGGTGTGTGCATGTGTGCACACGTGTGTGCATGTGTGTGCATGTGTGCACACGTGTGT;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=91;SEQ_ALT_HAPLOTYPE=CTAGGTGTGTGCATGTGTGCACACGTGTGTGCATGTGTGTGCATGTGTGCACACGTGTGT;SVLEN=60;SVTYPE=INS;TOTAL_MAPPINGS=2 chr21 46069156 DEL_chr21_46069156_46069208_CPX_DERIVED A . . 
ALIGN_LENGTHS=91,91;CPX_EVENT=CPX_chr21:46069065-46069209;CTG_NAMES=asm029362:tig00001,asm029362:tig00002;END=46069208;HQ_MAPPINGS=2;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=91;SVLEN=-52;SVTYPE=DEL;TOTAL_MAPPINGS=2 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_DEL.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_DEL.vcf index eb0dbc4dca1..8d5033f7ed6 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_DEL.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_DEL.vcf @@ -1,13 +1,16 @@ ##fileformat=VCFv4.2 +##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= @@ -33,12 +36,17 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= -##contig= -##contig= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##contig= +##contig= #CHROM POS ID REF ALT QUAL FILTER INFO 21 43350116 DEL_21_43350116_43353485 G . . 
ALIGN_LENGTHS=229,417;CTG_NAMES=asm000000:tig00006,asm000001:tig00001;END=43353485;EXTERNAL_CNV_CALLS=CNV_21_43350200_43353400:1:80;HOMLEN=6;HOMSEQ=GAGGAA;HQ_MAPPINGS=2;MAPPING_QUALITIES=60,60;MAX_ALIGN_LENGTH=417;READ_PAIR_SUPPORT=23;SPLIT_READ_SUPPORT=8;SVLEN=-3369;SVTYPE=DEL;TOTAL_MAPPINGS=2 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_INV.vcf b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_INV.vcf index 8bfa057d387..0ffeebd9bf2 100644 --- a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_INV.vcf +++ b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/integration/outputs/hg19_INV.vcf @@ -1,24 +1,27 @@ ##fileformat=VCFv4.2 +##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##ALT= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= +##INFO= ##INFO= ##INFO= -##INFO= ##INFO= -##INFO= +##INFO= +##INFO= ##INFO= ##INFO= ##INFO= ##INFO= -##INFO= +##INFO= ##INFO= ##INFO= ##INFO= @@ -33,13 +36,18 @@ ##INFO= ##INFO= ##INFO= +##INFO= ##INFO= ##INFO= ##INFO= ##INFO= ##INFO= -##contig= -##contig= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##contig= +##contig= #CHROM POS ID REF ALT QUAL FILTER INFO 21 27374151 INV55_21_27374151_27374706 A . . ALIGN_LENGTHS=541;CTG_NAMES=asm000001:tig00009;END=27374706;HOMSEQ=GGATCCA;HOMLEN=7;HQ_MAPPINGS=1;INV55;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=541;SVLEN=555;SVTYPE=INV;TOTAL_MAPPINGS=1 21 27374158 INV33_21_27374158_27374700 A . . 
ALIGN_LENGTHS=518;CTG_NAMES=asm000001:tig00009;END=27374700;HQ_MAPPINGS=1;INV33;MAPPING_QUALITIES=60;MAX_ALIGN_LENGTH=518;SVLEN=542;SVTYPE=INV;TOTAL_MAPPINGS=1 diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz index 193d99ef72a..522f69c0662 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz differ diff --git a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz.tbi b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz.tbi index 63a15b18e67..3fbf99048f9 100644 Binary files a/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz.tbi and b/src/test/resources/org/broadinstitute/hellbender/tools/spark/sv/utils/SVContext.vcf.gz.tbi differ