Skip to content

Commit

Permalink
(SV) test and utils clean up (#5116)
Browse files Browse the repository at this point in the history
* clean up on util classes in SV packages
* get default headers into output VCF
* remove phantom sga-related test files
* move large test data files to the new sub-dir: src/test/resources/large/sv/
* integration test for new interpretation tool SvDiscoverFromLocalAssemblyContigAlignmentsSpark
  • Loading branch information
SHuang-Broad authored Aug 27, 2018
1 parent 3f1d75d commit e451a28
Show file tree
Hide file tree
Showing 57 changed files with 455 additions and 55,137 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -47,6 +48,7 @@
import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

Expand Down Expand Up @@ -188,8 +190,9 @@ protected void runTool( final JavaSparkContext ctx ) {

final String outputPath = svDiscoveryInputMetaData.getOutputPath();
final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines();
final Logger toolLogger = svDiscoveryInputMetaData.getToolLogger();
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, toolLogger);
SVVCFWriter.writeVCF(annotatedVariants, outputPath + "inv_del_ins.vcf", refSeqDictionary, defaultToolVCFHeaderLines, toolLogger);

// TODO: 1/14/18 this is the next version of precise variant calling
if ( expInterpret != null ) {
Expand Down Expand Up @@ -231,7 +234,7 @@ private SvDiscoveryInputMetaData getSvDiscoveryInputData(final JavaSparkContext
outputPrefixWithSampleName,
assembledEvidenceResults.getReadMetadata(), assembledEvidenceResults.getAssembledIntervals(),
makeEvidenceLinkTree(assembledEvidenceResults.getEvidenceTargetLinks()),
cnvCallsBroadcast, getHeaderForReads(), getReference(), localLogger);
cnvCallsBroadcast, getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
}

public static Broadcast<SVIntervalTree<VariantContext>> broadcastCNVCalls(final JavaSparkContext ctx,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ protected void runTool(final JavaSparkContext ctx) {
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, null, vcfOutputPath,
null, null, null,
cnvCallsBroadcast,
getHeaderForReads(), getReference(), localLogger);
getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);

final JavaRDD<AlignedContig> parsedContigAlignments =
new SvDiscoverFromLocalAssemblyContigAlignmentsSpark
Expand All @@ -146,7 +146,7 @@ protected void runTool(final JavaSparkContext ctx) {
.discoverVariantsFromChimeras(svDiscoveryInputMetaData, parsedContigAlignments);

final SAMSequenceDictionary refSeqDictionary = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, localLogger);
SVVCFWriter.writeVCF(annotatedVariants, vcfOutputPath, refSeqDictionary, getDefaultToolVCFHeaderLines(), localLogger);
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
Expand Down Expand Up @@ -125,11 +126,11 @@ public final class SvDiscoverFromLocalAssemblyContigAlignmentsSpark extends GATK
fullName = "write-sam", optional = true)
private boolean writeSAMFiles;

static final String SIMPLE_CHIMERA_VCF_FILE_NAME = "NonComplex.vcf";
static final String COMPLEX_CHIMERA_VCF_FILE_NAME = "Complex.vcf";
static final String REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_1_seg.vcf";
static final String REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_multi_seg.vcf";
static final String MERGED_VCF_FILE_NAME = "merged_simple.vcf";
public static final String SIMPLE_CHIMERA_VCF_FILE_NAME = "NonComplex.vcf";
public static final String COMPLEX_CHIMERA_VCF_FILE_NAME = "Complex.vcf";
public static final String REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_1_seg.vcf";
public static final String REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME = "cpx_reinterpreted_simple_multi_seg.vcf";
public static final String MERGED_VCF_FILE_NAME = "merged_simple.vcf";

@Override
public boolean requiresReference() {
Expand Down Expand Up @@ -159,7 +160,7 @@ protected void runTool(final JavaSparkContext ctx) {
new SvDiscoveryInputMetaData(ctx, discoverStageArgs, nonCanonicalChromosomeNamesFile, outputPrefixWithSampleName,
null, null, null,
cnvCallsBroadcast,
getHeaderForReads(), getReference(), localLogger);
getHeaderForReads(), getReference(), getDefaultToolVCFHeaderLines(), localLogger);
final JavaRDD<GATKRead> assemblyRawAlignments = getReads();

final AssemblyContigsClassifiedByAlignmentSignatures contigsByPossibleRawTypes =
Expand Down Expand Up @@ -328,7 +329,7 @@ private static List<VariantContext> extractSimpleVariants(final JavaRDD<Assembly
final Logger logger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? svDiscoveryInputMetaData.getToolLogger() : null;
SVVCFWriter.writeVCF(simpleVariants, outputPrefixWithSampleName + SIMPLE_CHIMERA_VCF_FILE_NAME,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
logger);
svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines(), logger);
return simpleVariants;
}

Expand All @@ -348,20 +349,21 @@ private static CpxAndReInterpretedSimpleVariants extractCpxVariants(final JavaSp
final JavaRDD<GATKRead> assemblyRawAlignments,
final String outputPrefixWithSampleName) {
final Logger toolLogger = svDiscoveryInputMetaData.getDiscoverStageArgs().runInDebugMode ? svDiscoveryInputMetaData.getToolLogger() : null;
final Set<VCFHeaderLine> defaultToolVCFHeaderLines = svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines();
final List<VariantContext> complexVariants =
CpxVariantInterpreter.makeInterpretation(contigsWithCpxAln, svDiscoveryInputMetaData);
SVVCFWriter.writeVCF(complexVariants, outputPrefixWithSampleName + COMPLEX_CHIMERA_VCF_FILE_NAME,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
toolLogger);
defaultToolVCFHeaderLines, toolLogger);

final JavaRDD<VariantContext> complexVariantsRDD = ctx.parallelize(complexVariants);
final SegmentedCpxVariantSimpleVariantExtractor.ExtractedSimpleVariants reInterpretedSimple =
SegmentedCpxVariantSimpleVariantExtractor.extract(complexVariantsRDD, svDiscoveryInputMetaData, assemblyRawAlignments);
final SAMSequenceDictionary refSeqDict = svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue();
final String derivedOneSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_1_SEG_CALL_VCF_FILE_NAME;
final String derivedMultiSegmentSimpleVCF = outputPrefixWithSampleName + REINTERPRETED_MULTI_SEG_CALL_VCF_FILE_NAME;
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretZeroOrOneSegmentCalls(), derivedOneSegmentSimpleVCF, refSeqDict, defaultToolVCFHeaderLines, toolLogger);
SVVCFWriter.writeVCF(reInterpretedSimple.getReInterpretMultiSegmentsCalls(), derivedMultiSegmentSimpleVCF, refSeqDict, defaultToolVCFHeaderLines, toolLogger);

return new CpxAndReInterpretedSimpleVariants(complexVariants, reInterpretedSimple.getMergedReinterpretedCalls());
}
Expand Down Expand Up @@ -399,6 +401,7 @@ public static void filterAndWriteMergedVCF(final String outputPrefixWithSampleNa
final String out = outputPrefixWithSampleName + MERGED_VCF_FILE_NAME;
SVVCFWriter.writeVCF(variantsWithFilterApplied, out,
svDiscoveryInputMetaData.getReferenceData().getReferenceSequenceDictionaryBroadcast().getValue(),
svDiscoveryInputMetaData.getDefaultToolVCFHeaderLines(),
svDiscoveryInputMetaData.getToolLogger());
}

Expand Down Expand Up @@ -454,7 +457,7 @@ public boolean test(final VariantContext variantContext) {
if ( !variantContext.hasAttribute(GATKSVVCFConstants.CONTIG_NAMES) )
return true;

final List<String> mapQuals = SvDiscoveryUtils.getAttributeAsStringList(variantContext, attributeKey);
final List<String> mapQuals = SVUtils.getAttributeAsStringList(variantContext, attributeKey);
int maxMQ = 0;
for (final String mapQual : mapQuals) {
Integer integer = Integer.valueOf(mapQual);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFHeaderLine;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.broadcast.Broadcast;
Expand Down Expand Up @@ -42,6 +43,10 @@ public String getOutputPath() {
return outputPath;
}

public Set<VCFHeaderLine> getDefaultToolVCFHeaderLines() {
return defaultToolVCFHeaderLines;
}

public static final class ReferenceData {
private final Broadcast<Set<String>> canonicalChromosomesBroadcast;
private final Broadcast<ReferenceMultiSource> referenceBroadcast;
Expand Down Expand Up @@ -121,6 +126,8 @@ public List<SVInterval> getAssembledIntervals() {

private final DiscoverVariantsFromContigAlignmentsSparkArgumentCollection discoverStageArgs;

private final Set<VCFHeaderLine> defaultToolVCFHeaderLines;

private final Logger toolLogger;

private String outputPath;
Expand All @@ -135,17 +142,19 @@ public SvDiscoveryInputMetaData(final JavaSparkContext ctx,
final Broadcast<SVIntervalTree<VariantContext>> cnvCallsBroadcast,
final SAMFileHeader headerForReads,
final ReferenceMultiSource reference,
final Set<VCFHeaderLine> defaultToolVCFHeaderLines,
final Logger toolLogger) {

final SAMSequenceDictionary sequenceDictionary = headerForReads.getSequenceDictionary();
final Broadcast<Set<String>> canonicalChromosomesBroadcast =
ctx.broadcast(SvDiscoveryUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
ctx.broadcast(SVUtils.getCanonicalChromosomes(nonCanonicalChromosomeNamesFile, sequenceDictionary));
final String sampleId = SVUtils.getSampleId(headerForReads);

this.referenceData = new ReferenceData(canonicalChromosomesBroadcast, ctx.broadcast(reference), ctx.broadcast(sequenceDictionary));
this.sampleSpecificData = new SampleSpecificData(sampleId, cnvCallsBroadcast, assembledIntervals, evidenceTargetLinks, readMetadata, ctx.broadcast(headerForReads));
this.discoverStageArgs = discoverStageArgs;
this.outputPath = outputPath;
this.defaultToolVCFHeaderLines = defaultToolVCFHeaderLines;
this.toolLogger = toolLogger;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,33 +1,18 @@
package org.broadinstitute.hellbender.tools.spark.sv.discovery;

import htsjdk.samtools.*;
import htsjdk.variant.variantcontext.VariantContext;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.samtools.SAMSequenceDictionary;
import org.apache.logging.log4j.Logger;
import org.apache.spark.api.java.JavaRDD;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.spark.sv.StructuralVariationDiscoveryArgumentCollection.DiscoverVariantsFromContigAlignmentsSparkArgumentCollection;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.BreakpointComplications;
import org.broadinstitute.hellbender.tools.spark.sv.discovery.inference.NovelAdjacencyAndAltHaplotype;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVFileUtils;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVInterval;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVIntervalTree;
import org.broadinstitute.hellbender.tools.spark.sv.utils.SVVCFReader;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.read.GATKRead;

import javax.annotation.Nonnull;
import java.io.IOException;
import java.nio.file.Files;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SvDiscoveryUtils {

import java.util.List;

public final class SvDiscoveryUtils {

public static void evaluateIntervalsAndNarls(final List<SVInterval> assembledIntervals,
final List<NovelAdjacencyAndAltHaplotype> narls,
Expand Down Expand Up @@ -99,82 +84,4 @@ private static int getAmbiguity(final BreakpointComplications complications) {
return complications.getHomologyForwardStrandRep().length();
}

//==================================================================================================================

public static Set<String> getCanonicalChromosomes(final String nonCanonicalContigNamesFile,
@Nonnull final SAMSequenceDictionary dictionary) {
final LinkedHashSet<String> allContigs = Utils.nonNull(dictionary).getSequences().stream().map(SAMSequenceRecord::getSequenceName)
.collect(Collectors.toCollection(LinkedHashSet::new));
if (nonCanonicalContigNamesFile == null)
return allContigs;

try (final Stream<String> nonCanonical = Files.lines(IOUtils.getPath(( Utils.nonNull(nonCanonicalContigNamesFile) )))) {
nonCanonical.forEach(allContigs::remove);
return allContigs;
} catch ( final IOException ioe ) {
throw new UserException("Can't read nonCanonicalContigNamesFile file "+nonCanonicalContigNamesFile, ioe);
}
}

//==================================================================================================================

/**
* write SAM file for provided {@code filteredContigs}
* by extracting original alignments from {@code originalAlignments},
* to directory specified by {@code outputDir}.
*/
public static void writeSAMRecords(final JavaRDD<GATKRead> originalAlignments, final Set<String> readNameToInclude,
final String outputPath, final SAMFileHeader header) {
final List<GATKRead> reads = originalAlignments.collect();
writeSAMRecords(reads, readNameToInclude, outputPath, header);
}

public static void writeSAMRecords(final List<GATKRead> reads, final Set<String> readNameToInclude,
final String outputPath, final SAMFileHeader header) {
final SAMFileHeader cloneHeader = header.clone();
final SAMRecordComparator localComparator;
if (outputPath.toLowerCase().endsWith("bam")) {
cloneHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
localComparator = new SAMRecordCoordinateComparator();
} else if (outputPath.toLowerCase().endsWith("sam")) {
cloneHeader.setSortOrder(SAMFileHeader.SortOrder.queryname);
localComparator = new SAMRecordQueryNameComparator();
} else {
throw new IllegalArgumentException("Unsupported output format " + outputPath);
}

final List<SAMRecord> samRecords = new ArrayList<>();
reads.forEach(gatkRead -> {
if ( readNameToInclude.contains(gatkRead.getName()) ) {
samRecords.add(gatkRead.convertToSAMRecord(cloneHeader));
}
});

samRecords.sort(localComparator);
SVFileUtils.writeSAMFile( outputPath, samRecords.iterator(), cloneHeader, true);
}

/**
* todo: this should be fixed in a new major version of htsjdk
* this exist because for whatever reason,
* VC.getAttributeAsStringList() sometimes returns a giant single string, while using
* VC.getAttributeAsString() gives back an array.....
*/
public static List<String> getAttributeAsStringList(final VariantContext vc, final String attributeKey) {
if (vc.getAttribute(attributeKey) == null) return Collections.emptyList();
return vc.getAttributeAsStringList(attributeKey, "").stream()
.flatMap(s -> {
if ( s.contains(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR) ) {
final String[] split = s.split(VCFConstants.INFO_FIELD_ARRAY_SEPARATOR);
return Arrays.stream(split);
} else {
return Stream.of(s);
}
})
.collect(Collectors.toList());
}

public static SimpleInterval makeOneBpInterval(final String chr, final int pos) {
return new SimpleInterval(chr, pos, pos);
}
}
Loading

0 comments on commit e451a28

Please sign in to comment.