Skip to content
This repository has been archived by the owner on Jan 24, 2024. It is now read-only.

Commit

Permalink
Merge multiple input VCFs in annotate-svs with clustering (#75)
Browse files Browse the repository at this point in the history
Closes: #75
Related-Issue: #75
Projected-Results-Impact: none
  • Loading branch information
holtgrewe committed Sep 19, 2022
1 parent 6c3008b commit e280b39
Show file tree
Hide file tree
Showing 11 changed files with 151 additions and 23 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

- Writing out proper SV type for Dragen CNV (#76)
- Adding support for depth of coverage annotation (#73)
- Merge multiple input VCFs in annotate-svs with clustering (#75)

## v0.26

Expand Down
2 changes: 2 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
<h2.version>1.4.197</h2.version>
<htsjdk.version>2.24.1</htsjdk.version>
<jannovar.version>0.41</jannovar.version>
<intervaltree.version>1.0.0</intervaltree.version>
<externalsortinginjava.version>0.6.1</externalsortinginjava.version>
</properties>

<modules>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,16 @@ public final class AnnotateSvsArgs {
"Annotate CNV with coverage and mapping quality from maelstrom-core coverage VCF file")
private List<String> coverageVcfs = new ArrayList<>();

/** Reciprocal overlap required for merging; set via {@code --merge-overlap}, defaults to 0.75. */
@Parameter(
names = "--merge-overlap",
description = "Reciprocal overlap to require for merging (default: 0.75)")
private double mergeOverlap = 0.75;

/** Radius within which BNDs are merged; set via {@code --merge-bnd-radius}, defaults to 50. */
@Parameter(
names = "--merge-bnd-radius",
description = "Merge BNDs within the given radius (default: 50)")
private int mergeBndRadius = 50;

/** @return the current value of the {@code refseqSerPath} field */
public String getRefseqSerPath() {
  return this.refseqSerPath;
}
Expand Down Expand Up @@ -198,6 +208,14 @@ public List<String> getCoverageVcfs() {
return coverageVcfs;
}

/** @return the reciprocal overlap to require for merging ({@code --merge-overlap}) */
public double getMergeOverlap() {
  return this.mergeOverlap;
}

/** @return the radius within which BNDs are merged ({@code --merge-bnd-radius}) */
public int getMergeBndRadius() {
  return this.mergeBndRadius;
}

@Override
public String toString() {
return "AnnotateSvsArgs{"
Expand Down Expand Up @@ -259,6 +277,10 @@ public String toString() {
+ '\''
+ ", coverageVcfs="
+ coverageVcfs
+ ", mergeOverlap="
+ mergeOverlap
+ ", mergeBndRadius="
+ mergeBndRadius
+ '}';
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
import com.github.bihealth.varfish_annotator.data.GenomeVersion;
import com.github.bihealth.varfish_annotator.db.DbInfoWriterHelper;
import com.github.bihealth.varfish_annotator.utils.*;
import com.google.code.externalsorting.csv.CSVRecordBuffer;
import com.google.code.externalsorting.csv.CsvExternalSort;
import com.google.code.externalsorting.csv.CsvSortOptions;
import com.google.common.base.Joiner;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
Expand Down Expand Up @@ -34,12 +37,17 @@
import htsjdk.variant.variantcontext.VariantContextBuilder;
import htsjdk.variant.vcf.VCFFileReader;
import java.io.*;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.sql.*;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

/** Implementation of the <tt>annotate-svs</tt> command. */
public final class AnnotateSvsVcf {
Expand Down Expand Up @@ -114,6 +122,15 @@ public void run() {
}
}

Path tmpDir = null;
try {
tmpDir = Files.createTempDirectory("varfish-annotator");
} catch (IOException e) {
System.err.println("Could not create temporary directory");
System.exit(1);
}
final Path tmpGtsPath = Paths.get(tmpDir.toString(), "tmp.gts.tsv");

try (Connection conn =
DriverManager.getConnection(
"jdbc:h2:"
Expand All @@ -123,11 +140,12 @@ public void run() {
"sa",
"");
VCFFileReader reader = new VCFFileReader(new File(args.getInputVcf()));
OutputStream gtsStream = Files.newOutputStream(Paths.get(args.getOutputGts()));
OutputStream featureEffectsStream =
Files.newOutputStream(Paths.get(args.getOutputFeatureEffects()));
OutputStream dbInfoStream = Files.newOutputStream(Paths.get(args.getOutputDbInfos()));
Writer gtWriter = GzipUtil.maybeOpenGzipOutputStream(gtsStream, args.getOutputGts());
OutputStream tmpGtsStream = Files.newOutputStream(tmpGtsPath);
Writer tmpGtsWriter =
GzipUtil.maybeOpenGzipOutputStream(tmpGtsStream, tmpGtsPath.toString());
Writer featureEffectsWriter =
GzipUtil.maybeOpenGzipOutputStream(
featureEffectsStream, args.getOutputFeatureEffects());
Expand Down Expand Up @@ -161,8 +179,13 @@ public void run() {
refseqJvData,
ensemblJvData,
callerSupport,
gtWriter,
tmpGtsWriter,
featureEffectsWriter);

// Finalize genotypes and write out sorted
tmpGtsWriter.close();
writeSortedGts(tmpGtsPath);

new DbInfoWriterHelper()
.writeDbInfos(conn, dbInfoBufWriter, args.getRelease(), AnnotateVcf.class);
} catch (SQLException e) {
Expand Down Expand Up @@ -192,6 +215,52 @@ public void run() {
}
}

/**
 * Sort the temporary genotypes TSV file and write the result to the final genotypes output.
 *
 * <p>The file at {@code tmpGtsPath} is sorted in batches via {@code CsvExternalSort.sortInBatch}
 * using a {@code VarFishGtsTsvComparator}; the resulting batch files are then merged into the
 * file named by {@code args.getOutputGts()}, opened through {@code
 * GzipUtil.maybeOpenGzipOutputStream}. The header records collected during batch sorting are
 * handed to the merge step.
 *
 * <p>All batch file input streams opened here are closed when the method returns, whether the
 * merge succeeded or not.
 *
 * @param tmpGtsPath path to the unsorted, tab-separated temporary genotypes file
 * @throws IOException if reading the temporary/batch files or writing the output fails
 * @throws RuntimeException if the external sort library raises a {@code ClassNotFoundException}
 */
private void writeSortedGts(Path tmpGtsPath) throws IOException {
  // Configuration for sorting
  final boolean hasChrom2Columns =
      !args.getOptOutFeatures().contains(GtRecordBuilder.FEATURE_CHROM2_COLUMNS);
  final CsvSortOptions sortOptions =
      new CsvSortOptions.Builder(
              new VarFishGtsTsvComparator(hasChrom2Columns),
              CsvExternalSort.DEFAULTMAXTEMPFILES,
              CsvExternalSort.estimateAvailableMemory())
          .charset(Charset.defaultCharset())
          .distinct(false)
          .numHeader(1)
          .skipHeader(false)
          .format(
              CSVFormat.DEFAULT
                  .builder()
                  .setDelimiter('\t')
                  .setIgnoreSurroundingSpaces(true)
                  .setQuote((Character) null)
                  .build())
          .build();

  // Sort genotypes file and write final file to output.
  final ArrayList<CSVRecord> gtHeader = new ArrayList<>();
  final List<File> gtSortInBatch =
      CsvExternalSort.sortInBatch(tmpGtsPath.toFile(), null, sortOptions, gtHeader);

  // Every batch input stream is registered here right after opening so that it can be released
  // even if a later open, buffer construction, or the merge itself fails.
  final List<Closeable> openedInputs = new ArrayList<>();
  try (OutputStream gtsStream = Files.newOutputStream(Paths.get(args.getOutputGts()));
      Writer gtsWriter = GzipUtil.maybeOpenGzipOutputStream(gtsStream, args.getOutputGts());
      BufferedWriter bufWriter = new BufferedWriter(gtsWriter)) {
    final List<CSVRecordBuffer> bfbs = new ArrayList<>();
    for (File f : gtSortInBatch) {
      final InputStream in = new FileInputStream(f);
      openedInputs.add(in);
      final BufferedReader fbr =
          new BufferedReader(new InputStreamReader(in, sortOptions.getCharset()));
      final CSVParser parser = new CSVParser(fbr, sortOptions.getFormat());
      bfbs.add(new CSVRecordBuffer(parser));
    }

    CsvExternalSort.mergeSortedFiles(bufWriter, sortOptions, bfbs, gtHeader);
  } catch (ClassNotFoundException e) {
    throw new RuntimeException("Problem with external file sort", e);
  } finally {
    for (Closeable input : openedInputs) {
      try {
        // Closing is idempotent, so streams already closed by the merge step are unaffected.
        input.close();
      } catch (IOException ignored) {
        // Best-effort cleanup of read-only inputs; must not mask the primary outcome.
      }
    }
  }
}

private void checkWriteOutBndMates() {
if (!ImmutableList.of("true", "false", "auto").contains(args.getWriteBndMates())) {
System.err.println("Unsupported feature in --opt-out: " + args.getWriteBndMates());
Expand Down
Loading

0 comments on commit e280b39

Please sign in to comment.