diff --git a/CHANGES.md b/CHANGES.md index 8f1a7c8..9ddcbe7 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,6 +2,7 @@ ## v0.26-SNAPSHOT +- Explicitely model support for SV callers (#68) - Removing explicit support for SV2 (#67) - Adding end-to-end tests `hg19-chr22` (#61) diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java index c7945b5..efd81fe 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/AnnotateVcf.java @@ -1,8 +1,12 @@ package com.github.bihealth.varfish_annotator.annotate; -import com.github.bihealth.varfish_annotator.DbInfo; import com.github.bihealth.varfish_annotator.VarfishAnnotatorException; -import com.github.bihealth.varfish_annotator.init_db.DbReleaseUpdater; +import com.github.bihealth.varfish_annotator.checks.IncompatibleVcfException; +import com.github.bihealth.varfish_annotator.checks.VcfCompatibilityChecker; +import com.github.bihealth.varfish_annotator.data.GenomeVersion; +import com.github.bihealth.varfish_annotator.data.VcfConstants; +import com.github.bihealth.varfish_annotator.db.DbInfo; +import com.github.bihealth.varfish_annotator.db.DbInfoWriterHelper; import com.github.bihealth.varfish_annotator.utils.*; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; @@ -52,71 +56,6 @@ /** Implementation of the annotate command. */ public final class AnnotateVcf { - /** Name of table with ExAC variants. */ - public static final String EXAC_PREFIX = "exac"; - - /** Name of table with gnomAD exomes variants. */ - public static final String GNOMAD_EXOMES_PREFIX = "gnomad_exome"; - - /** Name of table with gnomAD genomes variants. */ - public static final String GNOMAD_GENOMES_PREFIX = "gnomad_genome"; - - /** Name of table with Thousand Genomes variants. */ - public static final String THOUSAND_GENOMES_PREFIX = "thousand_genomes"; - - /** Header fields for the genotypes file. */ - public static final ImmutableList HEADERS_GT = - ImmutableList.of( - "release", - "chromosome", - "chromosome_no", - "start", - "end", - "bin", - "reference", - "alternative", - "var_type", - "case_id", - "set_id", - "info", - "genotype", - "num_hom_alt", - "num_hom_ref", - "num_het", - "num_hemi_alt", - "num_hemi_ref", - "in_clinvar", - "exac_frequency", - "exac_homozygous", - "exac_heterozygous", - "exac_hemizygous", - "thousand_genomes_frequency", - "thousand_genomes_homozygous", - "thousand_genomes_heterozygous", - "thousand_genomes_hemizygous", - "gnomad_exomes_frequency", - "gnomad_exomes_homozygous", - "gnomad_exomes_heterozygous", - "gnomad_exomes_hemizygous", - "gnomad_genomes_frequency", - "gnomad_genomes_homozygous", - "gnomad_genomes_heterozygous", - "gnomad_genomes_hemizygous", - "refseq_gene_id", - "refseq_transcript_id", - "refseq_transcript_coding", - "refseq_hgvs_c", - "refseq_hgvs_p", - "refseq_effect", - "refseq_exon_dist", - "ensembl_gene_id", - "ensembl_transcript_id", - "ensembl_transcript_coding", - "ensembl_hgvs_c", - "ensembl_hgvs_p", - "ensembl_effect", - "ensembl_exon_dist"); - /** Configuration for the command. */ private final AnnotateArgs args; @@ -178,7 +117,8 @@ public void run() { JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load(); final VariantNormalizer normalizer = new VariantNormalizer(args.getRefPath()); annotateVcf(conn, reader, refseqJvData, ensemblJvData, normalizer, gtWriter); - writeDbInfos(conn, dbInfoBufWriter); + new DbInfoWriterHelper() + .writeDbInfos(conn, dbInfoBufWriter, args.getRelease(), AnnotateVcf.class); } catch (SQLException e) { System.err.println("Problem with database connection"); e.printStackTrace(); @@ -206,48 +146,6 @@ public void run() { } } - /** - * Write information about used databases to TSV file. - * - * @param conn Database connection to get the information from. - * @param dbInfoWriter Writer for database information. - * @throws VarfishAnnotatorException in case of problems - */ - private void writeDbInfos(Connection conn, BufferedWriter dbInfoWriter) - throws VarfishAnnotatorException { - try { - dbInfoWriter.write("genomebuild\tdb_name\trelease\n"); - } catch (IOException e) { - throw new VarfishAnnotatorException("Could not write out headers", e); - } - - final String query = - "SELECT db_name, release FROM " + DbReleaseUpdater.TABLE_NAME + " ORDER BY db_name"; - try { - final PreparedStatement stmt = conn.prepareStatement(query); - - try (ResultSet rs = stmt.executeQuery()) { - while (true) { - if (!rs.next()) { - return; - } - final String versionString; - if (rs.getString(1).equals("varfish-annotator")) { - versionString = AnnotateVcf.class.getPackage().getSpecificationVersion(); - } else { - versionString = rs.getString(2); - } - dbInfoWriter.write( - args.getRelease() + "\t" + rs.getString(1) + "\t" + versionString + "\n"); - } - } - } catch (SQLException e) { - throw new VarfishAnnotatorException("Problem with querying database", e); - } catch (IOException e) { - throw new VarfishAnnotatorException("Could not write TSV info", e); - } - } - /** * Perform the variant annotation. * @@ -267,13 +165,12 @@ private void annotateVcf( VariantNormalizer normalizer, Writer gtWriter) throws VarfishAnnotatorException { - // Guess genome version. - GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion(); + final GenomeVersion genomeVersion = new VcfCompatibilityChecker(reader).guessGenomeVersion(); // Write out header. try { - gtWriter.append(Joiner.on("\t").join(HEADERS_GT) + "\n"); + gtWriter.append(Joiner.on("\t").join(VcfConstants.HEADERS_GT) + "\n"); } catch (IOException e) { throw new VarfishAnnotatorException("Could not write out headers", e); } @@ -365,20 +262,11 @@ private void annotateVariantContext( normalizer.normalizeInsertion( new VariantDescription( contigName, ctx.getStart() - 1, ctx.getReference().getBaseString(), baseString)); - - final String varType; - if ((normalizedVar.getRef().length() == 1) && (normalizedVar.getAlt().length() == 1)) { - varType = "snv"; - } else if (normalizedVar.getRef().length() == normalizedVar.getAlt().length()) { - varType = "mnv"; - } else { - varType = "indel"; - } + // Get variant type string. + final String varType = getVarType(normalizedVar); // Get annotations sorted descendingly by variant effect. - final List sortedRefseqAnnos = sortAnnos(refseqAnnotationsList, i); - final List sortedEnsemblAnnos = sortAnnos(ensemblAnnotationsList, i); - + // // Collect RefSeq and ENSEMBL annotations per gene. Jannovar provides gene ID mappings // *to* both RefSeq and ENSEMBL *from* both RefSeq and ENSEMBL. We use an (arbitrarily) // fixed lookup order as any should give sensible results except for genes whose annotation @@ -389,65 +277,41 @@ private void annotateVariantContext( // so many more transcripts/genes such that we would expect them to never match. // RefSeq first - + final List sortedRefseqAnnos = + AnnoSorting.sortAnnotations(refseqAnnotationsList, i); final HashMap refSeqAnnoByRefSeqGene = new HashMap<>(); final HashMap refSeqAnnoByEnsemblGene = new HashMap<>(); - - for (Annotation annotation : sortedRefseqAnnos) { - if (annotation.getEffects().isEmpty() - || annotation.getEffects().equals(ImmutableSet.of(VariantEffect.INTERGENIC_VARIANT))) { - // Put into map under pseudo-identifier "__intergenic__". This ways, intergenic variants - // are written out only once for RefSeq/ENSEMBL and not twice if different genes are - // closest. - refSeqAnnoByRefSeqGene.put("__intergenic__", annotation); - refSeqAnnoByEnsemblGene.put("__intergenic__", annotation); - } else { - final String refseqGeneId = annotation.getTranscript().getGeneID(); - final String ensemblGeneId = - annotation.getTranscript().getAltGeneIDs().get("ENSEMBL_GENE_ID"); - if (refseqGeneId != null && !refSeqAnnoByRefSeqGene.containsKey(refseqGeneId)) { - refSeqAnnoByRefSeqGene.put(refseqGeneId, annotation); - } - if (ensemblGeneId != null && !refSeqAnnoByEnsemblGene.containsKey(ensemblGeneId)) { - refSeqAnnoByEnsemblGene.put(ensemblGeneId, annotation); - } - } - } + extractAnnotations(sortedRefseqAnnos, refSeqAnnoByRefSeqGene, refSeqAnnoByEnsemblGene, true); // Then ENSEMBL - + final List sortedEnsemblAnnos = + AnnoSorting.sortAnnotations(ensemblAnnotationsList, i); final HashMap ensemblAnnoByRefSeqGene = new HashMap<>(); final HashMap ensemblAnnoByEnsemblGene = new HashMap<>(); + extractAnnotations( + sortedEnsemblAnnos, ensemblAnnoByRefSeqGene, ensemblAnnoByEnsemblGene, false); - for (Annotation annotation : sortedEnsemblAnnos) { - if (annotation.getEffects().isEmpty() - || annotation.getEffects().equals(ImmutableSet.of(VariantEffect.INTERGENIC_VARIANT))) { - // Put into map under pseudo-identifier "__intergenic__". This ways, intergenic variants - // are written out only once for RefSeq/ENSEMBL and not twice if different genes are - // closest. - ensemblAnnoByRefSeqGene.put("__intergenic__", annotation); - ensemblAnnoByEnsemblGene.put("__intergenic__", annotation); - } else { - final String refseqGeneId = annotation.getTranscript().getAltGeneIDs().get("ENTREZ_ID"); - final String ensemblGeneId = annotation.getTranscript().getGeneID(); - if (refseqGeneId != null && !ensemblAnnoByRefSeqGene.containsKey(refseqGeneId)) { - ensemblAnnoByRefSeqGene.put(refseqGeneId, annotation); - } - if (ensemblGeneId != null && !ensemblAnnoByEnsemblGene.containsKey(ensemblGeneId)) { - ensemblAnnoByEnsemblGene.put(ensemblGeneId, annotation); - } - } - } - - // Query information in databases. + // Query for frequency/presence information in databases. final String gnomadChrPrefix = "GRCh38".equals(args.getRelease()) ? "chr" : ""; - final DbInfo exacInfo = getDbInfo(conn, args.getRelease(), normalizedVar, EXAC_PREFIX, ""); + final DbInfo exacInfo = + getDbInfo(conn, args.getRelease(), normalizedVar, VcfConstants.EXAC_PREFIX, ""); final DbInfo gnomadExomesInfo = - getDbInfo(conn, args.getRelease(), normalizedVar, GNOMAD_EXOMES_PREFIX, gnomadChrPrefix); + getDbInfo( + conn, + args.getRelease(), + normalizedVar, + VcfConstants.GNOMAD_EXOMES_PREFIX, + gnomadChrPrefix); final DbInfo gnomadGenomesInfo = - getDbInfo(conn, args.getRelease(), normalizedVar, GNOMAD_GENOMES_PREFIX, gnomadChrPrefix); + getDbInfo( + conn, + args.getRelease(), + normalizedVar, + VcfConstants.GNOMAD_GENOMES_PREFIX, + gnomadChrPrefix); final DbInfo thousandGenomesInfo = - getDbInfo(conn, args.getRelease(), normalizedVar, THOUSAND_GENOMES_PREFIX, ""); + getDbInfo( + conn, args.getRelease(), normalizedVar, VcfConstants.THOUSAND_GENOMES_PREFIX, ""); final boolean inClinvar = getClinVarInfo(conn, args.getRelease(), normalizedVar); // Build a list of all gene IDs that we will iterate over later. @@ -463,231 +327,398 @@ private void annotateVariantContext( // Additional information. final String infoStr = "{}"; + // Build per-genotype counts, taking into consideration the sex information from pedigree. final GenotypeCounts gtCounts = GenotypeCounts.buildGenotypeCounts(ctx, i, pedigree, args.getRelease()); - // Write one entry for each gene into the annotated genotype call file. - for (String geneId : geneIds) { - if (doneGeneIds.contains(geneId)) { - continue; // Do not process twice. - } + // Write output record (alsow write out empty one if necessary). + writeOutputRecords( + refDict, + ctx, + gtWriter, + i, + normalizedVar, + varType, + refSeqAnnoByRefSeqGene, + refSeqAnnoByEnsemblGene, + ensemblAnnoByRefSeqGene, + ensemblAnnoByEnsemblGene, + exacInfo, + gnomadExomesInfo, + gnomadGenomesInfo, + thousandGenomesInfo, + inClinvar, + geneIds, + doneGeneIds, + infoStr, + gtCounts); + } + } - // Select both RefSeq and ENSEMBL annotation for the given (RefSeq or ENSEMBL gene ID). - final Annotation refseqAnno; - if (refSeqAnnoByRefSeqGene.containsKey(geneId)) { - refseqAnno = refSeqAnnoByRefSeqGene.get(geneId); - } else if (refSeqAnnoByEnsemblGene.containsKey(geneId)) { - refseqAnno = refSeqAnnoByEnsemblGene.get(geneId); - } else { - refseqAnno = null; - } - final Annotation ensemblAnno; - if (ensemblAnnoByEnsemblGene.containsKey(geneId)) { - ensemblAnno = ensemblAnnoByEnsemblGene.get(geneId); - } else if (ensemblAnnoByRefSeqGene.containsKey(geneId)) { - ensemblAnno = ensemblAnnoByRefSeqGene.get(geneId); - } else { - ensemblAnno = null; - } + private static String getVarType(VariantDescription normalizedVar) { + final String varType; + if ((normalizedVar.getRef().length() == 1) && (normalizedVar.getAlt().length() == 1)) { + varType = "snv"; + } else if (normalizedVar.getRef().length() == normalizedVar.getAlt().length()) { + varType = "mnv"; + } else { + varType = "indel"; + } + return varType; + } - // Mark all gene IDs as done. - if (ensemblAnno != null && ensemblAnno.getTranscript() != null) { - doneGeneIds.add(ensemblAnno.getTranscript().getGeneID()); - if (ensemblAnno.getTranscript().getAltGeneIDs().containsKey("ENTREZ_ID")) { - doneGeneIds.add(ensemblAnno.getTranscript().getAltGeneIDs().get("ENTREZ_ID")); - } + /** + * Write output records with the data built for {@code ctx). + * + * This function will also take care of writing out an empty record in case there were no overlapping genes. + */ + private void writeOutputRecords( + ReferenceDictionary refDict, + VariantContext ctx, + Writer gtWriter, + int i, + VariantDescription normalizedVar, + String varType, + HashMap refSeqAnnoByRefSeqGene, + HashMap refSeqAnnoByEnsemblGene, + HashMap ensemblAnnoByRefSeqGene, + HashMap ensemblAnnoByEnsemblGene, + DbInfo exacInfo, + DbInfo gnomadExomesInfo, + DbInfo gnomadGenomesInfo, + DbInfo thousandGenomesInfo, + boolean inClinvar, + TreeSet geneIds, + TreeSet doneGeneIds, + String infoStr, + GenotypeCounts gtCounts) + throws VarfishAnnotatorException { + // Write one entry for each gene into the annotated genotype call file. + for (String geneId : geneIds) { + if (doneGeneIds.contains(geneId)) { + continue; // Do not process twice. + } + + // Select both RefSeq and ENSEMBL annotation for the given (RefSeq or ENSEMBL gene ID). + final Annotation refseqAnno; + if (refSeqAnnoByRefSeqGene.containsKey(geneId)) { + refseqAnno = refSeqAnnoByRefSeqGene.get(geneId); + } else if (refSeqAnnoByEnsemblGene.containsKey(geneId)) { + refseqAnno = refSeqAnnoByEnsemblGene.get(geneId); + } else { + refseqAnno = null; + } + final Annotation ensemblAnno; + if (ensemblAnnoByEnsemblGene.containsKey(geneId)) { + ensemblAnno = ensemblAnnoByEnsemblGene.get(geneId); + } else if (ensemblAnnoByRefSeqGene.containsKey(geneId)) { + ensemblAnno = ensemblAnnoByRefSeqGene.get(geneId); + } else { + ensemblAnno = null; + } + + // Mark all gene IDs as done. + if (ensemblAnno != null && ensemblAnno.getTranscript() != null) { + doneGeneIds.add(ensemblAnno.getTranscript().getGeneID()); + if (ensemblAnno.getTranscript().getAltGeneIDs().containsKey("ENTREZ_ID")) { + doneGeneIds.add(ensemblAnno.getTranscript().getAltGeneIDs().get("ENTREZ_ID")); } - if (refseqAnno != null && refseqAnno.getTranscript() != null) { - doneGeneIds.add(refseqAnno.getTranscript().getGeneID()); - if (refseqAnno.getTranscript().getAltGeneIDs().containsKey("ENSEMBL_GENE_ID")) { - doneGeneIds.add(refseqAnno.getTranscript().getAltGeneIDs().get("ENSEMBL_GENE_ID")); - } + } + if (refseqAnno != null && refseqAnno.getTranscript() != null) { + doneGeneIds.add(refseqAnno.getTranscript().getGeneID()); + if (refseqAnno.getTranscript().getAltGeneIDs().containsKey("ENSEMBL_GENE_ID")) { + doneGeneIds.add(refseqAnno.getTranscript().getAltGeneIDs().get("ENSEMBL_GENE_ID")); } + } - // Distance to next base of exon. - final int refSeqExonDist = - getDistance(normalizedVar, refseqAnno == null ? null : refseqAnno.getTranscript()); - final int ensemblExonDist = - getDistance(normalizedVar, ensemblAnno == null ? null : ensemblAnno.getTranscript()); - - // Construct output record. - final List gtOutRec = - Lists.newArrayList( - args.getRelease(), - normalizedVar.getChrom(), - refDict.getContigNameToID().get(normalizedVar.getChrom()), - String.valueOf(normalizedVar.getPos() + 1), - String.valueOf(normalizedVar.getPos() + normalizedVar.getRef().length()), - UcscBinning.getContainingBin( - normalizedVar.getPos(), - normalizedVar.getPos() + normalizedVar.getRef().length()), - normalizedVar.getRef(), - normalizedVar.getAlt(), - varType, - args.getCaseId(), - args.getSetId(), - infoStr, - buildGenotypeValue(ctx, i), - String.valueOf(gtCounts.numHomAlt), - String.valueOf(gtCounts.numHomRef), - String.valueOf(gtCounts.numHet), - String.valueOf(gtCounts.numHemiAlt), - String.valueOf(gtCounts.numHemiRef), - // ClinVar - inClinvar ? "TRUE" : "FALSE", - // EXAC - exacInfo.getAfPopmaxStr(), - exacInfo.getHomTotalStr(), - exacInfo.getHetTotalStr(), - exacInfo.getHemiTotalStr(), - // Thousand Genomes - thousandGenomesInfo.getAfPopmaxStr(), - thousandGenomesInfo.getHomTotalStr(), - thousandGenomesInfo.getHetTotalStr(), - thousandGenomesInfo.getHemiTotalStr(), - // gnomAD exomes - gnomadExomesInfo.getAfPopmaxStr(), - gnomadExomesInfo.getHomTotalStr(), - gnomadExomesInfo.getHetTotalStr(), - gnomadExomesInfo.getHemiTotalStr(), - // gnomAD genomes - gnomadGenomesInfo.getAfPopmaxStr(), - gnomadGenomesInfo.getHomTotalStr(), - gnomadGenomesInfo.getHetTotalStr(), - gnomadGenomesInfo.getHemiTotalStr(), - // RefSeq - (refseqAnno == null || refseqAnno.getTranscript() == null) - ? "." - : refseqAnno.getTranscript().getGeneID(), - (refseqAnno == null || refseqAnno.getTranscript() == null) - ? "." - : refseqAnno.getTranscript().getAccession(), - (refseqAnno == null || refseqAnno.getTranscript() == null) - ? "." - : refseqAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", - (refseqAnno == null || refseqAnno.getTranscript() == null) - ? "." - : refseqAnno.getCDSNTChange() == null - ? "." - : (refseqAnno.getTranscript().isCoding() ? "c." : "n.") - + refseqAnno.getCDSNTChange().toHGVSString(AminoAcidCode.ONE_LETTER), - refseqAnno == null - ? "." - : refseqAnno.getProteinChange() == null - ? "." - : "p." - + refseqAnno - .getProteinChange() - .withOnlyPredicted(false) - .toHGVSString(AminoAcidCode.ONE_LETTER), - (refseqAnno == null || refseqAnno.getTranscript() == null) - ? "{}" - : buildEffectsValue(refseqAnno.getEffects()), - (refSeqExonDist >= 0) ? Integer.toString(refSeqExonDist) : ".", - // ENSEMBL - (ensemblAnno == null || ensemblAnno.getTranscript() == null) - ? "." - : ensemblAnno.getTranscript().getGeneID(), - (ensemblAnno == null || ensemblAnno.getTranscript() == null) - ? "." - : ensemblAnno.getTranscript().getAccession(), - (ensemblAnno == null || ensemblAnno.getTranscript() == null) - ? "." - : ensemblAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", - (ensemblAnno == null || ensemblAnno.getTranscript() == null) - ? "." - : ensemblAnno.getCDSNTChange() == null - ? "." - : (ensemblAnno.getTranscript().isCoding() ? "c." : "n.") - + ensemblAnno.getCDSNTChange().toHGVSString(AminoAcidCode.ONE_LETTER), - (ensemblAnno == null || ensemblAnno.getTranscript() == null) - ? "." - : ensemblAnno.getProteinChange() == null - ? "." - : "p." - + ensemblAnno - .getProteinChange() - .withOnlyPredicted(false) - .toHGVSString(AminoAcidCode.ONE_LETTER), - (ensemblAnno == null || ensemblAnno.getTranscript() == null) - ? "{}" - : buildEffectsValue(ensemblAnno.getEffects()), - (ensemblExonDist >= 0) ? Integer.toString(ensemblExonDist) : "."); - // Write record to output stream. - try { - gtWriter.append(Joiner.on("\t").join(gtOutRec) + "\n"); - } catch (IOException e) { - throw new VarfishAnnotatorException("Problem writing to genotypes call file.", e); - } + // Distance to next base of exon. + final int refSeqExonDist = + getDistance(normalizedVar, refseqAnno == null ? null : refseqAnno.getTranscript()); + final int ensemblExonDist = + getDistance(normalizedVar, ensemblAnno == null ? null : ensemblAnno.getTranscript()); + + // Construct output record. + final List gtOutRec = + constructOutputRecord( + refDict, + ctx, + i, + normalizedVar, + varType, + exacInfo, + gnomadExomesInfo, + gnomadGenomesInfo, + thousandGenomesInfo, + inClinvar, + infoStr, + gtCounts, + refseqAnno, + ensemblAnno, + refSeqExonDist, + ensemblExonDist); + // Write record to output stream. + try { + gtWriter.append(Joiner.on("\t").join(gtOutRec) + "\n"); + } catch (IOException e) { + throw new VarfishAnnotatorException("Problem writing to genotypes call file.", e); } + } - if (geneIds.isEmpty()) { - // Construct output record. - final List gtOutRec = - Lists.newArrayList( - args.getRelease(), - normalizedVar.getChrom(), - refDict.getContigNameToID().get(normalizedVar.getChrom()), - String.valueOf(normalizedVar.getPos() + 1), - String.valueOf(normalizedVar.getPos() + normalizedVar.getRef().length()), - UcscBinning.getContainingBin( - normalizedVar.getPos(), - normalizedVar.getPos() + normalizedVar.getRef().length()), - normalizedVar.getRef(), - normalizedVar.getAlt(), - varType, - args.getCaseId(), - args.getSetId(), - infoStr, - buildGenotypeValue(ctx, i), - String.valueOf(gtCounts.numHomAlt), - String.valueOf(gtCounts.numHomRef), - String.valueOf(gtCounts.numHet), - String.valueOf(gtCounts.numHemiAlt), - String.valueOf(gtCounts.numHemiRef), - // ClinVar - inClinvar ? "TRUE" : "FALSE", - // EXAC - exacInfo.getAfPopmaxStr(), - exacInfo.getHomTotalStr(), - exacInfo.getHetTotalStr(), - exacInfo.getHemiTotalStr(), - // Thousand Genomes - thousandGenomesInfo.getAfPopmaxStr(), - thousandGenomesInfo.getHomTotalStr(), - thousandGenomesInfo.getHetTotalStr(), - thousandGenomesInfo.getHemiTotalStr(), - // gnomAD exomes - gnomadExomesInfo.getAfPopmaxStr(), - gnomadExomesInfo.getHomTotalStr(), - gnomadExomesInfo.getHetTotalStr(), - gnomadExomesInfo.getHemiTotalStr(), - // gnomAD genomes - gnomadGenomesInfo.getAfPopmaxStr(), - gnomadGenomesInfo.getHomTotalStr(), - gnomadGenomesInfo.getHetTotalStr(), - gnomadGenomesInfo.getHemiTotalStr(), - // RefSeq - ".", - ".", - ".", - ".", - ".", - ".", - ".", - // ENSEMBL - ".", - ".", - ".", - ".", - ".", - ".", - "."); - // Write record to output stream. - try { - gtWriter.append(Joiner.on("\t").join(gtOutRec) + "\n"); - } catch (IOException e) { - throw new VarfishAnnotatorException("Problem writing to genotypes call file.", e); + if (geneIds.isEmpty()) { + // Construct output record. + final List gtOutRec = + constructEmptyOutputRecord( + refDict, + ctx, + i, + normalizedVar, + varType, + exacInfo, + gnomadExomesInfo, + gnomadGenomesInfo, + thousandGenomesInfo, + inClinvar, + infoStr, + gtCounts); + // Write record to output stream. + try { + gtWriter.append(Joiner.on("\t").join(gtOutRec) + "\n"); + } catch (IOException e) { + throw new VarfishAnnotatorException("Problem writing to genotypes call file.", e); + } + } + } + + private List constructOutputRecord( + ReferenceDictionary refDict, + VariantContext ctx, + int i, + VariantDescription normalizedVar, + String varType, + DbInfo exacInfo, + DbInfo gnomadExomesInfo, + DbInfo gnomadGenomesInfo, + DbInfo thousandGenomesInfo, + boolean inClinvar, + String infoStr, + GenotypeCounts gtCounts, + Annotation refseqAnno, + Annotation ensemblAnno, + int refSeqExonDist, + int ensemblExonDist) { + return Lists.newArrayList( + args.getRelease(), + normalizedVar.getChrom(), + refDict.getContigNameToID().get(normalizedVar.getChrom()), + String.valueOf(normalizedVar.getPos() + 1), + String.valueOf(normalizedVar.getPos() + normalizedVar.getRef().length()), + UcscBinning.getContainingBin( + normalizedVar.getPos(), normalizedVar.getPos() + normalizedVar.getRef().length()), + normalizedVar.getRef(), + normalizedVar.getAlt(), + varType, + args.getCaseId(), + args.getSetId(), + infoStr, + buildGenotypeValue(ctx, i), + String.valueOf(gtCounts.numHomAlt), + String.valueOf(gtCounts.numHomRef), + String.valueOf(gtCounts.numHet), + String.valueOf(gtCounts.numHemiAlt), + String.valueOf(gtCounts.numHemiRef), + // ClinVar + inClinvar ? "TRUE" : "FALSE", + // EXAC + exacInfo.getAfPopmaxStr(), + exacInfo.getHomTotalStr(), + exacInfo.getHetTotalStr(), + exacInfo.getHemiTotalStr(), + // Thousand Genomes + thousandGenomesInfo.getAfPopmaxStr(), + thousandGenomesInfo.getHomTotalStr(), + thousandGenomesInfo.getHetTotalStr(), + thousandGenomesInfo.getHemiTotalStr(), + // gnomAD exomes + gnomadExomesInfo.getAfPopmaxStr(), + gnomadExomesInfo.getHomTotalStr(), + gnomadExomesInfo.getHetTotalStr(), + gnomadExomesInfo.getHemiTotalStr(), + // gnomAD genomes + gnomadGenomesInfo.getAfPopmaxStr(), + gnomadGenomesInfo.getHomTotalStr(), + gnomadGenomesInfo.getHetTotalStr(), + gnomadGenomesInfo.getHemiTotalStr(), + // RefSeq + (refseqAnno == null || refseqAnno.getTranscript() == null) + ? "." + : refseqAnno.getTranscript().getGeneID(), + (refseqAnno == null || refseqAnno.getTranscript() == null) + ? "." + : refseqAnno.getTranscript().getAccession(), + (refseqAnno == null || refseqAnno.getTranscript() == null) + ? "." + : refseqAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", + (refseqAnno == null || refseqAnno.getTranscript() == null) + ? "." + : refseqAnno.getCDSNTChange() == null + ? "." + : (refseqAnno.getTranscript().isCoding() ? "c." : "n.") + + refseqAnno.getCDSNTChange().toHGVSString(AminoAcidCode.ONE_LETTER), + refseqAnno == null + ? "." + : refseqAnno.getProteinChange() == null + ? "." + : "p." + + refseqAnno + .getProteinChange() + .withOnlyPredicted(false) + .toHGVSString(AminoAcidCode.ONE_LETTER), + (refseqAnno == null || refseqAnno.getTranscript() == null) + ? "{}" + : buildEffectsValue(refseqAnno.getEffects()), + (refSeqExonDist >= 0) ? Integer.toString(refSeqExonDist) : ".", + // ENSEMBL + (ensemblAnno == null || ensemblAnno.getTranscript() == null) + ? "." + : ensemblAnno.getTranscript().getGeneID(), + (ensemblAnno == null || ensemblAnno.getTranscript() == null) + ? "." + : ensemblAnno.getTranscript().getAccession(), + (ensemblAnno == null || ensemblAnno.getTranscript() == null) + ? "." + : ensemblAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", + (ensemblAnno == null || ensemblAnno.getTranscript() == null) + ? "." + : ensemblAnno.getCDSNTChange() == null + ? "." + : (ensemblAnno.getTranscript().isCoding() ? "c." : "n.") + + ensemblAnno.getCDSNTChange().toHGVSString(AminoAcidCode.ONE_LETTER), + (ensemblAnno == null || ensemblAnno.getTranscript() == null) + ? "." + : ensemblAnno.getProteinChange() == null + ? "." + : "p." + + ensemblAnno + .getProteinChange() + .withOnlyPredicted(false) + .toHGVSString(AminoAcidCode.ONE_LETTER), + (ensemblAnno == null || ensemblAnno.getTranscript() == null) + ? "{}" + : buildEffectsValue(ensemblAnno.getEffects()), + (ensemblExonDist >= 0) ? Integer.toString(ensemblExonDist) : "."); + } + + private List constructEmptyOutputRecord( + ReferenceDictionary refDict, + VariantContext ctx, + int i, + VariantDescription normalizedVar, + String varType, + DbInfo exacInfo, + DbInfo gnomadExomesInfo, + DbInfo gnomadGenomesInfo, + DbInfo thousandGenomesInfo, + boolean inClinvar, + String infoStr, + GenotypeCounts gtCounts) { + return Lists.newArrayList( + args.getRelease(), + normalizedVar.getChrom(), + refDict.getContigNameToID().get(normalizedVar.getChrom()), + String.valueOf(normalizedVar.getPos() + 1), + String.valueOf(normalizedVar.getPos() + normalizedVar.getRef().length()), + UcscBinning.getContainingBin( + normalizedVar.getPos(), normalizedVar.getPos() + normalizedVar.getRef().length()), + normalizedVar.getRef(), + normalizedVar.getAlt(), + varType, + args.getCaseId(), + args.getSetId(), + infoStr, + buildGenotypeValue(ctx, i), + String.valueOf(gtCounts.numHomAlt), + String.valueOf(gtCounts.numHomRef), + String.valueOf(gtCounts.numHet), + String.valueOf(gtCounts.numHemiAlt), + String.valueOf(gtCounts.numHemiRef), + // ClinVar + inClinvar ? "TRUE" : "FALSE", + // EXAC + exacInfo.getAfPopmaxStr(), + exacInfo.getHomTotalStr(), + exacInfo.getHetTotalStr(), + exacInfo.getHemiTotalStr(), + // Thousand Genomes + thousandGenomesInfo.getAfPopmaxStr(), + thousandGenomesInfo.getHomTotalStr(), + thousandGenomesInfo.getHetTotalStr(), + thousandGenomesInfo.getHemiTotalStr(), + // gnomAD exomes + gnomadExomesInfo.getAfPopmaxStr(), + gnomadExomesInfo.getHomTotalStr(), + gnomadExomesInfo.getHetTotalStr(), + gnomadExomesInfo.getHemiTotalStr(), + // gnomAD genomes + gnomadGenomesInfo.getAfPopmaxStr(), + gnomadGenomesInfo.getHomTotalStr(), + gnomadGenomesInfo.getHetTotalStr(), + gnomadGenomesInfo.getHemiTotalStr(), + // RefSeq + ".", + ".", + ".", + ".", + ".", + ".", + ".", + // ENSEMBL + ".", + ".", + ".", + ".", + ".", + ".", + "."); + } + + /** + * Collect RefSeq or ENSEMBL annotations per gene. + * + *

See comment in {@link #annotateVariantContext} on the reasoning behind "__integergenic__". + * + * @param sortedAnnos Sorted annotations to process. + * @param annoByRefSeqGene Annotations by RefSeq gene are written here. + * @param annoByEnsemblGene Annotations by ENSEMBL gene ID. + * @param isRefSeq Whether to process as refseq or + */ + private static void extractAnnotations( + List sortedAnnos, + HashMap annoByRefSeqGene, + HashMap annoByEnsemblGene, + boolean isRefSeq) { + for (Annotation annotation : sortedAnnos) { + if (annotation.getEffects().isEmpty() + || annotation.getEffects().equals(ImmutableSet.of(VariantEffect.INTERGENIC_VARIANT))) { + // Put into map under pseudo-identifier "__intergenic__". This ways, intergenic variants + // are written out only once for RefSeq/ENSEMBL and not twice if different genes are + // closest. + annoByRefSeqGene.put("__intergenic__", annotation); + annoByEnsemblGene.put("__intergenic__", annotation); + } else { + final String refseqGeneId; + final String ensemblGeneId; + if (isRefSeq) { + refseqGeneId = annotation.getTranscript().getGeneID(); + ensemblGeneId = annotation.getTranscript().getAltGeneIDs().get("ENSEMBL_GENE_ID"); + } else { + refseqGeneId = annotation.getTranscript().getAltGeneIDs().get("ENTREZ_ID"); + ensemblGeneId = annotation.getTranscript().getGeneID(); + } + if (refseqGeneId != null && !annoByRefSeqGene.containsKey(refseqGeneId)) { + annoByRefSeqGene.put(refseqGeneId, annotation); + } + if (ensemblGeneId != null && !annoByEnsemblGene.containsKey(ensemblGeneId)) { + annoByEnsemblGene.put(ensemblGeneId, annotation); } } } @@ -755,33 +786,6 @@ private ImmutableList silentBuildAnnotations( } } - private List sortAnnos( - ImmutableList refseqAnnotationsList, int i) { - if (refseqAnnotationsList == null) { - return new ArrayList<>(); - } else { - return refseqAnnotationsList - .get(i - 1) - .getAnnotations() - .stream() - .sorted( - Comparator.comparing( - Annotation::getMostPathogenicVarType, - (t1, t2) -> { - if (t1 == null && t2 == null) { - return 0; - } else if (t2 == null) { - return -1; - } else if (t1 == null) { - return 1; - } else { - return t1.compareTo(t2); - } - })) - .collect(Collectors.toList()); - } - } - /** * (Overly) simple helper for escaping {@code s}. * @@ -885,7 +889,8 @@ private DbInfo getDbInfo( // Return "not found" if looking for ExAC or Thousand Genomes but not using GRCh37. // These are only available for GRCh37. if (!"GRCh37".equals(release) - && ImmutableList.of(EXAC_PREFIX, THOUSAND_GENOMES_PREFIX).contains(prefix)) { + && ImmutableList.of(VcfConstants.EXAC_PREFIX, VcfConstants.THOUSAND_GENOMES_PREFIX) + .contains(prefix)) { return DbInfo.nullValue(); } diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsArgs.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsArgs.java index f00da25..5901eb1 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsArgs.java +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsArgs.java @@ -104,9 +104,11 @@ public final class AnnotateSvsArgs { @Parameter(names = "--write-bnd-mate", description = "Write out BND mates (true, false, or auto)") private String writeBndMates = "auto"; - public boolean isHelp() { - return help; - } + @Parameter( + names = "skip-filters", + description = + "Skip records with this filter value (comma-separated list), default is 'LowQual'") + private String skipFilters = "LowQual"; public String getRefseqSerPath() { return refseqSerPath; @@ -180,6 +182,10 @@ public String getWriteBndMates() { return writeBndMates; } + public String getSkipFilters() { + return skipFilters; + } + @Override public String toString() { return "AnnotateSvsArgs{" @@ -237,6 +243,9 @@ public String toString() { + ", writeBndMates=\'" + writeBndMates + "\'" + + ", skipFilters=\'" + + skipFilters + + "\'" + '}'; } } diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java index c79bdb5..2684f5d 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/AnnotateSvsVcf.java @@ -1,20 +1,18 @@ package com.github.bihealth.varfish_annotator.annotate_svs; import com.github.bihealth.varfish_annotator.VarfishAnnotatorException; -import com.github.bihealth.varfish_annotator.annotate.GenomeVersion; -import com.github.bihealth.varfish_annotator.annotate.IncompatibleVcfException; -import com.github.bihealth.varfish_annotator.annotate.VcfCompatibilityChecker; -import com.github.bihealth.varfish_annotator.init_db.DbReleaseUpdater; +import com.github.bihealth.varfish_annotator.annotate.AnnotateVcf; +import com.github.bihealth.varfish_annotator.checks.IncompatibleVcfException; +import com.github.bihealth.varfish_annotator.checks.VcfCompatibilityChecker; +import com.github.bihealth.varfish_annotator.data.GenomeVersion; +import com.github.bihealth.varfish_annotator.db.DbInfoWriterHelper; import com.github.bihealth.varfish_annotator.utils.*; import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Lists; -import com.google.common.primitives.Ints; import de.charite.compbio.jannovar.annotation.SVAnnotation; import de.charite.compbio.jannovar.annotation.SVAnnotations; -import de.charite.compbio.jannovar.annotation.VariantEffect; import de.charite.compbio.jannovar.data.JannovarData; import de.charite.compbio.jannovar.data.JannovarDataSerializer; import de.charite.compbio.jannovar.data.SerializationException; @@ -30,10 +28,7 @@ import de.charite.compbio.jannovar.pedigree.PedFileReader; import de.charite.compbio.jannovar.pedigree.PedParseException; import de.charite.compbio.jannovar.pedigree.Pedigree; -import de.charite.compbio.jannovar.reference.SVDescription.Type; import de.charite.compbio.jannovar.reference.SVGenomeVariant; -import htsjdk.variant.variantcontext.Allele; -import htsjdk.variant.variantcontext.Genotype; import htsjdk.variant.variantcontext.VariantContext; import htsjdk.variant.variantcontext.VariantContextBuilder; import htsjdk.variant.vcf.VCFFileReader; @@ -41,56 +36,16 @@ import java.nio.file.Files; import java.nio.file.Paths; import java.sql.*; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; import java.util.UUID; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; /** Implementation of the annotate-svs command. */ public final class AnnotateSvsVcf { - private static final String FEATURE_CHROM2_COLUMNS = "chrom2-columns"; - private static final String FEATURE_DBCOUNTS_COLUMNS = "dbcounts-columns"; - private static final String FEATURE_SUPPRESS_CARRIER_COUNTS = "suppress-carriers-in-info"; - - /** Header fields for the SV and genotype file (part 1). */ - private static final ImmutableList HEADERS_GT_PART_1 = - ImmutableList.of("release", "chromosome", "chromosome_no", "bin"); - /** Header fields for the SV and genotype file (part 2, optional). */ - private static final ImmutableList HEADERS_GT_PART_2 = - ImmutableList.of("chromosome2", "chromosome_no2", "bin2", "pe_orientation"); - /** Header fields for the SV and genotype file (part 3). */ - private static final ImmutableList HEADERS_GT_PART_3 = - ImmutableList.of( - "start", - "end", - "start_ci_left", - "start_ci_right", - "end_ci_left", - "end_ci_right", - "case_id", - "set_id", - "sv_uuid", - "caller", - "sv_type", - "sv_sub_type", - "info"); - /** Header fields for the SV and genotype file (part 4, optional). */ - private static final ImmutableList HEADERS_GT_PART_4 = - ImmutableList.of("num_hom_alt", "num_hom_ref", "num_het", "num_hemi_alt", "num_hemi_ref"); - /** Header fields for the SV and genotype file (part 5). */ - private static final ImmutableList HEADERS_GT_PART_5 = ImmutableList.of("genotype"); - /** Header fields for the gene-wise feature effects file. */ public static final ImmutableList HEADERS_FEATURE_EFFECTS = ImmutableList.of( @@ -192,8 +147,9 @@ public void run() { JannovarData refseqJvData = new JannovarDataSerializer(args.getRefseqSerPath()).load(); JannovarData ensemblJvData = new JannovarDataSerializer(args.getEnsemblSerPath()).load(); annotateSvVcf( - conn, genomeVersion, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter); - writeDbInfos(conn, dbInfoBufWriter); + genomeVersion, reader, refseqJvData, ensemblJvData, gtWriter, featureEffectsWriter); + new DbInfoWriterHelper() + .writeDbInfos(conn, dbInfoBufWriter, args.getRelease(), AnnotateVcf.class); } catch (SQLException e) { System.err.println("Problem with database connection"); e.printStackTrace(); @@ -235,7 +191,9 @@ private void checkOptOutFeatures() { final ImmutableList supportedFeatures = ImmutableList.of( - FEATURE_CHROM2_COLUMNS, FEATURE_DBCOUNTS_COLUMNS, FEATURE_SUPPRESS_CARRIER_COUNTS); + GtRecordBuilder.FEATURE_CHROM2_COLUMNS, + GtRecordBuilder.FEATURE_DBCOUNTS_COLUMNS, + GtRecordBuilder.FEATURE_SUPPRESS_CARRIER_COUNTS); final String features[] = args.getOptOutFeatures().split(","); boolean allGood = true; for (String feature : features) { @@ -249,52 +207,9 @@ private void checkOptOutFeatures() { } } - /** - * Write information about used databases to TSV file. - * - * @param conn Database connection to get the information from. - * @param dbInfoWriter Writer for database information. - * @throws VarfishAnnotatorException in case of problems - */ - private void writeDbInfos(Connection conn, BufferedWriter dbInfoWriter) - throws VarfishAnnotatorException { - try { - dbInfoWriter.write("genomebuild\tdb_name\trelease\n"); - } catch (IOException e) { - throw new VarfishAnnotatorException("Could not write out headers", e); - } - - final String query = - "SELECT db_name, release FROM " + DbReleaseUpdater.TABLE_NAME + " ORDER BY db_name"; - try { - final PreparedStatement stmt = conn.prepareStatement(query); - - try (ResultSet rs = stmt.executeQuery()) { - while (true) { - if (!rs.next()) { - return; - } - final String versionString; - if (rs.getString(1).equals("varfish-annotator")) { - versionString = AnnotateSvsVcf.class.getPackage().getSpecificationVersion(); - } else { - versionString = rs.getString(2); - } - dbInfoWriter.write( - args.getRelease() + "\t" + rs.getString(1) + "\t" + versionString + "\n"); - } - } - } catch (SQLException e) { - throw new VarfishAnnotatorException("Problem with querying database", e); - } catch (IOException e) { - throw new VarfishAnnotatorException("Could not write TSV info", e); - } - } - /** * Perform the variant annotation. * - * @param conn Database connection for getting ExAC/ClinVar information from. * @param reader Reader for the input VCF file. * @param refseqJv Deserialized RefSeq transcript database for Jannovar. * @param ensemblJv Deserialized ENSEMBL transcript database for Jannovar. @@ -303,7 +218,6 @@ private void writeDbInfos(Connection conn, BufferedWriter dbInfoWriter) * @throws VarfishAnnotatorException in case of problems */ private void annotateSvVcf( - Connection conn, GenomeVersion genomeVersion, VCFFileReader reader, JannovarData refseqJv, @@ -311,21 +225,24 @@ private void annotateSvVcf( Writer gtWriter, Writer featureEffectsWriter) throws VarfishAnnotatorException { + // Get list of filter values to skip. + final ImmutableSet skipFilters = ImmutableSet.copyOf(args.getSkipFilters().split(",")); + + // Helpers for building record for `.gts.tsv` and `.feature-effects.tsv` records. + final GtRecordBuilder gtRecordBuilder = + new GtRecordBuilder( + args.getRelease(), + args.getDefaultSvMethod(), + args.getOptOutFeatures(), + args.getCaseId(), + args.getSetId(), + pedigree); + final FeatureEffectsRecordBuilder feRecordBuilder = + new FeatureEffectsRecordBuilder(args.getCaseId(), args.getSetId()); // Write out header. try { - // (Conditionally) write genotype header. - final List headers = new ArrayList<>(); - headers.addAll(HEADERS_GT_PART_1); - if (!args.getOptOutFeatures().contains(FEATURE_CHROM2_COLUMNS)) { - headers.addAll(HEADERS_GT_PART_2); - } - headers.addAll(HEADERS_GT_PART_3); - if (!args.getOptOutFeatures().contains(FEATURE_DBCOUNTS_COLUMNS)) { - headers.addAll(HEADERS_GT_PART_4); - } - headers.addAll(HEADERS_GT_PART_5); - gtWriter.append(Joiner.on("\t").join(headers) + "\n"); + gtWriter.append(Joiner.on("\t").join(gtRecordBuilder.getHeaders()) + "\n"); // Write feature-effects header. featureEffectsWriter.append(Joiner.on("\t").join(HEADERS_FEATURE_EFFECTS) + "\n"); } catch (IOException e) { @@ -343,57 +260,103 @@ private void annotateSvVcf( ensemblJv.getChromosomes(), new Options(false, AminoAcidCode.ONE_LETTER, false, false, false, false, false)); - // Collect names of skipped contigs. + // Collect names of skipped contigs so we only create one log skipping them once. Set skippedContigs = new HashSet<>(); String prevChr = null; for (VariantContext ctx : reader) { - if (ctx.getFilters().contains("LowQual")) { - // TODO: make this configurable - continue; // skip low-quality SVs - } - // Skip DRAGEN "reference" lines; they have no alternate allele. - if (ctx.getNAlleles() == 1) { + if (skipRecord(ctx, skipFilters, skippedContigs)) { continue; - } - - // Check whether contigs should be skipped. - if (skippedContigs.contains(ctx.getContig())) { - continue; // skip silently - } else if (!ctx.getContig().matches(args.getContigRegex())) { - System.err.println("Skipping contig " + ctx.getContig()); - skippedContigs.add(ctx.getContig()); - continue; - } - - if (!ctx.getContig().equals(prevChr)) { + } else if (!ctx.getContig().equals(prevChr)) { System.err.println("Now on contig " + ctx.getContig()); } + annotateVariantContext( - refseqAnnotator, ensemblAnnotator, ctx, genomeVersion, gtWriter, featureEffectsWriter); + refseqAnnotator, + ensemblAnnotator, + ctx, + genomeVersion, + gtRecordBuilder, + feRecordBuilder, + gtWriter, + featureEffectsWriter); // Maybe generate mate's record and write it out. - final boolean isBnd = "BND".equals(ctx.getAttributeAsString("SVTYPE", "")); - final boolean isDelly = ctx.getAttributeAsString("SVMETHOD", "").startsWith("EMBL.DELLYv"); - if (isBnd - && (args.getWriteBndMates().equals("true") - || args.getWriteBndMates().equals("auto") && isDelly)) { - final VariantContext mateCtx = buildMateCtx(ctx); - if (mateCtx != null) { - annotateVariantContext( - refseqAnnotator, - ensemblAnnotator, - mateCtx, - genomeVersion, - gtWriter, - featureEffectsWriter); - } - } + maybeWriteOutBndMate( + genomeVersion, + gtRecordBuilder, + feRecordBuilder, + gtWriter, + featureEffectsWriter, + refseqAnnotator, + ensemblAnnotator, + ctx); prevChr = ctx.getContig(); } } + /** + * Write out BND mate record if the SV caller does not write it out. + * + *

Currently, we have explicit support for Delly2, Manta, and DRAGEN (which creates output + * equivalent to Manta). In the case of "auto" mode for BND mate generation, we write out if the + * method is Delly2 only. + */ + private void maybeWriteOutBndMate( + GenomeVersion genomeVersion, + GtRecordBuilder gtRecordBuilder, + FeatureEffectsRecordBuilder feRecordBuilder, + Writer gtWriter, + Writer featureEffectsWriter, + VariantContextAnnotator refseqAnnotator, + VariantContextAnnotator ensemblAnnotator, + VariantContext ctx) + throws VarfishAnnotatorException { + final boolean isBnd = "BND".equals(ctx.getAttributeAsString("SVTYPE", "")); + final boolean isDelly = ctx.getAttributeAsString("SVMETHOD", "").startsWith("EMBL.DELLYv"); + if (isBnd + && (args.getWriteBndMates().equals("true") + || args.getWriteBndMates().equals("auto") && isDelly)) { + final VariantContext mateCtx = buildMateCtx(ctx); + if (mateCtx != null) { + annotateVariantContext( + refseqAnnotator, + ensemblAnnotator, + mateCtx, + genomeVersion, + gtRecordBuilder, + feRecordBuilder, + gtWriter, + featureEffectsWriter); + } + } + } + + /** Handle logic for skipping a record, based on contig regular expressions, filters, etc. */ + private boolean skipRecord( + VariantContext ctx, ImmutableSet skipFilters, Set skippedContigs) { + // Skip filtered SVs, e.g., LowQual + if (ctx.getFilters().stream().anyMatch((String ft) -> skipFilters.contains(ft))) { + return true; + } + // Skip DRAGEN "reference" lines; they have no alternate allele. + if (ctx.getNAlleles() == 1) { + return true; + } + + // Check whether contigs should be skipped. + if (skippedContigs.contains(ctx.getContig())) { + return true; // skip silently + } else if (!ctx.getContig().matches(args.getContigRegex())) { + System.err.println("Skipping contig " + ctx.getContig()); + skippedContigs.add(ctx.getContig()); + return true; + } + + return false; + } + /** * Build the {@link VariantContext} for the mate of ctx. * @@ -437,8 +400,6 @@ private VariantContext buildMateCtx(VariantContext ctx) { final String leftBracket = m.group(2); final String chrom = m.group(3); final int pos = Integer.valueOf(m.group(4)); - // final String rightBracket = m.group(5); - // final String trailing = m.group(6); boolean hasLeading = leading != null; boolean leftOpen = "]".equals(leftBracket); @@ -482,6 +443,8 @@ private void annotateVariantContext( VariantContextAnnotator ensemblAnnotator, VariantContext ctx, GenomeVersion genomeVersion, + GtRecordBuilder gtRecordBuilder, + FeatureEffectsRecordBuilder feRecordBuilder, Writer gtWriter, Writer featureEffectsWriter) throws VarfishAnnotatorException { @@ -496,8 +459,10 @@ private void annotateVariantContext( final UUID variantId = nextUuid(); // Get annotations sorted descendingly by variant effect. - final List sortedRefseqAnnos = sortAnnos(refseqAnnotationsList, i); - final List sortedEnsemblAnnos = sortAnnos(ensemblAnnotationsList, i); + final List sortedRefseqAnnos = + AnnoSorting.sortSvAnnos(refseqAnnotationsList, i); + final List sortedEnsemblAnnos = + AnnoSorting.sortSvAnnos(ensemblAnnotationsList, i); // Build `SVGenomeVariant` from `ctx` regardless of any annotation. final SVGenomeVariant svGenomeVar; @@ -511,7 +476,8 @@ private void annotateVariantContext( } // Write out record with the genotype. - final List gtOutRec = buildGtRecord(variantId, svGenomeVar, ctx, genomeVersion, i); + final List gtOutRec = + gtRecordBuilder.buildRecord(variantId, svGenomeVar, ctx, genomeVersion, i); try { gtWriter.append(Joiner.on("\t").useForNull(".").join(gtOutRec) + "\n"); } catch (IOException e) { @@ -524,67 +490,15 @@ private void annotateVariantContext( continue; // short-circuit } - // Collecting RefSeq and ENSEMBL annotations per gene. We collect the variants by gene for - // RefSeq in the simplest way possible (mapping to ENSEMBL gene by using HGNC annotation from - // Jannovar). Further, if either only yields one gene we force it to be the same as the - // (lexicographically first) gene of the other. - // - // Intergenic variants are skipped. - - // Start out with the RefSeq annotations - final HashMap refseqAnnoByGene = new HashMap<>(); - for (SVAnnotation annotation : sortedRefseqAnnos) { - if (annotation.getTranscript() == null) { - continue; // skip, no transcript - } - String geneId = annotation.getTranscript().getAltGeneIDs().get("ENSEMBL_GENE_ID"); - if (geneId == null) { - geneId = annotation.getTranscript().getGeneID(); - } - if (!annotation.getEffects().contains(VariantEffect.INTERGENIC_VARIANT) - && !refseqAnnoByGene.containsKey(geneId)) { - refseqAnnoByGene.put(geneId, annotation); - } - } - - // Now, for the ENSEMBL annotations - final HashMap ensemblAnnoByGene = new HashMap<>(); - for (SVAnnotation annotation : sortedEnsemblAnnos) { - if (annotation.getTranscript() == null) { - continue; // skip, no transcript - } - final String geneId = annotation.getTranscript().getGeneID(); - if (!annotation.getEffects().contains(VariantEffect.INTERGENIC_VARIANT) - && !ensemblAnnoByGene.containsKey(geneId)) { - ensemblAnnoByGene.put(geneId, annotation); - } - } - - // Match RefSeq and ENSEMBL annotations - final TreeSet geneIds = new TreeSet<>(); - if (refseqAnnoByGene.size() == 1 && ensemblAnnoByGene.size() > 0) { - geneIds.addAll(ensemblAnnoByGene.keySet()); - final String keyEnsembl = ensemblAnnoByGene.keySet().iterator().next(); - final String keyRefseq = refseqAnnoByGene.keySet().iterator().next(); - refseqAnnoByGene.put(keyEnsembl, refseqAnnoByGene.get(keyRefseq)); - } else if (refseqAnnoByGene.size() > 0 && ensemblAnnoByGene.size() == 1) { - geneIds.addAll(refseqAnnoByGene.keySet()); - final String keyEnsembl = ensemblAnnoByGene.keySet().iterator().next(); - final String keyRefseq = refseqAnnoByGene.keySet().iterator().next(); - ensemblAnnoByGene.put(keyRefseq, ensemblAnnoByGene.get(keyEnsembl)); - } else { - geneIds.addAll(ensemblAnnoByGene.keySet()); - geneIds.addAll(refseqAnnoByGene.keySet()); - } - // Write one entry for each gene into the feature effect call file. - for (String geneId : geneIds) { + final FeatureEffectsRecordBuilder.Result feResult = + feRecordBuilder.buildAnnosByDb(sortedEnsemblAnnos, sortedRefseqAnnos); + for (String geneId : feResult.getGeneIds()) { List featureEffectOutRec = - buildFeatureEffectRecord( - svGenomeVar, + feRecordBuilder.buildRecord( variantId, - refseqAnnoByGene.get(geneId), - ensemblAnnoByGene.get(geneId)); + feResult.getRefseqAnnoByGene().get(geneId), + feResult.getEnsemblAnnoByGene().get(geneId)); try { featureEffectsWriter.append( Joiner.on("\t").useForNull(".").join(featureEffectOutRec) + "\n"); @@ -595,300 +509,6 @@ private void annotateVariantContext( } } - /** Buidl string list with the information for the genotype call record. */ - private List buildGtRecord( - UUID variantId, - SVGenomeVariant svGenomeVar, - VariantContext ctx, - GenomeVersion genomeVersion, - int alleleNo) { - String svMethod = ctx.getCommonInfo().getAttributeAsString("SVMETHOD", null); - if (svMethod == null) { - svMethod = args.getDefaultSvMethod() == null ? "." : args.getDefaultSvMethod(); - } - - final int bin; - final int bin2; - final int pos2; - if (svGenomeVar.getType() == Type.BND) { - pos2 = svGenomeVar.getPos() + 1; - bin = UcscBinning.getContainingBin(svGenomeVar.getPos(), svGenomeVar.getPos() + 1); - bin2 = UcscBinning.getContainingBin(pos2, pos2 + 1); - } else { - pos2 = svGenomeVar.getPos2(); - bin = UcscBinning.getContainingBin(svGenomeVar.getPos(), pos2); - bin2 = bin; - } - - final String contigName = - (genomeVersion == GenomeVersion.HG19) - ? svGenomeVar.getChrName().replaceFirst("chr", "") - : svGenomeVar.getChrName(); - final String contigName2 = - (genomeVersion == GenomeVersion.HG19) - ? svGenomeVar.getChr2Name().replaceFirst("chr", "") - : svGenomeVar.getChr2Name(); - - final ImmutableList.Builder result = ImmutableList.builder(); - // part 1 - result.add( - args.getRelease(), - contigName, - svGenomeVar.getChr(), - UcscBinning.getContainingBin(svGenomeVar.getPos(), pos2)); - // optional part 2 - if (!args.getOptOutFeatures().contains(FEATURE_CHROM2_COLUMNS)) { - final String peOrientation; - if (svGenomeVar.getType() == Type.INV) { - peOrientation = "3to3"; - } else if (svGenomeVar.getType() == Type.DUP) { - peOrientation = "5to3"; - } else if (svGenomeVar.getType() == Type.DEL) { - peOrientation = "3to5"; - } else if (svGenomeVar.getType() == Type.BND) { - final String altBases = ctx.getAlternateAllele(alleleNo - 1).getDisplayString(); - if (altBases.startsWith("[")) { - peOrientation = "3to5"; - } else if (altBases.startsWith("]")) { - peOrientation = "5to3"; - } else if (altBases.endsWith("[")) { - peOrientation = "3to5"; - } else if (altBases.endsWith("]")) { - peOrientation = "3to3"; - } else { - peOrientation = "."; - } - } else { - peOrientation = "."; - } - result.add(contigName2, svGenomeVar.getChr2(), bin2, peOrientation); - } - // part 3 - result.add( - svGenomeVar.getPos() + 1, - pos2, - svGenomeVar.getPosCILowerBound(), - svGenomeVar.getPosCIUpperBound(), - svGenomeVar.getPos2CILowerBound(), - svGenomeVar.getPos2CIUpperBound(), - args.getCaseId(), - args.getSetId(), - variantId.toString(), - svMethod, - // TODO: improve type and sub type annotation! - svGenomeVar.getType(), - svGenomeVar.getType(), - buildInfoValue(ctx, genomeVersion, svGenomeVar)); - // optional part 4 - if (!args.getOptOutFeatures().contains(FEATURE_DBCOUNTS_COLUMNS)) { - final GenotypeCounts gtCounts = - GenotypeCounts.buildGenotypeCounts(ctx, alleleNo, pedigree, args.getRelease()); - result.add( - String.valueOf(gtCounts.numHomAlt), - String.valueOf(gtCounts.numHomRef), - String.valueOf(gtCounts.numHet), - String.valueOf(gtCounts.numHemiAlt), - String.valueOf(gtCounts.numHemiRef)); - } - // part 5 - result.add(buildGenotypeValue(ctx, alleleNo)); - - return result.build(); - } - - private String buildInfoValue( - VariantContext ctx, GenomeVersion genomeVersion, SVGenomeVariant svGenomeVar) { - final List mappings = new ArrayList<>(); - - if (!args.getOptOutFeatures().contains(FEATURE_SUPPRESS_CARRIER_COUNTS)) { - if (svGenomeVar.getType() == Type.BND) { - final String contigName2 = - (genomeVersion == GenomeVersion.HG19) - ? svGenomeVar.getChr2Name().replaceFirst("chr", "") - : svGenomeVar.getChr2Name(); - - mappings.add(tripleQuote("chr2") + ":" + tripleQuote(contigName2)); - mappings.add(tripleQuote("pos2") + ":" + svGenomeVar.getPos2()); - } - - mappings.add( - tripleQuote("backgroundCarriers") - + ":" - + ctx.getCommonInfo().getAttributeAsInt("BACKGROUND_CARRIERS", 0)); - mappings.add( - tripleQuote("affectedCarriers") - + ":" - + ctx.getCommonInfo().getAttributeAsInt("AFFECTED_CARRIERS", 0)); - mappings.add( - tripleQuote("unaffectedCarriers") - + ":" - + ctx.getCommonInfo().getAttributeAsInt("UNAFFECTED_CARRIERS", 0)); - } - - return "{" + Joiner.on(",").join(mappings) + "}"; - } - - private String buildGenotypeValue(VariantContext ctx, int alleleNo) { - final ArrayList attrs = new ArrayList<>(); - - // Add "GT" field. - final List mappings = new ArrayList<>(); - final Comparator c = Comparator.comparing(String::toString); - final List sortedSampleNames = Lists.newArrayList(ctx.getSampleNames()); - sortedSampleNames.sort(c); - for (String sample : sortedSampleNames) { - final Genotype genotype = ctx.getGenotype(sample); - final Map gts = new TreeMap<>(); - final List gtList = new ArrayList<>(); - for (Allele allele : genotype.getAlleles()) { - if (allele.isNoCall()) { - gtList.add("."); - } else if (ctx.getAlleleIndex(allele) == alleleNo) { - gtList.add("1"); - } else { - gtList.add("0"); - } - } - if (genotype.isPhased()) { - gts.put(sample, Joiner.on("|").join(gtList)); - } else { - gtList.sort(Comparator.naturalOrder()); - gts.put(sample, Joiner.on("/").join(gtList)); - } - attrs.add(Joiner.on("").join(tripleQuote("gt"), ":", tripleQuote(gts.get(sample)))); - - // FT -- genotype filters - if (genotype.hasExtendedAttribute("FT") - && genotype.getFilters() != null - && !genotype.getFilters().equals("")) { - final List fts = - Arrays.stream(genotype.getFilters().split(",")) - .map(s -> tripleQuote(s)) - .collect(Collectors.toList()); - attrs.add(Joiner.on("").join(tripleQuote("ft"), ":{", Joiner.on(",").join(fts), "}")); - } - - // GQ -- genotype quality - if (genotype.hasGQ()) { - attrs.add(Joiner.on("").join(tripleQuote("gq"), ":", genotype.getGQ())); - } - - // Additional integer attributes, currently Delly only. - boolean looksLikeDelly = ctx.getAttributeAsString("SVMETHOD", "").contains("DELLY"); - boolean looksLikeXHMM = ctx.getAttributeAsString("SVMETHOD", "").contains("XHMM"); - boolean looksLikeGcnv = ctx.getAttributeAsString("SVMETHOD", "").contains("gcnvkernel"); - if (looksLikeDelly) { - // * DR -- reference pairs - // * DV -- variant pairs - // * RR -- reference junction - // * RV -- variant junction - final int dr = Integer.parseInt(genotype.getExtendedAttribute("DR", "0").toString()); - final int dv = Integer.parseInt(genotype.getExtendedAttribute("DV", "0").toString()); - final int rr = Integer.parseInt(genotype.getExtendedAttribute("RR", "0").toString()); - final int rv = Integer.parseInt(genotype.getExtendedAttribute("RV", "0").toString()); - - // Attributes to write out. - // - // * pec - paired end coverage - // * pev - paired end variant support - // * src - split read coverage - // * srv - split read end variant support - attrs.add(Joiner.on("").join(tripleQuote("pec"), ":", String.valueOf(dr + dv))); - attrs.add(Joiner.on("").join(tripleQuote("pev"), ":", String.valueOf(dv))); - attrs.add(Joiner.on("").join(tripleQuote("src"), ":", String.valueOf(rr + rv))); - attrs.add(Joiner.on("").join(tripleQuote("srv"), ":", String.valueOf(rv))); - } else if (looksLikeXHMM) { - // * DQ -- diploid quality - // * NDQ -- non-diploid quality - // * RD -- mean normalized read depth over region - // * PL -- genotype likelihoods, for [diploid, deletion, duplication] - final float dq = Float.parseFloat(genotype.getExtendedAttribute("DQ", "0.0").toString()); - final float ndq = Float.parseFloat(genotype.getExtendedAttribute("NDQ", "0.0").toString()); - final float rd = Float.parseFloat(genotype.getExtendedAttribute("RD", "0.0").toString()); - final int pl[] = genotype.getPL(); - - // Attributes to write out. - // - // * dq -- diploid quality - // * ndq -- non-diploid quality - // * rd -- mean normalized read depth over region - // * pl -- genotype likelihoods, for [diploid, deletion, duplication] - attrs.add(Joiner.on("").join(tripleQuote("dq"), ":", String.valueOf(dq))); - attrs.add(Joiner.on("").join(tripleQuote("ndq"), ":", String.valueOf(ndq))); - attrs.add(Joiner.on("").join(tripleQuote("rd"), ":", String.valueOf(rd))); - attrs.add( - Joiner.on("").join(tripleQuote("pl"), ":[", Joiner.on(',').join(Ints.asList(pl)), "]")); - } else if (looksLikeGcnv) { - // * CN -- copy number - // * NP -- number of points in segment - // * QA -- phred-scaled quality of all points agreeing - // * QS -- phred-scaled quality of at least one point agreeing - // * QSS -- phred-scaled quality of start breakpoint - // * QSE -- phred-scaled quality of end breakpoint - final int cn = Integer.parseInt(genotype.getExtendedAttribute("CN", "0").toString()); - final int np = Integer.parseInt(genotype.getExtendedAttribute("NP", "0").toString()); - final int qa = Integer.parseInt(genotype.getExtendedAttribute("QA", "0").toString()); - final int qs = Integer.parseInt(genotype.getExtendedAttribute("QS", "0").toString()); - final int qss = Integer.parseInt(genotype.getExtendedAttribute("QSS", "0").toString()); - final int qse = Integer.parseInt(genotype.getExtendedAttribute("QSE", "0").toString()); - - // Attributes to write out. - // - // * cn -- copy number - // * np -- number of points in segment - // * qa -- phred-scaled quality of all points agreeing - // * qs -- phred-scaled quality of at least one point agreeing - // * qss -- phred-scaled quality of start breakpoint - // * qse -- phred-scaled quality of end breakpoint - attrs.add(Joiner.on("").join(tripleQuote("cn"), ":", String.valueOf(cn))); - attrs.add(Joiner.on("").join(tripleQuote("np"), ":", String.valueOf(np))); - attrs.add(Joiner.on("").join(tripleQuote("qa"), ":", String.valueOf(qa))); - attrs.add(Joiner.on("").join(tripleQuote("qs"), ":", String.valueOf(qs))); - attrs.add(Joiner.on("").join(tripleQuote("qss"), ":", String.valueOf(qss))); - attrs.add(Joiner.on("").join(tripleQuote("qse"), ":", String.valueOf(qse))); - } - - mappings.add(Joiner.on("").join(tripleQuote(sample), ":{", Joiner.on(",").join(attrs), "}")); - } - - return "{" + Joiner.on(",").join(mappings) + "}"; - } - - /** Build string list with the information for one feature effect record. */ - private List buildFeatureEffectRecord( - SVGenomeVariant svGenomeVar, - UUID variantId, - SVAnnotation refSeqAnno, - SVAnnotation ensemblAnno) { - List result = - Lists.newArrayList(args.getCaseId(), args.getSetId(), variantId.toString()); - if (refSeqAnno == null) { - result.addAll(Arrays.asList(null, null, "FALSE", "{}")); - } else { - result.addAll( - Arrays.asList( - refSeqAnno.getTranscript().getGeneID(), - refSeqAnno.getTranscript().getAccession(), - refSeqAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", - (refSeqAnno.getEffects() == null) - ? "{}" - : buildEffectsValue(refSeqAnno.getEffects()))); - } - if (ensemblAnno == null) { - result.addAll(Arrays.asList(null, null, "FALSE", "{}")); - } else { - result.addAll( - Arrays.asList( - ensemblAnno.getTranscript().getGeneID(), - ensemblAnno.getTranscript().getAccession(), - ensemblAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", - (ensemblAnno.getEffects() == null) - ? "{}" - : buildEffectsValue(ensemblAnno.getEffects()))); - } - return result; - } - private ImmutableList silentBuildAnnotations( VariantContext ctx, VariantContextAnnotator annotator) { try { @@ -901,55 +521,4 @@ private ImmutableList silentBuildAnnotations( return null; } } - - private List sortAnnos(ImmutableList refseqAnnotationsList, int i) { - if (refseqAnnotationsList == null) { - return new ArrayList<>(); - } else { - return refseqAnnotationsList - .get(i - 1) - .getAnnotations() - .stream() - .sorted( - Comparator.comparing( - SVAnnotation::getMostPathogenicVariantEffect, - (t1, t2) -> { - if (t1 == null && t2 == null) { - return 0; - } else if (t2 == null) { - return -1; - } else if (t1 == null) { - return 1; - } else { - return t1.compareTo(t2); - } - })) - .collect(Collectors.toList()); - } - } - - /** - * (Overly) simple helper for escaping {@code s}. - * - * @param s String to escape. - * @return Escaped version of {@code s}. - */ - private static String tripleQuote(String s) { - return "\"\"\"" + s.replaceAll("\"\"\"", "") + "\"\"\""; - } - - /** - * Build Postgres array expression for TSV file with variant effects. - * - * @param effects The effects to create expression for. - * @return String with the variant effects. - */ - private String buildEffectsValue(ImmutableSet effects) { - final List effectStrings = - effects - .stream() - .map(e -> "\"" + e.getSequenceOntologyTerm() + "\"") - .collect(Collectors.toList()); - return Joiner.on("").join("{", Joiner.on(',').join(effectStrings), "}"); - } } diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/init_db/DbReleaseUpdater.java b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/init_db/DbReleaseUpdater.java index 69e8210..4f5311f 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/init_db/DbReleaseUpdater.java +++ b/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/init_db/DbReleaseUpdater.java @@ -1,6 +1,7 @@ package com.github.bihealth.varfish_annotator.init_db; import com.github.bihealth.varfish_annotator.VarfishAnnotatorException; +import com.github.bihealth.varfish_annotator.db.DbConstants; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; @@ -9,9 +10,6 @@ /** Store database release infos. */ public class DbReleaseUpdater { - /** The name of the table in the database. */ - public static final String TABLE_NAME = "db_release_info"; - /** The JDBC connection. */ private final Connection conn; @@ -45,7 +43,7 @@ public void run() throws VarfishAnnotatorException { private void recreateTable() throws VarfishAnnotatorException { final String createQuery = "CREATE TABLE IF NOT EXISTS " - + TABLE_NAME + + DbConstants.TABLE_NAME + "(" + "db_name VARCHAR(100) NOT NULL PRIMARY KEY, " + "release VARCHAR(100) NOT NULL, " @@ -60,7 +58,7 @@ private void recreateTable() throws VarfishAnnotatorException { /** Update the release information */ private void updateReleaseInfos() throws VarfishAnnotatorException { final String insertQuery = - "MERGE INTO " + TABLE_NAME + " (db_name, release)" + " VALUES (?, ?)"; + "MERGE INTO " + DbConstants.TABLE_NAME + " (db_name, release)" + " VALUES (?, ?)"; for (String line : dbReleaseInfos) { String[] arr = line.split(":"); diff --git a/varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfExceptionTest.java b/varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfExceptionTest.java index 40a1c7c..ef27edf 100644 --- a/varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfExceptionTest.java +++ b/varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfExceptionTest.java @@ -1,5 +1,6 @@ package com.github.bihealth.varfish_annotator.annotate; +import com.github.bihealth.varfish_annotator.checks.IncompatibleVcfException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/varfish-annotator-core/pom.xml b/varfish-annotator-core/pom.xml index 54047d8..4631ca9 100644 --- a/varfish-annotator-core/pom.xml +++ b/varfish-annotator-core/pom.xml @@ -18,6 +18,26 @@ UTF-8 + + + com.github.samtools + htsjdk + ${htsjdk.version} + + + + de.charite.compbio + jannovar-core + ${jannovar.version} + + + + de.charite.compbio + jannovar-htsjdk + ${jannovar.version} + + + diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/VarfishAnnotatorException.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/VarfishAnnotatorException.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/VarfishAnnotatorException.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/VarfishAnnotatorException.java diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/FeatureEffectsRecordBuilder.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/FeatureEffectsRecordBuilder.java new file mode 100644 index 0000000..1a3f529 --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/FeatureEffectsRecordBuilder.java @@ -0,0 +1,151 @@ +package com.github.bihealth.varfish_annotator.annotate_svs; + +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Lists; +import de.charite.compbio.jannovar.annotation.SVAnnotation; +import de.charite.compbio.jannovar.annotation.VariantEffect; +import java.util.*; +import java.util.stream.Collectors; + +/** Helper class for building records for the {@code .feature-effects.tsv} file. */ +public class FeatureEffectsRecordBuilder { + + public class Result { + private TreeSet geneIds; + private HashMap ensemblAnnoByGene; + private HashMap refseqAnnoByGene; + + public Result( + TreeSet geneIds, + HashMap ensemblAnnoByGene, + HashMap refseqAnnoByGene) { + this.geneIds = geneIds; + this.ensemblAnnoByGene = ensemblAnnoByGene; + this.refseqAnnoByGene = refseqAnnoByGene; + } + + public TreeSet getGeneIds() { + return geneIds; + } + + public HashMap getEnsemblAnnoByGene() { + return ensemblAnnoByGene; + } + + public HashMap getRefseqAnnoByGene() { + return refseqAnnoByGene; + } + } + + String caseId; + String setId; + + public FeatureEffectsRecordBuilder(String caseId, String setId) { + this.caseId = caseId; + this.setId = setId; + } + + public Result buildAnnosByDb( + List sortedEnsemblAnnos, List sortedRefseqAnnos) { + // Collecting RefSeq and ENSEMBL annotations per gene. We collect the variants by gene for + // RefSeq in the simplest way possible (mapping to ENSEMBL gene by using HGNC annotation from + // Jannovar). Further, if either only yields one gene we force it to be the same as the + // (lexicographically first) gene of the other. + // + // Intergenic variants are skipped. + + // Start out with the RefSeq annotations + final HashMap refseqAnnoByGene = new HashMap<>(); + for (SVAnnotation annotation : sortedRefseqAnnos) { + if (annotation.getTranscript() == null) { + continue; // skip, no transcript + } + String geneId = annotation.getTranscript().getAltGeneIDs().get("ENSEMBL_GENE_ID"); + if (geneId == null) { + geneId = annotation.getTranscript().getGeneID(); + } + if (!annotation.getEffects().contains(VariantEffect.INTERGENIC_VARIANT) + && !refseqAnnoByGene.containsKey(geneId)) { + refseqAnnoByGene.put(geneId, annotation); + } + } + + // Now, for the ENSEMBL annotations + final HashMap ensemblAnnoByGene = new HashMap<>(); + for (SVAnnotation annotation : sortedEnsemblAnnos) { + if (annotation.getTranscript() == null) { + continue; // skip, no transcript + } + final String geneId = annotation.getTranscript().getGeneID(); + if (!annotation.getEffects().contains(VariantEffect.INTERGENIC_VARIANT) + && !ensemblAnnoByGene.containsKey(geneId)) { + ensemblAnnoByGene.put(geneId, annotation); + } + } + + // Match RefSeq and ENSEMBL annotations + final TreeSet geneIds = new TreeSet<>(); + if (refseqAnnoByGene.size() == 1 && ensemblAnnoByGene.size() > 0) { + geneIds.addAll(ensemblAnnoByGene.keySet()); + final String keyEnsembl = ensemblAnnoByGene.keySet().iterator().next(); + final String keyRefseq = refseqAnnoByGene.keySet().iterator().next(); + refseqAnnoByGene.put(keyEnsembl, refseqAnnoByGene.get(keyRefseq)); + } else if (refseqAnnoByGene.size() > 0 && ensemblAnnoByGene.size() == 1) { + geneIds.addAll(refseqAnnoByGene.keySet()); + final String keyEnsembl = ensemblAnnoByGene.keySet().iterator().next(); + final String keyRefseq = refseqAnnoByGene.keySet().iterator().next(); + ensemblAnnoByGene.put(keyRefseq, ensemblAnnoByGene.get(keyEnsembl)); + } else { + geneIds.addAll(ensemblAnnoByGene.keySet()); + geneIds.addAll(refseqAnnoByGene.keySet()); + } + + return new Result(geneIds, ensemblAnnoByGene, refseqAnnoByGene); + } + + public List buildRecord( + UUID variantId, SVAnnotation refSeqAnno, SVAnnotation ensemblAnno) { + List result = Lists.newArrayList(caseId, setId, variantId.toString()); + if (refSeqAnno == null) { + result.addAll(Arrays.asList(null, null, "FALSE", "{}")); + } else { + result.addAll( + Arrays.asList( + refSeqAnno.getTranscript().getGeneID(), + refSeqAnno.getTranscript().getAccession(), + refSeqAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", + (refSeqAnno.getEffects() == null) + ? "{}" + : buildEffectsValue(refSeqAnno.getEffects()))); + } + if (ensemblAnno == null) { + result.addAll(Arrays.asList(null, null, "FALSE", "{}")); + } else { + result.addAll( + Arrays.asList( + ensemblAnno.getTranscript().getGeneID(), + ensemblAnno.getTranscript().getAccession(), + ensemblAnno.getTranscript().isCoding() ? "TRUE" : "FALSE", + (ensemblAnno.getEffects() == null) + ? "{}" + : buildEffectsValue(ensemblAnno.getEffects()))); + } + return result; + } + + /** + * Build Postgres array expression for TSV file with variant effects. + * + * @param effects The effects to create expression for. + * @return String with the variant effects. + */ + private String buildEffectsValue(ImmutableSet effects) { + final List effectStrings = + effects + .stream() + .map(e -> "\"" + e.getSequenceOntologyTerm() + "\"") + .collect(Collectors.toList()); + return Joiner.on("").join("{", Joiner.on(',').join(effectStrings), "}"); + } +} diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/GtRecordBuilder.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/GtRecordBuilder.java new file mode 100644 index 0000000..513fcda --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/annotate_svs/GtRecordBuilder.java @@ -0,0 +1,349 @@ +package com.github.bihealth.varfish_annotator.annotate_svs; + +import static com.github.bihealth.varfish_annotator.utils.StringUtils.tripleQuote; + +import com.github.bihealth.varfish_annotator.data.GenomeVersion; +import com.github.bihealth.varfish_annotator.utils.GenotypeCounts; +import com.github.bihealth.varfish_annotator.utils.UcscBinning; +import com.google.common.base.Joiner; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Lists; +import com.google.common.primitives.Ints; +import de.charite.compbio.jannovar.pedigree.Pedigree; +import de.charite.compbio.jannovar.reference.SVDescription; +import de.charite.compbio.jannovar.reference.SVGenomeVariant; +import htsjdk.variant.variantcontext.Allele; +import htsjdk.variant.variantcontext.Genotype; +import htsjdk.variant.variantcontext.VariantContext; +import java.util.*; +import java.util.stream.Collectors; + +/** Helper class for building record for {@code .gts.tsv} file. */ +public class GtRecordBuilder { + + public static final String FEATURE_CHROM2_COLUMNS = "chrom2-columns"; + public static final String FEATURE_DBCOUNTS_COLUMNS = "dbcounts-columns"; + public static final String FEATURE_SUPPRESS_CARRIER_COUNTS = "suppress-carriers-in-info"; + + /** Header fields for the SV and genotype file (part 1). */ + private static final ImmutableList HEADERS_GT_PART_1 = + ImmutableList.of("release", "chromosome", "chromosome_no", "bin"); + /** Header fields for the SV and genotype file (part 2, optional). */ + private static final ImmutableList HEADERS_GT_PART_2 = + ImmutableList.of("chromosome2", "chromosome_no2", "bin2", "pe_orientation"); + /** Header fields for the SV and genotype file (part 3). */ + private static final ImmutableList HEADERS_GT_PART_3 = + ImmutableList.of( + "start", + "end", + "start_ci_left", + "start_ci_right", + "end_ci_left", + "end_ci_right", + "case_id", + "set_id", + "sv_uuid", + "caller", + "sv_type", + "sv_sub_type", + "info"); + /** Header fields for the SV and genotype file (part 4, optional). */ + private static final ImmutableList HEADERS_GT_PART_4 = + ImmutableList.of("num_hom_alt", "num_hom_ref", "num_het", "num_hemi_alt", "num_hemi_ref"); + /** Header fields for the SV and genotype file (part 5). */ + private static final ImmutableList HEADERS_GT_PART_5 = ImmutableList.of("genotype"); + + String release; + String defaultSvMethod; + String optOutFeatures; + String caseId; + String setId; + Pedigree pedigree; + + public GtRecordBuilder( + String release, + String defaultSvMethod, + String optOutFeatures, + String caseId, + String setId, + Pedigree pedigree) { + this.release = release; + this.defaultSvMethod = defaultSvMethod; + this.optOutFeatures = optOutFeatures; + this.caseId = caseId; + this.setId = setId; + this.pedigree = pedigree; + } + + public List getHeaders() { + // (Conditionally) write genotype header. + final List headers = new ArrayList<>(); + headers.addAll(HEADERS_GT_PART_1); + if (!optOutFeatures.contains(GtRecordBuilder.FEATURE_CHROM2_COLUMNS)) { + headers.addAll(HEADERS_GT_PART_2); + } + headers.addAll(HEADERS_GT_PART_3); + if (!optOutFeatures.contains(GtRecordBuilder.FEATURE_DBCOUNTS_COLUMNS)) { + headers.addAll(HEADERS_GT_PART_4); + } + headers.addAll(HEADERS_GT_PART_5); + return headers; + } + + public List buildRecord( + UUID variantId, + SVGenomeVariant svGenomeVar, + VariantContext ctx, + GenomeVersion genomeVersion, + int alleleNo) { + String svMethod = ctx.getCommonInfo().getAttributeAsString("SVMETHOD", null); + if (svMethod == null) { + svMethod = defaultSvMethod == null ? "." : defaultSvMethod; + } + + final int bin; + final int bin2; + final int pos2; + if (svGenomeVar.getType() == SVDescription.Type.BND) { + pos2 = svGenomeVar.getPos() + 1; + bin = UcscBinning.getContainingBin(svGenomeVar.getPos(), svGenomeVar.getPos() + 1); + bin2 = UcscBinning.getContainingBin(pos2, pos2 + 1); + } else { + pos2 = svGenomeVar.getPos2(); + bin = UcscBinning.getContainingBin(svGenomeVar.getPos(), pos2); + bin2 = bin; + } + + final String contigName = + (genomeVersion == GenomeVersion.HG19) + ? svGenomeVar.getChrName().replaceFirst("chr", "") + : svGenomeVar.getChrName(); + final String contigName2 = + (genomeVersion == GenomeVersion.HG19) + ? svGenomeVar.getChr2Name().replaceFirst("chr", "") + : svGenomeVar.getChr2Name(); + + final ImmutableList.Builder result = ImmutableList.builder(); + // part 1 + result.add(release, contigName, svGenomeVar.getChr(), bin); + // optional part 2 + if (!optOutFeatures.contains(FEATURE_CHROM2_COLUMNS)) { + final String peOrientation; + if (svGenomeVar.getType() == SVDescription.Type.INV) { + peOrientation = "3to3"; + } else if (svGenomeVar.getType() == SVDescription.Type.DUP) { + peOrientation = "5to3"; + } else if (svGenomeVar.getType() == SVDescription.Type.DEL) { + peOrientation = "3to5"; + } else if (svGenomeVar.getType() == SVDescription.Type.BND) { + final String altBases = ctx.getAlternateAllele(alleleNo - 1).getDisplayString(); + if (altBases.startsWith("[")) { + peOrientation = "3to5"; + } else if (altBases.startsWith("]")) { + peOrientation = "5to3"; + } else if (altBases.endsWith("[")) { + peOrientation = "3to5"; + } else if (altBases.endsWith("]")) { + peOrientation = "3to3"; + } else { + peOrientation = "."; + } + } else { + peOrientation = "."; + } + result.add(contigName2, svGenomeVar.getChr2(), bin2, peOrientation); + } + // part 3 + result.add( + svGenomeVar.getPos() + 1, + pos2, + svGenomeVar.getPosCILowerBound(), + svGenomeVar.getPosCIUpperBound(), + svGenomeVar.getPos2CILowerBound(), + svGenomeVar.getPos2CIUpperBound(), + caseId, + setId, + variantId.toString(), + svMethod, + // TODO: improve type and sub type annotation! + svGenomeVar.getType(), + svGenomeVar.getType(), + buildInfoValue(ctx, genomeVersion, svGenomeVar, optOutFeatures)); + // optional part 4 + if (!optOutFeatures.contains(FEATURE_DBCOUNTS_COLUMNS)) { + final GenotypeCounts gtCounts = + GenotypeCounts.buildGenotypeCounts(ctx, alleleNo, pedigree, release); + result.add( + String.valueOf(gtCounts.numHomAlt), + String.valueOf(gtCounts.numHomRef), + String.valueOf(gtCounts.numHet), + String.valueOf(gtCounts.numHemiAlt), + String.valueOf(gtCounts.numHemiRef)); + } + // part 5 + result.add(buildGenotypeValue(ctx, alleleNo)); + + return result.build(); + } + + private static String buildInfoValue( + VariantContext ctx, + GenomeVersion genomeVersion, + SVGenomeVariant svGenomeVar, + String optOutFeatures) { + final List mappings = new ArrayList<>(); + + if (!optOutFeatures.contains(FEATURE_SUPPRESS_CARRIER_COUNTS)) { + if (svGenomeVar.getType() == SVDescription.Type.BND) { + final String contigName2 = + (genomeVersion == GenomeVersion.HG19) + ? svGenomeVar.getChr2Name().replaceFirst("chr", "") + : svGenomeVar.getChr2Name(); + + mappings.add(tripleQuote("chr2") + ":" + tripleQuote(contigName2)); + mappings.add(tripleQuote("pos2") + ":" + svGenomeVar.getPos2()); + } + + mappings.add( + tripleQuote("backgroundCarriers") + + ":" + + ctx.getCommonInfo().getAttributeAsInt("BACKGROUND_CARRIERS", 0)); + mappings.add( + tripleQuote("affectedCarriers") + + ":" + + ctx.getCommonInfo().getAttributeAsInt("AFFECTED_CARRIERS", 0)); + mappings.add( + tripleQuote("unaffectedCarriers") + + ":" + + ctx.getCommonInfo().getAttributeAsInt("UNAFFECTED_CARRIERS", 0)); + } + + return "{" + Joiner.on(",").join(mappings) + "}"; + } + + private static String buildGenotypeValue(VariantContext ctx, int alleleNo) { + final ArrayList attrs = new ArrayList<>(); + + // Add "GT" field. + final List mappings = new ArrayList<>(); + final Comparator c = Comparator.comparing(String::toString); + final List sortedSampleNames = Lists.newArrayList(ctx.getSampleNames()); + sortedSampleNames.sort(c); + for (String sample : sortedSampleNames) { + final Genotype genotype = ctx.getGenotype(sample); + final Map gts = new TreeMap<>(); + final List gtList = new ArrayList<>(); + for (Allele allele : genotype.getAlleles()) { + if (allele.isNoCall()) { + gtList.add("."); + } else if (ctx.getAlleleIndex(allele) == alleleNo) { + gtList.add("1"); + } else { + gtList.add("0"); + } + } + if (genotype.isPhased()) { + gts.put(sample, Joiner.on("|").join(gtList)); + } else { + gtList.sort(Comparator.naturalOrder()); + gts.put(sample, Joiner.on("/").join(gtList)); + } + attrs.add(Joiner.on("").join(tripleQuote("gt"), ":", tripleQuote(gts.get(sample)))); + + // FT -- genotype filters + if (genotype.hasExtendedAttribute("FT") + && genotype.getFilters() != null + && !genotype.getFilters().equals("")) { + final List fts = + Arrays.stream(genotype.getFilters().split(",")) + .map(s -> tripleQuote(s)) + .collect(Collectors.toList()); + attrs.add(Joiner.on("").join(tripleQuote("ft"), ":{", Joiner.on(",").join(fts), "}")); + } + + // GQ -- genotype quality + if (genotype.hasGQ()) { + attrs.add(Joiner.on("").join(tripleQuote("gq"), ":", genotype.getGQ())); + } + + // Additional integer attributes, currently Delly only. + boolean looksLikeDelly = ctx.getAttributeAsString("SVMETHOD", "").contains("DELLY"); + boolean looksLikeXHMM = ctx.getAttributeAsString("SVMETHOD", "").contains("XHMM"); + boolean looksLikeGcnv = ctx.getAttributeAsString("SVMETHOD", "").contains("gcnvkernel"); + if (looksLikeDelly) { + // * DR -- reference pairs + // * DV -- variant pairs + // * RR -- reference junction + // * RV -- variant junction + final int dr = Integer.parseInt(genotype.getExtendedAttribute("DR", "0").toString()); + final int dv = Integer.parseInt(genotype.getExtendedAttribute("DV", "0").toString()); + final int rr = Integer.parseInt(genotype.getExtendedAttribute("RR", "0").toString()); + final int rv = Integer.parseInt(genotype.getExtendedAttribute("RV", "0").toString()); + + // Attributes to write out. + // + // * pec - paired end coverage + // * pev - paired end variant support + // * src - split read coverage + // * srv - split read end variant support + attrs.add(Joiner.on("").join(tripleQuote("pec"), ":", String.valueOf(dr + dv))); + attrs.add(Joiner.on("").join(tripleQuote("pev"), ":", String.valueOf(dv))); + attrs.add(Joiner.on("").join(tripleQuote("src"), ":", String.valueOf(rr + rv))); + attrs.add(Joiner.on("").join(tripleQuote("srv"), ":", String.valueOf(rv))); + } else if (looksLikeXHMM) { + // * DQ -- diploid quality + // * NDQ -- non-diploid quality + // * RD -- mean normalized read depth over region + // * PL -- genotype likelihoods, for [diploid, deletion, duplication] + final float dq = Float.parseFloat(genotype.getExtendedAttribute("DQ", "0.0").toString()); + final float ndq = Float.parseFloat(genotype.getExtendedAttribute("NDQ", "0.0").toString()); + final float rd = Float.parseFloat(genotype.getExtendedAttribute("RD", "0.0").toString()); + final int pl[] = genotype.getPL(); + + // Attributes to write out. + // + // * dq -- diploid quality + // * ndq -- non-diploid quality + // * rd -- mean normalized read depth over region + // * pl -- genotype likelihoods, for [diploid, deletion, duplication] + attrs.add(Joiner.on("").join(tripleQuote("dq"), ":", String.valueOf(dq))); + attrs.add(Joiner.on("").join(tripleQuote("ndq"), ":", String.valueOf(ndq))); + attrs.add(Joiner.on("").join(tripleQuote("rd"), ":", String.valueOf(rd))); + attrs.add( + Joiner.on("").join(tripleQuote("pl"), ":[", Joiner.on(',').join(Ints.asList(pl)), "]")); + } else if (looksLikeGcnv) { + // * CN -- copy number + // * NP -- number of points in segment + // * QA -- phred-scaled quality of all points agreeing + // * QS -- phred-scaled quality of at least one point agreeing + // * QSS -- phred-scaled quality of start breakpoint + // * QSE -- phred-scaled quality of end breakpoint + final int cn = Integer.parseInt(genotype.getExtendedAttribute("CN", "0").toString()); + final int np = Integer.parseInt(genotype.getExtendedAttribute("NP", "0").toString()); + final int qa = Integer.parseInt(genotype.getExtendedAttribute("QA", "0").toString()); + final int qs = Integer.parseInt(genotype.getExtendedAttribute("QS", "0").toString()); + final int qss = Integer.parseInt(genotype.getExtendedAttribute("QSS", "0").toString()); + final int qse = Integer.parseInt(genotype.getExtendedAttribute("QSE", "0").toString()); + + // Attributes to write out. + // + // * cn -- copy number + // * np -- number of points in segment + // * qa -- phred-scaled quality of all points agreeing + // * qs -- phred-scaled quality of at least one point agreeing + // * qss -- phred-scaled quality of start breakpoint + // * qse -- phred-scaled quality of end breakpoint + attrs.add(Joiner.on("").join(tripleQuote("cn"), ":", String.valueOf(cn))); + attrs.add(Joiner.on("").join(tripleQuote("np"), ":", String.valueOf(np))); + attrs.add(Joiner.on("").join(tripleQuote("qa"), ":", String.valueOf(qa))); + attrs.add(Joiner.on("").join(tripleQuote("qs"), ":", String.valueOf(qs))); + attrs.add(Joiner.on("").join(tripleQuote("qss"), ":", String.valueOf(qss))); + attrs.add(Joiner.on("").join(tripleQuote("qse"), ":", String.valueOf(qse))); + } + + mappings.add(Joiner.on("").join(tripleQuote(sample), ":{", Joiner.on(",").join(attrs), "}")); + } + + return "{" + Joiner.on(",").join(mappings) + "}"; + } +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/checks/IncompatibleVcfException.java similarity index 63% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/checks/IncompatibleVcfException.java index 580b453..f462da6 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/IncompatibleVcfException.java +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/checks/IncompatibleVcfException.java @@ -1,4 +1,4 @@ -package com.github.bihealth.varfish_annotator.annotate; +package com.github.bihealth.varfish_annotator.checks; /** Raised on incompatible VCF files. */ public class IncompatibleVcfException extends Exception { @@ -6,4 +6,6 @@ public class IncompatibleVcfException extends Exception { public IncompatibleVcfException(String message) { super(message); } + + private static final long serialVersionUID = 0; } diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/checks/VcfCompatibilityChecker.java similarity index 95% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/checks/VcfCompatibilityChecker.java index ed74520..daf7ecd 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/VcfCompatibilityChecker.java +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/checks/VcfCompatibilityChecker.java @@ -1,5 +1,6 @@ -package com.github.bihealth.varfish_annotator.annotate; +package com.github.bihealth.varfish_annotator.checks; +import com.github.bihealth.varfish_annotator.data.GenomeVersion; import htsjdk.samtools.SAMSequenceRecord; import htsjdk.variant.vcf.VCFContigHeaderLine; import htsjdk.variant.vcf.VCFFileReader; @@ -36,7 +37,7 @@ public VcfCompatibilityChecker(VCFFileReader reader) { *

Throws an exception in case of problems and otherwise just returns. Will print a warning to * stderr if no reliable decision could be made. * - * @param requiredRElease The required release. + * @param requiredRelease The required release. * @throws IncompatibleVcfException If the VCF file looks to be incompatible. */ public void check(String requiredRelease) throws IncompatibleVcfException { diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/data/GenomeVersion.java similarity index 65% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/data/GenomeVersion.java index 854192a..4b139fc 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/annotate/GenomeVersion.java +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/data/GenomeVersion.java @@ -1,4 +1,4 @@ -package com.github.bihealth.varfish_annotator.annotate; +package com.github.bihealth.varfish_annotator.data; /** Enumerate known genome versions. */ public enum GenomeVersion { diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/data/VcfConstants.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/data/VcfConstants.java new file mode 100644 index 0000000..cc671a8 --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/data/VcfConstants.java @@ -0,0 +1,72 @@ +package com.github.bihealth.varfish_annotator.data; + +import com.google.common.collect.ImmutableList; + +/** Constants to use for creating/filling VCF. */ +public final class VcfConstants { + + /** Name of table with ExAC variants. */ + public static final String EXAC_PREFIX = "exac"; + + /** Name of table with gnomAD exomes variants. */ + public static final String GNOMAD_EXOMES_PREFIX = "gnomad_exome"; + + /** Name of table with gnomAD genomes variants. */ + public static final String GNOMAD_GENOMES_PREFIX = "gnomad_genome"; + + /** Name of table with Thousand Genomes variants. */ + public static final String THOUSAND_GENOMES_PREFIX = "thousand_genomes"; + + /** Header fields for the genotypes file. */ + public static final ImmutableList HEADERS_GT = + ImmutableList.of( + "release", + "chromosome", + "chromosome_no", + "start", + "end", + "bin", + "reference", + "alternative", + "var_type", + "case_id", + "set_id", + "info", + "genotype", + "num_hom_alt", + "num_hom_ref", + "num_het", + "num_hemi_alt", + "num_hemi_ref", + "in_clinvar", + "exac_frequency", + "exac_homozygous", + "exac_heterozygous", + "exac_hemizygous", + "thousand_genomes_frequency", + "thousand_genomes_homozygous", + "thousand_genomes_heterozygous", + "thousand_genomes_hemizygous", + "gnomad_exomes_frequency", + "gnomad_exomes_homozygous", + "gnomad_exomes_heterozygous", + "gnomad_exomes_hemizygous", + "gnomad_genomes_frequency", + "gnomad_genomes_homozygous", + "gnomad_genomes_heterozygous", + "gnomad_genomes_hemizygous", + "refseq_gene_id", + "refseq_transcript_id", + "refseq_transcript_coding", + "refseq_hgvs_c", + "refseq_hgvs_p", + "refseq_effect", + "refseq_exon_dist", + "ensembl_gene_id", + "ensembl_transcript_id", + "ensembl_transcript_coding", + "ensembl_hgvs_c", + "ensembl_hgvs_p", + "ensembl_effect", + "ensembl_exon_dist"); +} diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbConstants.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbConstants.java new file mode 100644 index 0000000..717fefc --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbConstants.java @@ -0,0 +1,8 @@ +package com.github.bihealth.varfish_annotator.db; + +/** Constants for database. */ +public class DbConstants { + + /** The name of the table in the database. */ + public static final String TABLE_NAME = "db_release_info"; +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/DbInfo.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbInfo.java similarity index 96% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/DbInfo.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbInfo.java index 6ea59f6..abca8eb 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/DbInfo.java +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbInfo.java @@ -1,4 +1,4 @@ -package com.github.bihealth.varfish_annotator; +package com.github.bihealth.varfish_annotator.db; /** Information from variant from DB. */ public class DbInfo { diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbInfoWriterHelper.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbInfoWriterHelper.java new file mode 100644 index 0000000..eef195b --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/db/DbInfoWriterHelper.java @@ -0,0 +1,57 @@ +package com.github.bihealth.varfish_annotator.db; + +import com.github.bihealth.varfish_annotator.VarfishAnnotatorException; +import java.io.BufferedWriter; +import java.io.IOException; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; + +/** Helper class for writing out information about database to output TSV file. */ +public class DbInfoWriterHelper { + + /** + * Write information about used databases to TSV file. + * + * @param conn Database connection to get the information from. + * @param dbInfoWriter Writer for database information. + * @param genomeRelease The genome release to write out. + * @param classForVersion The Java class to use for extracting the version. + * @throws VarfishAnnotatorException in case of problems + */ + public void writeDbInfos( + Connection conn, BufferedWriter dbInfoWriter, String genomeRelease, Class classForVersion) + throws VarfishAnnotatorException { + try { + dbInfoWriter.write("genomebuild\tdb_name\trelease\n"); + } catch (IOException e) { + throw new VarfishAnnotatorException("Could not write out headers", e); + } + + final String query = + "SELECT db_name, release FROM " + DbConstants.TABLE_NAME + " ORDER BY db_name"; + try { + final PreparedStatement stmt = conn.prepareStatement(query); + + try (ResultSet rs = stmt.executeQuery()) { + while (true) { + if (!rs.next()) { + return; + } + final String versionString; + if (rs.getString(1).equals("varfish-annotator")) { + versionString = classForVersion.getPackage().getSpecificationVersion(); + } else { + versionString = rs.getString(2); + } + dbInfoWriter.write(genomeRelease + "\t" + rs.getString(1) + "\t" + versionString + "\n"); + } + } + } catch (SQLException e) { + throw new VarfishAnnotatorException("Problem with querying database", e); + } catch (IOException e) { + throw new VarfishAnnotatorException("Could not write TSV info", e); + } + } +} diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/AnnoSorting.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/AnnoSorting.java new file mode 100644 index 0000000..efefc34 --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/AnnoSorting.java @@ -0,0 +1,65 @@ +package com.github.bihealth.varfish_annotator.utils; + +import com.google.common.collect.ImmutableList; +import de.charite.compbio.jannovar.annotation.*; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Collectors; + +public class AnnoSorting { + + public static List sortAnnotations( + ImmutableList refseqAnnotationsList, int i) { + if (refseqAnnotationsList == null) { + return new ArrayList<>(); + } else { + return refseqAnnotationsList + .get(i - 1) + .getAnnotations() + .stream() + .sorted( + Comparator.comparing( + Annotation::getMostPathogenicVarType, + (t1, t2) -> { + if (t1 == null && t2 == null) { + return 0; + } else if (t2 == null) { + return -1; + } else if (t1 == null) { + return 1; + } else { + return t1.compareTo(t2); + } + })) + .collect(Collectors.toList()); + } + } + + public static List sortSvAnnos( + ImmutableList refseqAnnotationsList, int i) { + if (refseqAnnotationsList == null) { + return new ArrayList<>(); + } else { + return refseqAnnotationsList + .get(i - 1) + .getAnnotations() + .stream() + .sorted( + Comparator.comparing( + SVAnnotation::getMostPathogenicVariantEffect, + (t1, t2) -> { + if (t1 == null && t2 == null) { + return 0; + } else if (t2 == null) { + return -1; + } else if (t1 == null) { + return 1; + } else { + return t1.compareTo(t2); + } + })) + .collect(Collectors.toList()); + } + } +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/DatabaseSelfTest.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/DatabaseSelfTest.java similarity index 81% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/DatabaseSelfTest.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/DatabaseSelfTest.java index 799d650..934b9c2 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/DatabaseSelfTest.java +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/DatabaseSelfTest.java @@ -1,6 +1,6 @@ package com.github.bihealth.varfish_annotator.utils; -import com.github.bihealth.varfish_annotator.annotate.AnnotateVcf; +import com.github.bihealth.varfish_annotator.data.VcfConstants; import com.google.common.collect.ImmutableList; import java.sql.Connection; import java.sql.PreparedStatement; @@ -34,13 +34,13 @@ public void selfTest(String release, boolean checkChr1Only, boolean checkChr22On chromNames = CHROMS; } if ("grch37".equals(release.toLowerCase())) { - selfTestDb(AnnotateVcf.EXAC_PREFIX, "GRCh37", "", chromNames); - selfTestDb(AnnotateVcf.GNOMAD_EXOMES_PREFIX, "GRCh37", "", chromNames); - selfTestDb(AnnotateVcf.GNOMAD_GENOMES_PREFIX, "GRCh37", "", chromNames); - selfTestDb(AnnotateVcf.THOUSAND_GENOMES_PREFIX, "GRCh37", "", chromNames); + selfTestDb(VcfConstants.EXAC_PREFIX, "GRCh37", "", chromNames); + selfTestDb(VcfConstants.GNOMAD_EXOMES_PREFIX, "GRCh37", "", chromNames); + selfTestDb(VcfConstants.GNOMAD_GENOMES_PREFIX, "GRCh37", "", chromNames); + selfTestDb(VcfConstants.THOUSAND_GENOMES_PREFIX, "GRCh37", "", chromNames); } else if ("grch38".equals(release.toLowerCase())) { - selfTestDb(AnnotateVcf.GNOMAD_EXOMES_PREFIX, "GRCh38", "chr", chromNames); - selfTestDb(AnnotateVcf.GNOMAD_GENOMES_PREFIX, "GRCh38", "chr", chromNames); + selfTestDb(VcfConstants.GNOMAD_EXOMES_PREFIX, "GRCh38", "chr", chromNames); + selfTestDb(VcfConstants.GNOMAD_GENOMES_PREFIX, "GRCh38", "chr", chromNames); } else { throw new RuntimeException("Invalid release: " + release); } @@ -58,7 +58,7 @@ private void selfTestDb( for (String chrom : chromNames) { final String chromName = chrPrefix + chrom; if ("grch37".equalsIgnoreCase(release) - && tablePrefix.equals(AnnotateVcf.GNOMAD_GENOMES_PREFIX) + && tablePrefix.equals(VcfConstants.GNOMAD_GENOMES_PREFIX) && chromName.equals("Y")) { continue; // skip chrY for gnomAD genomes on GRCh37 } diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/GenotypeCounts.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/GenotypeCounts.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/GenotypeCounts.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/GenotypeCounts.java diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/GzipUtil.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/GzipUtil.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/GzipUtil.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/GzipUtil.java diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/PseudoAutosomalRegionHelper.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/PseudoAutosomalRegionHelper.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/PseudoAutosomalRegionHelper.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/PseudoAutosomalRegionHelper.java diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/SelfTestFailedException.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/SelfTestFailedException.java similarity index 86% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/SelfTestFailedException.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/SelfTestFailedException.java index 50989c1..cddbb52 100644 --- a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/SelfTestFailedException.java +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/SelfTestFailedException.java @@ -10,4 +10,6 @@ public SelfTestFailedException(String message, Throwable cause) { public SelfTestFailedException(String message) { super(message); } + + private static final long serialVersionUID = 0; } diff --git a/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/StringUtils.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/StringUtils.java new file mode 100644 index 0000000..c4fd2d0 --- /dev/null +++ b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/StringUtils.java @@ -0,0 +1,14 @@ +package com.github.bihealth.varfish_annotator.utils; + +public class StringUtils { + + /** + * (Overly) simple helper for escaping {@code s}. + * + * @param s String to escape. + * @return Escaped version of {@code s}. + */ + public static String tripleQuote(String s) { + return "\"\"\"" + s.replaceAll("\"\"\"", "") + "\"\"\""; + } +} diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/UcscBinning.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/UcscBinning.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/UcscBinning.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/UcscBinning.java diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantDescription.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantDescription.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantDescription.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantDescription.java diff --git a/varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantNormalizer.java b/varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantNormalizer.java similarity index 100% rename from varfish-annotator-cli/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantNormalizer.java rename to varfish-annotator-core/src/main/java/com/github/bihealth/varfish_annotator/utils/VariantNormalizer.java diff --git a/varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/VarFishAnnotatorExceptionTest.java b/varfish-annotator-core/src/test/java/com/github/bihealth/varfish_annotator/VarFishAnnotatorExceptionTest.java similarity index 100% rename from varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/VarFishAnnotatorExceptionTest.java rename to varfish-annotator-core/src/test/java/com/github/bihealth/varfish_annotator/VarFishAnnotatorExceptionTest.java diff --git a/varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/utils/VariantDescriptionTest.java b/varfish-annotator-core/src/test/java/com/github/bihealth/varfish_annotator/utils/VariantDescriptionTest.java similarity index 100% rename from varfish-annotator-cli/src/test/java/com/github/bihealth/varfish_annotator/utils/VariantDescriptionTest.java rename to varfish-annotator-core/src/test/java/com/github/bihealth/varfish_annotator/utils/VariantDescriptionTest.java