diff --git a/src/hgvs/extras/babelfish.py b/src/hgvs/extras/babelfish.py index 8435dd18..a5ef0b8d 100644 --- a/src/hgvs/extras/babelfish.py +++ b/src/hgvs/extras/babelfish.py @@ -1,8 +1,15 @@ """translate between HGVS and other formats""" +import os -import bioutils.assemblies - +import hgvs import hgvs.normalizer +from bioutils.assemblies import make_ac_name_map, make_name_ac_map +from bioutils.sequences import reverse_complement +from hgvs.edit import NARefAlt +from hgvs.location import Interval, SimplePosition +from hgvs.normalizer import Normalizer +from hgvs.posedit import PosEdit +from hgvs.sequencevariant import SequenceVariant def _as_interbase(posedit): @@ -18,11 +25,11 @@ def _as_interbase(posedit): class Babelfish: def __init__(self, hdp, assembly_name): + self.assembly_name = assembly_name self.hdp = hdp self.hn = hgvs.normalizer.Normalizer(hdp, cross_boundaries=False, shuffle_direction=5, validate=False) - self.ac_to_chr_name_map = { - sr["refseq_ac"]: sr["name"] for sr in bioutils.assemblies.get_assembly("GRCh38")["sequences"] - } + self.ac_to_name_map = make_ac_name_map(assembly_name) + self.name_to_ac_map = make_name_ac_map(assembly_name) def hgvs_to_vcf(self, var_g): """**EXPERIMENTAL** @@ -30,9 +37,6 @@ def hgvs_to_vcf(self, var_g): converts a single hgvs allele to (chr, pos, ref, alt) using the given assembly_name. The chr name uses official chromosome name (i.e., without a "chr" prefix). - - Returns None for non-variation (e.g., NC_000006.12:g.49949407=) - """ if var_g.type != "g": @@ -42,7 +46,7 @@ def hgvs_to_vcf(self, var_g): (start_i, end_i) = _as_interbase(vleft.posedit) - chr = self.ac_to_chr_name_map[vleft.ac] + chrom = self.ac_to_name_map[vleft.ac] typ = vleft.posedit.edit.type @@ -50,25 +54,55 @@ def hgvs_to_vcf(self, var_g): start_i -= 1 alt = self.hdp.seqfetcher.fetch_seq(vleft.ac, start_i, end_i) ref = alt[0] - end_i = start_i - return (chr, start_i + 1, ref, alt, typ) - - if vleft.posedit.edit.ref == vleft.posedit.edit.alt: - return None - - alt = vleft.posedit.edit.alt or "" - - if end_i - start_i == 1 and vleft.posedit.length_change == 0: - # SNVs aren't left anchored + elif typ == 'inv': ref = vleft.posedit.edit.ref - + alt = reverse_complement(ref) else: - # everything else is left-anchored - start_i -= 1 - ref = self.hdp.seqfetcher.fetch_seq(vleft.ac, start_i, end_i) - alt = ref[0] + alt - - return (chr, start_i + 1, ref, alt, typ) + alt = vleft.posedit.edit.alt or "" + + if typ in ('del', 'ins'): # Left anchored + start_i -= 1 + ref = self.hdp.seqfetcher.fetch_seq(vleft.ac, start_i, end_i) + alt = ref[0] + alt + else: + ref = vleft.posedit.edit.ref + if ref == alt: + alt = '.' + return chrom, start_i + 1, ref, alt, typ + + def vcf_to_g_hgvs(self, chrom, position, ref, alt): + ac = self.name_to_ac_map[chrom] + + # Strip common prefix + if len(alt) > 1 and len(ref) > 1: + pfx = os.path.commonprefix([ref, alt]) + lp = len(pfx) + if lp > 0: + ref = ref[lp:] + alt = alt[lp:] + position += lp + + if ref == '': # Insert + # Insert uses coordinates around the insert point. + start = position - 1 + end = position + else: + start = position + end = position + len(ref) - 1 + + if alt == '.': + alt = ref + + var_g = SequenceVariant(ac=ac, + type='g', + posedit=PosEdit(Interval(start=SimplePosition(start), + end=SimplePosition(end), + uncertain=False), + NARefAlt(ref=ref or None, + alt=alt or None, + uncertain=False))) + n = Normalizer(self.hdp) + return n.normalize(var_g) if __name__ == "__main__": diff --git a/tests/data/cache-py3.hdp b/tests/data/cache-py3.hdp index 2a9a9cd6..e7ed304c 100644 Binary files a/tests/data/cache-py3.hdp and b/tests/data/cache-py3.hdp differ diff --git a/tests/test_hgvs_extras_babelfish.py b/tests/test_hgvs_extras_babelfish.py index 5743dfbe..538b249b 100644 --- a/tests/test_hgvs_extras_babelfish.py +++ b/tests/test_hgvs_extras_babelfish.py @@ -1,3 +1,43 @@ +NORM_HGVS_VCF = [ + # Columns are: (normed-HGVS, non-normalized HGVS, VCF coordinates, non-norm VCF) + + # no-op + ("NC_000006.12:g.49949407=", [], + ("6", 49949407, "A", ".", 'identity'), [("6", 49949407, "A", "A", 'identity')]), + + # snv + ("NC_000006.12:g.49949407A>T", + [], + # was ("6", 49949406, "AA", "AT", "sub") however VT parsimony rules say it should be those below + ("6", 49949407, "A", "T", "sub"), []), + # delins + ("NC_000006.12:g.49949413_49949414delinsCC", + [], + # This was ("6", 49949412, "AAA", "ACC", "delins") - however VT parsimony rules say it should be those below + ("6", 49949413, "AA", "CC", "delins"), []), + + # del, no shift + ("NC_000006.12:g.49949415del", [], ("6", 49949414, "AT", "A", "del"), []), + + # del, w/ shift + ("NC_000006.12:g.49949414del", ["NC_000006.12:g.49949413del"], + ("6", 49949409, "GA", "G", "del"), []), + ("NC_000006.12:g.49949413_49949414del", [], ("6", 49949409, "GAA", "G", "del"), []), + + # ins, no shift + ("NC_000006.12:g.49949413_49949414insC", [], ("6", 49949413, "A", "AC", "ins"), []), + ("NC_000006.12:g.49949414_49949415insCC", [], ("6", 49949414, "A", "ACC", "ins"), []), + + # ins/dup, w/shift + ("NC_000006.12:g.49949414dup", + ["NC_000006.12:g.49949413_49949414insA", "NC_000006.12:g.49949414_49949415insA"], + ("6", 49949409, "G", "GA", "dup"), []), + ("NC_000006.12:g.49949413_49949414dup", + ["NC_000006.12:g.49949414_49949415insAA"], + ("6", 49949409, "G", "GAA", "dup"), []), +] + + def test_hgvs_to_vcf(parser, babelfish38): """ 49949___ 400 410 420 @@ -9,28 +49,19 @@ def test_hgvs_to_vcf(parser, babelfish38): def _h2v(h): return babelfish38.hgvs_to_vcf(parser.parse(h)) - # no-op - assert _h2v("NC_000006.12:g.49949407=") == None - - # snv - assert _h2v("NC_000006.12:g.49949407A>T") == ("6", 49949406, "AA", "AT", "sub") - - # delins - assert _h2v("NC_000006.12:g.49949413_49949414delinsCC") == ("6", 49949412, "AAA", "ACC", "delins") - - # del, no shift - assert _h2v("NC_000006.12:g.49949415del") == ("6", 49949414, "AT", "A", "del") + for norm_hgvs_string, alt_hgvs, expected_variant_coordinate, _ in NORM_HGVS_VCF: + for hgvs_string in [norm_hgvs_string] + alt_hgvs: + variant_coordinates = _h2v(hgvs_string) + assert variant_coordinates == expected_variant_coordinate - # del, w/ shift - assert _h2v("NC_000006.12:g.49949413del") == ("6", 49949409, "GA", "G", "del") - assert _h2v("NC_000006.12:g.49949414del") == ("6", 49949409, "GA", "G", "del") - assert _h2v("NC_000006.12:g.49949413_49949414del") == ("6", 49949409, "GAA", "G", "del") - # ins, no shift - assert _h2v("NC_000006.12:g.49949413_49949414insC") == ("6", 49949413, "A", "AC", "ins") - assert _h2v("NC_000006.12:g.49949414_49949415insCC") == ("6", 49949414, "A", "ACC", "ins") +def test_vcf_to_hgvs(parser, babelfish38): + def _v2h(*v): + return babelfish38.vcf_to_g_hgvs(*v) - # ins/dup, w/shift - assert _h2v("NC_000006.12:g.49949413_49949414insA") == ("6", 49949409, "G", "GA", "dup") - assert _h2v("NC_000006.12:g.49949414_49949415insA") == ("6", 49949409, "G", "GA", "dup") - assert _h2v("NC_000006.12:g.49949414_49949415insAA") == ("6", 49949409, "G", "GAA", "dup") + for expected_hgvs_string, _, norm_variant_coordinate, alt_variant_coordinate in NORM_HGVS_VCF: + for variant_coordinate in [norm_variant_coordinate] + alt_variant_coordinate: + *v, typ = variant_coordinate # last column is type ie "dup" + hgvs_g = _v2h(*v) + hgvs_string = hgvs_g.format() + assert hgvs_string == expected_hgvs_string