From 7cbf7564869b029ef956d538f23d11edd0a03b71 Mon Sep 17 00:00:00 2001
From: Everaldo <everaldorodrigo@gmail.com>
Date: Wed, 16 Oct 2024 11:05:50 -0700
Subject: [PATCH] Create _id from myVariantInfo, HGVS description, or CIViC variant id.

---
 .../dataload/sources/civic/civic_parser.py    | 130 ++++++++++--------
 1 file changed, 73 insertions(+), 57 deletions(-)

diff --git a/src/hub/dataload/sources/civic/civic_parser.py b/src/hub/dataload/sources/civic/civic_parser.py
index a25e9970..855205b3 100644
--- a/src/hub/dataload/sources/civic/civic_parser.py
+++ b/src/hub/dataload/sources/civic/civic_parser.py
@@ -44,64 +44,80 @@ def load_data(data_folder):
         doc = merge_dicts(doc, variant_data["VariantDetail"]["data"]["variant"])
         doc = merge_dicts(doc, variant_data["VariantSummary"]["data"]["variant"])
 
-        if set(['error', 'status']) != set(doc.keys()):
-            print("### doc")
-            print(doc)
-            [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'referenceBases', 'variantBases']]
-            variant_id = doc.pop("id")
-            new_doc = {}
-            doc['variant_id'] = variant_id
-            if chrom and ref and alt:
-                no_case1 += 1
-                try:
-                    new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
-                except ValueError:
-                    logging.warning("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id))
-                    continue
-            # handle cases of deletions where only ref info is provided
-            elif chrom and ref and not alt:
-                no_case2 += 1
-                start = int(pos)
-                end = int(pos) + len(ref) - 1
-                if start == end:
-                    new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
-                else:
-                    new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
-            # handle cases of insertions where only alt info is provided
-            elif chrom and alt and not ref:
-                no_case3 += 1
-                new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
-            # handle cases where no ref or alt info provided,
-            # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1
-            else:
-                no_case4 += 1
-                new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
-            # for _evidence in doc['evidence_items']:
-            # print(doc)
-            for _molecularProfiles in doc['molecularProfiles']['nodes']:
-                # print(_molecularProfiles)
-                for _evidence in _molecularProfiles['evidenceItems']['edges']:
-                    # print(_evidence['node'])
-                    if 'disease' in _evidence['node'] and 'doid' in (_evidence['node']['disease'] or {}) and _evidence['node']['disease']['doid']:
-                        _evidence['node']['disease']['doid'] = 'DOID:' + _evidence['node']['disease']['doid']
-                    if 'source' in _evidence['node'] and 'citationId' in _evidence['node']['source']:
-                        if _evidence['node']['source']['sourceType'].lower() == "pubmed":
-                            _evidence['node']['source']['pubmed'] = to_int(_evidence['node']['source']['citationId'])
-                            _evidence['node']['source'].pop('sourceType')
-                            _evidence['node']['source'].pop('citationId')
-                        elif _evidence['node']['source']['sourceType'].lower() == "asco":
-                            _evidence['node']['source']['asco'] = to_int(_evidence['node']['source']['citationId'])
-                            _evidence['node']['source'].pop('sourceType')
-                            _evidence['node']['source'].pop('citationId')
-                        else:
-                            raise ValueError("The value of source_type is not one of PubMed or ASCO, it's {}, need to restructure parser".format(_evidence['node']['source']['sourceType']))
-            new_doc['civic'] = doc
-            print("### new_doc")
-            print(new_doc)
-            yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
-            # change doid into its formal representation, which should be sth like DOID:1
+        if "myVariantInfo" in doc and "myVariantInfoId" in doc["myVariantInfo"]:
+            _id = doc["myVariantInfo"]["myVariantInfoId"]
+        elif "hgvsDescriptions" in doc:
+            hgvs_description = doc["hgvsDescriptions"]
+            hgvs_nc_item = hgvs_description[0].split(":")
+            nc_id = hgvs_nc_item[0].replace("NC_", "").split(".")[0].lstrip('0')
+            _id = nc_id + hgvs_nc_item[1]
         else:
-            continue
+            _id = 'CIVIC_VARIANT:' + str(doc["variant_id"])
+
+        new_doc = {}
+        new_doc["_id"] = _id
+
+        # if set(['error', 'status']) != set(doc.keys()):
+        #     print("### doc")
+        #     print(doc)
+        #     [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'referenceBases', 'variantBases']]
+        #     variant_id = doc.pop("id")
+        #     new_doc = {}
+        #     doc['variant_id'] = variant_id
+        #     if chrom and ref and alt:
+        #         no_case1 += 1
+        #         try:
+        #             new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt)
+        #         except ValueError:
+        #             logging.warning("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id))
+        #             continue
+        #     # handle cases of deletions where only ref info is provided
+        #     elif chrom and ref and not alt:
+        #         no_case2 += 1
+        #         start = int(pos)
+        #         end = int(pos) + len(ref) - 1
+        #         if start == end:
+        #             new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start)
+        #         else:
+        #             new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end)
+        #     # handle cases of insertions where only alt info is provided
+        #     elif chrom and alt and not ref:
+        #         no_case3 += 1
+        #         new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt)
+        #     # handle cases where no ref or alt info provided,
+        #     # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1
+        #     else:
+        #         no_case4 += 1
+        #         new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id)
+        #     # for _evidence in doc['evidence_items']:
+        #     # print(doc)
+
+        for _molecularProfiles in doc['molecularProfiles']['nodes']:
+            # print(_molecularProfiles)
+            for _evidence in _molecularProfiles['evidenceItems']['edges']:
+                # print(_evidence['node'])
+                if 'disease' in _evidence['node'] and 'doid' in (_evidence['node']['disease'] or {}) and _evidence['node']['disease']['doid']:
+                    _evidence['node']['disease']['doid'] = 'DOID:' + _evidence['node']['disease']['doid']
+                if 'source' in _evidence['node'] and 'citationId' in _evidence['node']['source']:
+                    if _evidence['node']['source']['sourceType'].lower() == "pubmed":
+                        _evidence['node']['source']['pubmed'] = to_int(_evidence['node']['source']['citationId'])
+                        _evidence['node']['source'].pop('sourceType')
+                        _evidence['node']['source'].pop('citationId')
+                    elif _evidence['node']['source']['sourceType'].lower() == "asco":
+                        _evidence['node']['source']['asco'] = to_int(_evidence['node']['source']['citationId'])
+                        _evidence['node']['source'].pop('sourceType')
+                        _evidence['node']['source'].pop('citationId')
+                    else:
+                        raise ValueError("The value of source_type is not one of PubMed or ASCO, it's {}, need to restructure parser".format(_evidence['node']['source']['sourceType']))
+        new_doc["civic"] = doc
+        new_doc["civic"].pop("myVariantInfo")
+        print("### new_doc")
+        print(new_doc)
+        yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}])
+
+        # change doid into its formal representation, which should be something like DOID:1
+        # else:
+        #     continue
     logging.info("number of ids with ref, alt, chrom: {}".format(no_case1))
     logging.info("number of ids with chrom, ref but no alt: {}".format(no_case2))
     logging.info("number of ids with chrom, alt but no ref: {}".format(no_case3))