From 7cbf7564869b029ef956d538f23d11edd0a03b71 Mon Sep 17 00:00:00 2001 From: Everaldo Date: Wed, 16 Oct 2024 11:05:50 -0700 Subject: [PATCH] Create _id. --- .../dataload/sources/civic/civic_parser.py | 130 ++++++++++-------- 1 file changed, 73 insertions(+), 57 deletions(-) diff --git a/src/hub/dataload/sources/civic/civic_parser.py b/src/hub/dataload/sources/civic/civic_parser.py index a25e9970..855205b3 100644 --- a/src/hub/dataload/sources/civic/civic_parser.py +++ b/src/hub/dataload/sources/civic/civic_parser.py @@ -44,64 +44,80 @@ def load_data(data_folder): doc = merge_dicts(doc, variant_data["VariantDetail"]["data"]["variant"]) doc = merge_dicts(doc, variant_data["VariantSummary"]["data"]["variant"]) - if set(['error', 'status']) != set(doc.keys()): - print("### doc") - print(doc) - [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'referenceBases', 'variantBases']] - variant_id = doc.pop("id") - new_doc = {} - doc['variant_id'] = variant_id - if chrom and ref and alt: - no_case1 += 1 - try: - new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt) - except ValueError: - logging.warning("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id)) - continue - # handle cases of deletions where only ref info is provided - elif chrom and ref and not alt: - no_case2 += 1 - start = int(pos) - end = int(pos) + len(ref) - 1 - if start == end: - new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start) - else: - new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end) - # handle cases of insertions where only alt info is provided - elif chrom and alt and not ref: - no_case3 += 1 - new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt) - # handle cases where no ref or alt info provided, - # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1 - else: - no_case4 += 1 - new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id) - # for _evidence in doc['evidence_items']: - # print(doc) - for _molecularProfiles in doc['molecularProfiles']['nodes']: - # print(_molecularProfiles) - for _evidence in _molecularProfiles['evidenceItems']['edges']: - # print(_evidence['node']) - if 'disease' in _evidence['node'] and 'doid' in (_evidence['node']['disease'] or {}) and _evidence['node']['disease']['doid']: - _evidence['node']['disease']['doid'] = 'DOID:' + _evidence['node']['disease']['doid'] - if 'source' in _evidence['node'] and 'citationId' in _evidence['node']['source']: - if _evidence['node']['source']['sourceType'].lower() == "pubmed": - _evidence['node']['source']['pubmed'] = to_int(_evidence['node']['source']['citationId']) - _evidence['node']['source'].pop('sourceType') - _evidence['node']['source'].pop('citationId') - elif _evidence['node']['source']['sourceType'].lower() == "asco": - _evidence['node']['source']['asco'] = to_int(_evidence['node']['source']['citationId']) - _evidence['node']['source'].pop('sourceType') - _evidence['node']['source'].pop('citationId') - else: - raise ValueError("The value of source_type is not one of PubMed or ASCO, it's {}, need to restructure parser".format(_evidence['node']['source']['sourceType'])) - new_doc['civic'] = doc - print("### new_doc") - print(new_doc) - yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}]) - # change doid into its formal representation, which should be sth like DOID:1 + if "myVariantInfo" in doc and "myVariantInfoId" in doc["myVariantInfo"]: + _id = doc["myVariantInfo"]["myVariantInfoId"] + elif "hgvsDescriptions" in doc: + hgvs_description = doc["hgvsDescriptions"] + hgvs_nc_item = hgvs_description[0].split(":") + nc_id = hgvs_nc_item[0].replace("NC_", "").split(".")[0].lstrip('0') + _id = nc_id + hgvs_nc_item[1] else: - continue + _id = 'CIVIC_VARIANT:' + str(doc["variant_id"]) + + new_doc = {} + new_doc["_id"] = _id + + # if set(['error', 'status']) != set(doc.keys()): + # print("### doc") + # print(doc) + # [chrom, pos, ref, alt] = [doc['coordinates'][x] for x in ['chromosome', 'start', 'referenceBases', 'variantBases']] + # variant_id = doc.pop("id") + # new_doc = {} + # doc['variant_id'] = variant_id + # if chrom and ref and alt: + # no_case1 += 1 + # try: + # new_doc['_id'] = get_hgvs_from_vcf(chrom, pos, ref, alt) + # except ValueError: + # logging.warning("id has ref,alt, but coudn't be converted to hgvs id: {}".format(variant_id)) + # continue + # # handle cases of deletions where only ref info is provided + # elif chrom and ref and not alt: + # no_case2 += 1 + # start = int(pos) + # end = int(pos) + len(ref) - 1 + # if start == end: + # new_doc['_id'] = 'chr{0}:g.{1}del'.format(chrom, start) + # else: + # new_doc['_id'] = 'chr{0}:g.{1}_{2}del'.format(chrom, start, end) + # # handle cases of insertions where only alt info is provided + # elif chrom and alt and not ref: + # no_case3 += 1 + # new_doc['_id'] = 'chr{0}:g.{1}_{2}ins{3}'.format(chrom, start, end, alt) + # # handle cases where no ref or alt info provided, + # # in this case, use CIVIC internal ID as the primary id for MyVariant.info, e.g. CIVIC_VARIANT:1 + # else: + # no_case4 += 1 + # new_doc['_id'] = 'CIVIC_VARIANT:' + str(variant_id) + # # for _evidence in doc['evidence_items']: + # # print(doc) + + for _molecularProfiles in doc['molecularProfiles']['nodes']: + # print(_molecularProfiles) + for _evidence in _molecularProfiles['evidenceItems']['edges']: + # print(_evidence['node']) + if 'disease' in _evidence['node'] and 'doid' in (_evidence['node']['disease'] or {}) and _evidence['node']['disease']['doid']: + _evidence['node']['disease']['doid'] = 'DOID:' + _evidence['node']['disease']['doid'] + if 'source' in _evidence['node'] and 'citationId' in _evidence['node']['source']: + if _evidence['node']['source']['sourceType'].lower() == "pubmed": + _evidence['node']['source']['pubmed'] = to_int(_evidence['node']['source']['citationId']) + _evidence['node']['source'].pop('sourceType') + _evidence['node']['source'].pop('citationId') + elif _evidence['node']['source']['sourceType'].lower() == "asco": + _evidence['node']['source']['asco'] = to_int(_evidence['node']['source']['citationId']) + _evidence['node']['source'].pop('sourceType') + _evidence['node']['source'].pop('citationId') + else: + raise ValueError("The value of source_type is not one of PubMed or ASCO, it's {}, need to restructure parser".format(_evidence['node']['source']['sourceType'])) + new_doc["civic"] = doc + new_doc["civic"].pop("myVariantInfo") + print("### new_doc") + print(new_doc) + yield dict_sweep(unlist(new_doc), ['', 'null', 'N/A', None, [], {}]) + + # change doid into its formal representation, which should be sth like DOID:1 + # else: + # continue logging.info("number of ids with ref, alt, chrom: {}".format(no_case1)) logging.info("number of ids with chrom, ref but no alt: {}".format(no_case2)) logging.info("number of ids with chrom, alt but no ref: {}".format(no_case3))