From 95b9423bb4b1864088f9a757d646005c3cd07a23 Mon Sep 17 00:00:00 2001 From: aclum Date: Tue, 3 Oct 2023 09:02:15 -0700 Subject: [PATCH 01/44] adding code to mint ids via prod runtime APi, connect to napa compliance mongo db instance and update study and biosample records for gold:Gs0114663 --- nmdc_schema/connect_napa_mongo.py | 114 +++++++++++++++++++++++ nmdc_schema/napa_complaince.README | 6 ++ nmdc_schema/runtime_api_operations.py | 127 ++++++++++++++++++++++++++ 3 files changed, 247 insertions(+) create mode 100644 nmdc_schema/connect_napa_mongo.py create mode 100644 nmdc_schema/napa_complaince.README create mode 100644 nmdc_schema/runtime_api_operations.py diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py new file mode 100644 index 0000000000..103941d784 --- /dev/null +++ b/nmdc_schema/connect_napa_mongo.py @@ -0,0 +1,114 @@ +from datetime import datetime, timezone +import json +import os +from pprint import pprint +import secrets +import time + +from dotenv import load_dotenv +import requests + +import pymongo +from pymongo import MongoClient +from pymongo.errors import ConnectionFailure + +envfile_path = "../../.env.client" + +load_dotenv(envfile_path) + + +#nersc ssh tunnel required to connect to mongo +#ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov + +napa_mongo_pw = os.environ['MONGO_NAPA_PW'] +#print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) + +napa_mongo='mongodb://root:'+napa_mongo_pw+'@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin' +#connection = MongoClient() +#db = connection.napa_mongo +print(napa_mongo) + +#connect to mongo +client = MongoClient(napa_mongo) + +#set mongo database name to nmdc' +mydb =client['nmdc'] + +#list database names +#for db in client.list_database_names(): +# print(db) + +#list collections +#for coll in mydb.list_collection_names(): +# print(coll) + +study_coll=mydb["study_set"] + +select_legacy_study = {"id":"gold:Gs0114663"} +napa_study_update = { "$set": { "id": "nmdc:sty-12-85j6kq06" } } + + +study_coll.update_one(select_legacy_study, napa_study_update) + +check_study_update = { "id":"nmdc:sty-12-85j6kq06"} +mydoc=study_coll.find(check_study_update) + +#for x in mydoc: +# print(x) + +biosample_coll=mydb["biosample_set"] +select_biosample_part_of = {"part_of": {"$regex" :"Gs0114663$"}} +napa_biosample_part_of = { "$set": { "part_of": "nmdc:sty-12-85j6kq06"}} +part_of_biosample_update=biosample_coll.update_many(select_biosample_part_of,napa_biosample_part_of) + +print(part_of_biosample_update.modified_count, "documents_updated.") + +#mint 85 biosample identifiers +#manually created this file when testing for Gs0114663 + +#update alt biosample ids +biosample_alt_emsl= {"part_of": "nmdc:sty-12-85j6kq06","emsl_biosample_identifiers":{"$exists":False},"id":{"$regex":"^emsl*"}} + + + +for doc in biosample_coll.find(biosample_alt_emsl): +# print(doc["id"]) + target_biosample={"id": doc["id"]} + target_update = { "$set": { "emsl_biosample_identifiers": doc["id"] } } + biosample_coll.update_one(target_biosample,target_update) + + +with open("napa_biosample_test.json", 'r') as j: + biosample_napa_ids = json.loads(j.read()) + + + +Gs0114663_legacy_biosamples={"part_of": "nmdc:sty-12-85j6kq06"} +#f = open("Gs0114663_reid.txt", "a") + +biosample_counter=0 +for doc in biosample_coll.find(Gs0114663_legacy_biosamples): + target_biosample={"id": doc["id"]} + target_update = { "$set": { "id": 
biosample_napa_ids[biosample_counter]}} + print("Biosample ",target_biosample,target_update) + biosample_coll.update_one(target_biosample,target_update) + biosample_counter=biosample_counter+1 +# f.write("Biosample "+ doc["id"]+ " "+ biosample_napa_ids[biosample_counter]) + +omics_coll=mydb["omics_processing_set"] + +Gs0114663_legacy_omics={"part_of":"gold:Gs0114663"} + +omics_counter=0 +for doc in omics_coll.find(Gs0114663_legacy_omics): + + +#f.close() +#example regex +#myquery = { "id": {"$regex" :"^gold*"}} + +#mydatabase = client.nmdc +#print(mydatabase) +#collection =nmdc["study_set"] +#study_count=nmdc.study_set.count() +#print("The study count is:", study_count) diff --git a/nmdc_schema/napa_complaince.README b/nmdc_schema/napa_complaince.README new file mode 100644 index 0000000000..bd8e83193b --- /dev/null +++ b/nmdc_schema/napa_complaince.README @@ -0,0 +1,6 @@ +#install notes + +#for generating tokens needed to run +pip install python-dotenv +#use pip or conda to install pymong +pip install pymongo diff --git a/nmdc_schema/runtime_api_operations.py b/nmdc_schema/runtime_api_operations.py new file mode 100644 index 0000000000..be0964274d --- /dev/null +++ b/nmdc_schema/runtime_api_operations.py @@ -0,0 +1,127 @@ +from datetime import datetime, timezone +import json +import os +from pprint import pprint +import secrets +import time + +from dotenv import load_dotenv +import requests + +#modified from nmdc-runtime how-to guide https://microbiomedata.github.io/nmdc-runtime/nb/queue_and_trigger_data_jobs/ + +# relative path to file with format +# ``` +# NMDC_RUNTIME_HOST=fixme +# NMDC_RUNTIME_USER=fixme +# NMDC_RUNTIME_PASS=fixme +# NMDC_RUNTIME_SITE_ID=fixme # Okay if you don't have yet +# NMDC_RUNTIME_SITE_CLIENT_ID=fixme # Okay if you don't have yet +# NMDC_RUNTIME_SITE_CLIENT_SECRET=fixme # Okay if you don't have yet +# ``` +envfile_path = "../../.env.client" + +load_dotenv(envfile_path) + +ENV = { + k: v for k, v in os.environ.items() + if k.startswith("NMDC_RUNTIME_") +} + +assert ( + ENV["NMDC_RUNTIME_HOST"] == + "https://api.microbiomedata.org" +) + +HOST = ENV["NMDC_RUNTIME_HOST"] + +def request_and_return_json(method, path, host=HOST, **kwargs): + r = requests.request(method, host + path, **kwargs) + r.raise_for_status() + return r.json() + +def get_json(path, host=HOST, **kwargs): + return request_and_return_json("GET", path, host=host, **kwargs) + +def post_and_return_json(path, host=HOST, **kwargs): + return request_and_return_json("POST", path, host=host, **kwargs) + +def patch_and_return_json(path, host=HOST, **kwargs): + return request_and_return_json("PATCH", path, host=host, **kwargs) + +def put_and_return_json(path, host=HOST, **kwargs): + return request_and_return_json("PUT", path, host=host, **kwargs) + +def auth_header(bearer_token): + return {"Authorization": f"Bearer {bearer_token}"} + +def get_token_for_user(): + response = post_and_return_json( + "/token", + data={ + "grant_type": "password", + "username": ENV["NMDC_RUNTIME_USER"], + "password": ENV["NMDC_RUNTIME_PASS"] + } + ) + expires_minutes = response['expires']['minutes'] + print(f"Bearer token expires in {expires_minutes} minutes") + return response["access_token"] + +def get_token_for_site_client(): + response = post_and_return_json( + "/token", + data={ + "grant_type": "client_credentials", + "client_id": ENV["NMDC_RUNTIME_SITE_CLIENT_ID"], + "client_secret": ENV["NMDC_RUNTIME_SITE_CLIENT_SECRET"] + } + ) + expires_minutes = response['expires']['minutes'] + print(f"Bearer token expires in 
{expires_minutes} minutes") + return response["access_token"] + +def mint_ids(schema_class,how_many,formatted_token): + + url=HOST + "/pids/mint" + data={ + "schema_class": {"id": schema_class}, + "how_many": how_many + } + headers = formatted_token +# print(headers) + response = requests.post(url,headers=headers,json=data) + print("JSON Response ", response.json()) + + minted_ids=response.json() + return minted_ids + #print(minted_ids) + +#def mint_ids(schema_class,how_many,TOKEN_C): +# response = post_and_return_json( +# "/pids/mint", +# data={ +# "schema_class": {"id": schema_class}, +# "how_many": how_many +# } +# headers = TOKEN_C +# return response +# ) + + + + +def now(as_str=False): + dt = datetime.now(timezone.utc) + return dt.isoformat() if as_str else dt + +TOKEN_C = get_token_for_site_client() + + +print(TOKEN_C) +formatted_token=auth_header(TOKEN_C) +napa_ids=mint_ids("nmdc:Study",2,formatted_token) +print(napa_ids) + + + From ae912aa2c7784522bd3526f15bde29b318a31538 Mon Sep 17 00:00:00 2001 From: aclum Date: Wed, 4 Oct 2023 10:08:59 -0700 Subject: [PATCH 02/44] Fixes for schema slots treated as strings that should have been lists --- nmdc_schema/connect_napa_mongo.py | 128 +++++++++++++++++++++++++++++- 1 file changed, 126 insertions(+), 2 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 103941d784..2b983b119d 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -53,21 +53,85 @@ check_study_update = { "id":"nmdc:sty-12-85j6kq06"} mydoc=study_coll.find(check_study_update) +biosample_coll=mydb["biosample_set"] + +##needed only in testing, update values in gold alt id slots to lowercase. This has been updated in the production version of the schema already +biosample_fix_gold_case={"gold_biosample_identifiers":{"$regex":"^GOLD*"}} +for doc in biosample_coll.find(biosample_fix_gold_case): + if (len(doc["gold_biosample_identifiers"]) == 1): + gold_id_list=doc["gold_biosample_identifiers"] + case_fixed=gold_id_list[0].replace('GOLD','gold') + gold_fix_target_biosample={"id": doc["id"]} + gold_case_update={ "$set": { "gold_biosample_identifiers": case_fixed}} + #print("Set operation on ",gold_fix_target_biosample,gold_case_update) + biosample_coll.update_one(gold_fix_target_biosample,gold_case_update) + else: + print("There is more than one gold biosample for",doc["id"]) + +#fix the fact you did updated the gold biosample ids as a string and not a list +biosample_fix_gold_type={"gold_biosample_identifiers":{"$regex":"^gold*"}} +for doc in biosample_coll.find(biosample_fix_gold_type): + if(isinstance(doc["gold_biosample_identifiers"],str)): +# print("need to fix", doc["gold_biosample_identifiers"]) + update_gold_biosample=[] + update_gold_biosample.append(doc["gold_biosample_identifiers"]) + fix_gold_biosample_type_target={"id": doc["id"]} + gold_biosample_type_update={"$set": { "gold_biosample_identifiers": update_gold_biosample}} + biosample_coll.update_one(fix_gold_biosample_type_target,gold_biosample_type_update) + elif(isinstance(doc["gold_biosample_identifiers"],list)): + print("already the correct type ",doc["gold_biosample_identifiers"]) + else: + print("this record is type ", doc["gold_biosample_identifiers"],type(doc["gold_biosample_identifiers"])) + +study_fix_gold_case={"gold_study_identifiers":{"$regex":"^GOLD*"}} + +for doc in study_coll.find(study_fix_gold_case): + update_gold_study=[] + for gold_study in doc["gold_study_identifiers"]: + 
gold_study=gold_study.replace('GOLD','gold') + gold_study_target={"id": doc["id"]} + update_gold_study.append(gold_study) + #print(update_gold_study) + gold_fix_target_study={ "$set": {"gold_study_identifiers": update_gold_study}} + #print(gold_fix_target_study) + study_coll.update_one(gold_study_target,gold_fix_target_study) + +omics_coll=mydb["omics_processing_set"] +omics_processing_fix_gold_case={"gold_sequencing_project_identifiers":{"$regex":"^GOLD*"}} +for doc in omics_coll.find(omics_processing_fix_gold_case): + update_gold_project=[] + for gold_project in doc["gold_sequencing_project_identifiers"]: + gold_project=gold_project.replace('GOLD','gold') + update_gold_project.append(gold_project) + gold_project_target={"id": doc["id"]} + gold_fix_target_omics={ "$set": {"gold_sequencing_project_identifiers":update_gold_project}} + #print(gold_project_target,gold_fix_target_omics) + omics_coll.update_one(gold_project_target,gold_fix_target_omics) + +#end section only needed for testing +################################################################################## #for x in mydoc: # print(x) -biosample_coll=mydb["biosample_set"] select_biosample_part_of = {"part_of": {"$regex" :"Gs0114663$"}} napa_biosample_part_of = { "$set": { "part_of": "nmdc:sty-12-85j6kq06"}} part_of_biosample_update=biosample_coll.update_many(select_biosample_part_of,napa_biosample_part_of) print(part_of_biosample_update.modified_count, "documents_updated.") +#fix part_of for study Gs0114663, this needs to be an array +fix_select_biosample_part_of ={"part_of":"nmdc:sty-12-85j6kq06"} +fix_napa_biosample_part_of= { "$set": { "part_of": ["nmdc:sty-12-85j6kq06"]}} +fix_part_of_biosample_update=biosample_coll.update_many(fix_select_biosample_part_of,fix_napa_biosample_part_of) +print(fix_part_of_biosample_update.modified_count, "documents_updated.") + #mint 85 biosample identifiers #manually created this file when testing for Gs0114663 #update alt biosample ids biosample_alt_emsl= {"part_of": "nmdc:sty-12-85j6kq06","emsl_biosample_identifiers":{"$exists":False},"id":{"$regex":"^emsl*"}} + + @@ -77,6 +141,21 @@ target_update = { "$set": { "emsl_biosample_identifiers": doc["id"] } } biosample_coll.update_one(target_biosample,target_update) +#fix updates to setting emsl biosample id slot as string instead of list +fix_type_emsl_alt_biosample={"part_of": "nmdc:sty-12-85j6kq06","emsl_biosample_identifiers":{"$exists":True}} + +for doc in biosample_coll.find(fix_type_emsl_alt_biosample): + if(isinstance(doc["emsl_biosample_identifiers"],str)): +# print("need to fix", doc["emsl_biosample_identifiers"]) + update_emsl_biosample=[] + update_emsl_biosample.append(doc["emsl_biosample_identifiers"]) + fix_emsl_biosample_type_target={"id": doc["id"]} + emsl_biosample_type_update={"$set": { "emsl_biosample_identifiers": update_emsl_biosample}} + biosample_coll.update_one(fix_emsl_biosample_type_target,emsl_biosample_type_update) + #print(fix_emsl_biosample_type_target,emsl_biosample_type_update) + else: + print("this record is type ", doc["emsl_biosample_identifiers"],type(doc["emsl_biosample_identifiers"])) + with open("napa_biosample_test.json", 'r') as j: biosample_napa_ids = json.loads(j.read()) @@ -95,13 +174,56 @@ biosample_counter=biosample_counter+1 # f.write("Biosample "+ doc["id"]+ " "+ biosample_napa_ids[biosample_counter]) + +#update omics_processing_set records for gold:Gs0114663 omics_coll=mydb["omics_processing_set"] Gs0114663_legacy_omics={"part_of":"gold:Gs0114663"} +#mint 479 napa omics ids manually + +#read 
omics_ids into a python list +with open("napa_omics_test.json", 'r') as j: + omics_napa_ids = json.loads(j.read()) + omics_counter=0 -for doc in omics_coll.find(Gs0114663_legacy_omics): +f_omics_id_mapping = open("Gs0114663_omics_reid.txt", "w") +f_omics_set_operation =open("Gs0114663_omics_set","w") +#TODO update the omics collection fixes to use arrays for part_of, has_input, alt identifiers +for doc in omics_coll.find(Gs0114663_legacy_omics): + #determine what has_input should be + if (len(doc["has_input"]) > 1): + print("Too many inputs for ",doc["id"]) + elif(len(doc["has_input"]) == 1): + for biosample in doc["has_input"]: + if (biosample.startswith('GOLD')): + b_alt_id=biosample.replace('GOLD','gold') + else: + b_alt_id=biosample + else: + print("has_input not specified for ", doc["id"]) + target_has_input={"$or":[ {"emsl_biosample_identifiers":b_alt_id}, {"gold_biosample_identifiers":b_alt_id},{"insdc_biosample_identifiers":b_alt_id}]} + get_biosample=biosample_coll.find_one(target_has_input) + replace_has_input=get_biosample["id"] + #set id + target_omics={"id": doc["id"]} + if (doc["id"].startswith('gold')): + alt_id_slot="gold_sequencing_project_identifiers" + elif (doc["id"].startswith('GOLD')): + alt_id_slot="gold_sequencing_project_identifiers" + doc["id"].replace('gold','GOLD') + elif (doc["id"].startswith(('emsl', 'EMSL'))): + alt_id_slot="alternative_identifiers" + else: + print("Not sure how to re-id omics_processing_set id ",doc["id"]) + target_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":"nmdc:sty-12-85j6kq06", alt_id_slot:doc["id"],"has_input": replace_has_input }} + class_legacy_napa="OmicsProcessing " + doc["id"] + " "+ omics_napa_ids[omics_counter] + print(class_legacy_napa) + print(target_update) + f_omics_id_mapping.write(class_legacy_napa) + #f_omics_set_operation.write(target_update) + omics_counter=omics_counter+1 #f.close() #example regex @@ -112,3 +234,5 @@ #collection =nmdc["study_set"] #study_count=nmdc.study_set.count() #print("The study count is:", study_count) + + From 95a208205088a5f4b19feb1536906e37dd9676d8 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 5 Oct 2023 08:57:50 -0700 Subject: [PATCH 03/44] adding functions for study,biosample,and omics re-iding --- nmdc_schema/connect_napa_mongo.py | 105 ++++++++++++++---- .../napa_study_biosample_omics_migration.py | 100 +++++++++++++++++ 2 files changed, 183 insertions(+), 22 deletions(-) create mode 100644 nmdc_schema/napa_study_biosample_omics_migration.py diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 2b983b119d..b151ba0630 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -53,6 +53,40 @@ check_study_update = { "id":"nmdc:sty-12-85j6kq06"} mydoc=study_coll.find(check_study_update) + +######################### +#generalized function to update study identifiers to napa format +#alt slots are already populated in all cases so logic for that is not needed + +#mint Class Study IDs using runtime API or manually using the minter endpoint +#if reading minted IDs from a json file +study_napa_json="XXXXXXXXXX" +with open(study_napa_json, 'r') as j: + study_napa_ids = json.loads(j.read()) + +#update_studies_to_napa_standards + +def update_studies_to_napa_standards(): + study_reid_log=open("napa_study_update".txt,"w") + napa_study_counter=0 + get_legacy_studies={ "id" : {"$regex":"^gold" } } + for doc in study_coll.find(get_legacy_studies): + select_legacy_study = {"id": doc["id"]} + study_target_update = 
{"$set": { "id": napa_study_ids[napa_study_count] } } + if (napa_study_ids[napa_study_count].startswith('nmdc:sty')): + #study_coll.update_one(select_legacy_study,study_target_update) + study_class_legacy_napa="Study "+ doc["id"] + " " + napa_study_ids[napa_study_count] + print(study_class_legacy_napa) + study_reid_log.write(napa_study_update.txt) + napa_study_counter=napa_study_counter+1 + else: + print("Did not update issue updating ",doc["id"]) + +######################### + + + + biosample_coll=mydb["biosample_set"] ##needed only in testing, update values in gold alt id slots to lowercase. This has been updated in the production version of the schema already @@ -190,34 +224,61 @@ f_omics_id_mapping = open("Gs0114663_omics_reid.txt", "w") f_omics_set_operation =open("Gs0114663_omics_set","w") -#TODO update the omics collection fixes to use arrays for part_of, has_input, alt identifiers +napa_study="nmdc:sty-12-85j6kq06" for doc in omics_coll.find(Gs0114663_legacy_omics): + #set list with value of napa study for part_of + study_napa_list=[] + study_napa_list.append(napa_study) #determine what has_input should be - if (len(doc["has_input"]) > 1): - print("Too many inputs for ",doc["id"]) - elif(len(doc["has_input"]) == 1): - for biosample in doc["has_input"]: + if(isinstance(doc["has_input"],list)): + napa_biosample_inputs=[] + for biosample in doc["has_input"]: if (biosample.startswith('GOLD')): - b_alt_id=biosample.replace('GOLD','gold') - else: - b_alt_id=biosample - else: - print("has_input not specified for ", doc["id"]) - target_has_input={"$or":[ {"emsl_biosample_identifiers":b_alt_id}, {"gold_biosample_identifiers":b_alt_id},{"insdc_biosample_identifiers":b_alt_id}]} - get_biosample=biosample_coll.find_one(target_has_input) - replace_has_input=get_biosample["id"] - #set id + biosample=biosample.replace('GOLD','gold') + target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} + get_biosample=biosample_coll.find_one(target_has_input) + napa_biosample_inputs.append.get_biosample["id"] + #set id and alternative ids target_omics={"id": doc["id"]} - if (doc["id"].startswith('gold')): - alt_id_slot="gold_sequencing_project_identifiers" - elif (doc["id"].startswith('GOLD')): - alt_id_slot="gold_sequencing_project_identifiers" - doc["id"].replace('gold','GOLD') - elif (doc["id"].startswith(('emsl', 'EMSL'))): + #deal with gold omics identifiers + if (doc["id"].startswith('gold')): + alt_id_slot="gold_sequencing_project_identifiers" + alt_id=[] + alt_id.append.doc["id"] + if(doc["gold_sequencing_project_identifiers"]): + sorted_existing_gold_alt=doc["gold_sequencing_project_identifiers"].sort() + alt_id.sort() + if sorted_existing_gold_alt == alt_id: + update_alt= False + else: + print("gold alt ids exist but are not equal", doc["id"],doc["gold_sequencing_project_identifiers"]) + else: + alt_id=[] + alt_id.append.doc["id"] + update_alt=True + #deal with emsl omics identifiers + elif (doc["id"].startswith(('emsl'))): alt_id_slot="alternative_identifiers" + alt_id=[] + alt_id.append.doc["id"] + if(doc["alternative_identifiers"]): + sorted_existing_alt=doc["alternative_identifiers"].sort + alt_id.sort() + if sorted_existing_alt == alt_id: + update_alt=False + else: + print("emsl alt ids exist but are not equal", doc["id"],doc["alternative_identifiers"]) + else: + alt_id=[] + alt_id.append.doc["id"] + update_alt=True else: - print("Not sure how to re-id omics_processing_set id ",doc["id"]) - 
target_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":"nmdc:sty-12-85j6kq06", alt_id_slot:doc["id"],"has_input": replace_has_input }} + print("Not sure how to re-id omics_processing_set id ",doc["id"]) + #set target update depending on if alt slot exists already or not + if update_alt is True: + target_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":[study_napa_list], "has_input": [napa_biosample_inputs], alt_id_slot: [alt_id] }} + if update_alt is False: + target_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":[study_napa_list], "has_input": [napa_biosample_inputs]}} class_legacy_napa="OmicsProcessing " + doc["id"] + " "+ omics_napa_ids[omics_counter] print(class_legacy_napa) print(target_update) diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py new file mode 100644 index 0000000000..9ff32701a2 --- /dev/null +++ b/nmdc_schema/napa_study_biosample_omics_migration.py @@ -0,0 +1,100 @@ +import json +import os +from pprint import pprint +import secrets +import time + +import requests + +import pymongo +from pymongo import MongoClient + +#define variables for tables to update, assumes a mongo connection variable 'client' +#set database name +mydb =client['nmdc'] +sty_coll=mydb["study_set"] +bsm_coll=mydb["biosample_set"] + +######################### +#generalized function to update study identifiers to napa format +#alt slots are already populated in all cases so logic for that is not needed + +#mint Class Study IDs using runtime API or manually using the minter endpoint +#if reading minted IDs from a json file +sty_napa_json="XXXXXXXXXX" +with open(sty_napa_json, 'r') as j: + sty_napa_ids = json.loads(j.read()) + +#update_studies_to_napa_standards + +def update_studies_to_napa_standards(): + study_reid_log=open("napa_sty_update.txt","w") + napa_sty_counter=0 + get_legacy_sty={ "id" : {"$regex":"^gold" } } + for sty_doc in sty_coll.find(get_legacy_studies): + select_legacy_sty = {"id": sty_doc["id"]} + sty_target_update = {"$set": { "id": napa_sty_ids[napa_sty_count] } } + if (napa_sty_ids[napa_sty_count].startswith('nmdc:sty')): + #sty_coll.update_one(select_legacy_sty,sty_target_update) + sty_class_legacy_napa="Study "+ sty_doc["id"] + " " + napa_sty_ids[napa_study_count] + print(sty_class_legacy_napa) + sty_reid_log.write(napa_sty_update.txt) + napa_sty_counter=napa_sty_counter+1 + else: + print("Did not update issue updating ",sty_doc["id"]) + +######################### +# +#function to update biosamples + +#mint Class Study IDs using runtime API or manually using the minter endpoint +#if reading minted IDs from a json file +sty_bsm_napa_json="XXXXXXXXXX" +with open(study_bsm_napa_json, 'r') as j: + bsm_napa_ids = json.loads(j.read()) + +def update_bsm_by_study(napa_sty_id): + bsm_reid_log=open(napa_sty_id"_bsm_update.txt","w") + bsm_counter=0 + legacy_sty=napa_sty_to_legacy(napa_sty_id) + + legacy_bsm={"part_of": legacy_sty, "id", {"$ne":"^nmdc:bsm"}} + for bsm_doc in bsm_coll.find(legacy_bsm): + #set value for part_of + sty_napa_list=[] + sty_napa_list.append(napa_sty_id) + target_bsm={"id": bsm_doc["id"]} + #alt id check function + #TODO + if update_alt is True: + bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":[sty_napa_list], alt_id_slot: [alt_id] }} + if update_alt is False: + bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":[sty_napa_list]}} + bsm_class_legacy_napa="Biosample " + bsm_doc["id"] + " "+ 
bsm_napa_ids[bsm_counter] + print(bsm_class_legacy_napa) + #perform biosample update + #bsm_coll.update_one(target_bsm,bsm_target_update) + bsm_reid_log.write(class_legacy_napa) + bsm_counter=bsm_counter+1 + else: + print("study id query returned something other than 1 document",napa_sty_id) + +################ + +#function to get legacy study id from alt id slot +def napa_sty_to_legacy(napa_sty_id): + legacy_sty="" + get_sty_record={"id":napa_sty_id} + target_sty=sty_coll.find(get_sty_record) + if target_sty == 1: + if (len(target_sty["gold_study_identifiers"]) ==1: + for alt_study in target_sty["gold_study_identifiers"]: + legacy_sty=alt_sty + return legacy_sty + else: + print("More than one GOLD study as alt id", target_sty["gold_study_identifiers"]) + +########################## +#function to update omics records +def update_omics_by_study(napa_sty_id): + From ceb5676277ad3d88a1c65a47c97a44af709e8ac6 Mon Sep 17 00:00:00 2001 From: "Yuri E. Corilo" Date: Sun, 15 Oct 2023 12:30:11 -0700 Subject: [PATCH 04/44] start metab and nom ide refactoring --- nmdc_schema/metab_id_refactor.py | 192 +++++++++++++++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 nmdc_schema/metab_id_refactor.py diff --git a/nmdc_schema/metab_id_refactor.py b/nmdc_schema/metab_id_refactor.py new file mode 100644 index 0000000000..d58836cd35 --- /dev/null +++ b/nmdc_schema/metab_id_refactor.py @@ -0,0 +1,192 @@ +from dataclasses import dataclass, field, asdict +import hashlib +from pathlib import Path +import os +from pprint import pprint +from typing import List +from json import dumps + +from dotenv import load_dotenv +import pymongo +from pymongo import MongoClient +from pymongo.errors import ConnectionFailure +import oauthlib +import requests_oauthlib + +from linkml_runtime.dumpers import json_dumper +import yaml +import nmdc_schema.nmdc as nmdc + + +envfile_path = "../../.env.client" + +load_dotenv(envfile_path) +#nersc ssh tunnel required to connect to mongo +#ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov + +napa_mongo_pw = os.environ.get('MONGO_NAPA_PW') or "safeguard-wipe-scanner-78" +#print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) +print(napa_mongo_pw) +napa_mongo='mongodb://root:'+napa_mongo_pw+'@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin' +#connection = MongoClient() +#db = connection.napa_mongo +print(napa_mongo) + +#connect to mongo +client = MongoClient(napa_mongo) + +#set mongo database name to nmdc' +mydb =client['nmdc'] + +#list database names +#for db in client.list_database_names(): +# print(db) + +#list collections +#for coll in mydb.list_collection_names(): +# print(coll) + +# omicsProcessing update, has_output --> raw data +# omicsProcessing update, alternative_identifier --> nom_analysis_activity.was_informed_by + +# nom_analysis_activity --> has_input (new raw file or update ID) +# nom_analysis_activity --> has_output (data product file, update ID) +# nom_analysis_activity --> replace ids +# nom_analysis_activity --> was_informed_by -- id from alternative indetifier omics Processing +# dataObject --> replace id, and add alternative identifier, emsl:60592345 +@dataclass +class NMDC_Mint: + + schema_class: dict = field(default_factory= lambda: { + 'schema': None, + }) + how_many:int = 1 + + @property + def __dict__(self): + return asdict(self) + + @property + def json(self): + return dumps(self.__dict__) + +@dataclass +class DataObject: 
+ nom_raw_data_object_type:str = "Direct Infusion FT ICR-MS Raw Data" + nom_raw_data_object_description:str = "Raw 21T Direct Infusion Data" + nom_dp_data_object_type:str = "FT ICR-MS Analysis Results" + nom_dp_data_object_description:str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details" + +@dataclass +class NMDC_Types: + + BioSample:str = "nmdc:Biosample" + OmicsProcessing:str = "nmdc:OmicsProcessing" + NomAnalysisActivity:str = "nmdc:NomAnalysisActivity" + DataObject:str = "nmdc:DataObject" + +def update_data_products(nom_activities_doc, new_raw_file_id:str, + new_data_product_id:str, omics_prcessing_id:str, raw_file_path:Path=None): + + raw_file_id = nom_activities_doc.has_input[0] + + dataproduct_id = nom_activities_doc.has_input[0] + + data_object_set = mydb['data_object_set'] + + get_raw_file_data_object = { "id" : raw_file_id } + get_data_product_data_object = { "id" : dataproduct_id } + + raw_object_docs = [raw_objectdata_doc for raw_objectdata_doc in data_object_set.find(get_raw_file_data_object)] + + if raw_object_docs: + + raw_object_update = { "$set": { "id": new_raw_file_id, 'alternative_identifier': [omics_prcessing_id]} } + + data_object_set.update_one(raw_object_docs[0], raw_object_update ) + + else: + + new_raw_data_object = get_raw_data_object(raw_file_path, + was_generated_by=omics_prcessing_id, + data_object_type =DataObject.nom_raw_data_object_type, + description =DataObject.nom_raw_data_object_description) + + data_object_set.insert_one(new_raw_data_object) + + for data_product_objectdata_doc in data_object_set.find(get_data_product_data_object): + + data_product_object_update = { "$set": { "id": new_data_product_id}} + + data_object_set.update_one(data_product_objectdata_doc, data_product_object_update ) + +def update_omics_processing(nom_new_id, new_data_product_id, new_raw_file_id, raw_file_path=None): + + omics_processing_set = mydb['omics_processing_set'] + + nom_activities_set = mydb['nom_analysis_activity_set'] + + get_old_activities={ "id" : {"$regex":"^emsl" } } + + for nom_activities_doc in nom_activities_set.find(get_old_activities): + + get_parent_omics_processing ={ "has_output" : nom_activities_doc["has_input"] } + + '''always going to be one omics processing''' + for omics_processing_doc in omics_processing_set.find(get_parent_omics_processing): + + omics_processing_update = { "$set": { "has_output": [new_raw_file_id]} } + + omics_processing_set.update_one(omics_processing_doc, omics_processing_update) + + new_omics_processing_id = omics_processing_doc['id'] + + update_data_products( nom_activities_doc, new_data_product_id, new_data_product_id, + new_omics_processing_id, raw_file_path) + + nom_activity_update = { "$set": { "id": nom_new_id , "has_output":[new_data_product_id], + "has_input":[new_raw_file_id], "was_informed_by": [new_omics_processing_id]} } + + nom_activities_set.update_one(nom_activities_doc, nom_activity_update) + +def mint_nmdc_id(type:NMDC_Types, how_many:int = 1) -> List[str]: + + config = yaml.safe_load(open('./config.yaml','r')) + client = oauthlib.oauth2.BackendApplicationClient(client_id=config['client_id']) + oauth = requests_oauthlib.OAuth2Session(client=client) + + token = oauth.fetch_token(token_url='https://api.microbiomedata.org/token', + client_id=config['client_id'], + client_secret=config['client_secret']) + + nmdc_mint_url = "https://api.microbiomedata.org/pids/mint" + + payload = NMDC_Mint(type, how_many) + + #response = s.post(nmdc_mint_url, data=payload.json, ) + 
#list_ids = response.json() + print(payload.json) + response = oauth.post(nmdc_mint_url, data=payload.json) + list_ids = response.json() + print(list_ids) + return list_ids + +def get_raw_data_object(file_path:Path, was_generated_by:str, + data_object_type:str, description:str) -> nmdc.DataObject: + + nmdc_id = mint_nmdc_id({'id': NMDC_Types.DataObject})[0] + + data_dict = { + 'id': nmdc_id, + "name": file_path.name, + "file_size_bytes": file_path.stat().st_size, + "md5_checksum": hashlib.md5(file_path.open('rb').read()).hexdigest(), + "was_generated_by": was_generated_by, #omics processing id + "data_object_type": data_object_type, + "description": description, + "type": "nmdc:DataObject" + } + + data_object = nmdc.DataObject(**data_dict) + + return data_object \ No newline at end of file From 4f627fd099e0059adf22292f24867af05b7a54d6 Mon Sep 17 00:00:00 2001 From: aclum Date: Mon, 23 Oct 2023 17:30:16 -0700 Subject: [PATCH 05/44] updating to prod napa ids, testing updates to omics_processing_records for gold:Gs0114663 --- nmdc_schema/connect_napa_mongo.py | 117 ++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 37 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index b151ba0630..d59bfaa293 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -35,8 +35,8 @@ mydb =client['nmdc'] #list database names -#for db in client.list_database_names(): -# print(db) +for db in client.list_database_names(): + print(db) #list collections #for coll in mydb.list_collection_names(): @@ -224,7 +224,7 @@ def update_studies_to_napa_standards(): f_omics_id_mapping = open("Gs0114663_omics_reid.txt", "w") f_omics_set_operation =open("Gs0114663_omics_set","w") -napa_study="nmdc:sty-12-85j6kq06" +napa_study='nmdc:sty-11-aygzgv51' for doc in omics_coll.find(Gs0114663_legacy_omics): #set list with value of napa study for part_of study_napa_list=[] @@ -237,55 +237,36 @@ def update_studies_to_napa_standards(): biosample=biosample.replace('GOLD','gold') target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} get_biosample=biosample_coll.find_one(target_has_input) - napa_biosample_inputs.append.get_biosample["id"] + napa_biosample_inputs.append(get_biosample["id"]) #set id and alternative ids target_omics={"id": doc["id"]} - #deal with gold omics identifiers + #deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot if (doc["id"].startswith('gold')): - alt_id_slot="gold_sequencing_project_identifiers" - alt_id=[] - alt_id.append.doc["id"] - if(doc["gold_sequencing_project_identifiers"]): - sorted_existing_gold_alt=doc["gold_sequencing_project_identifiers"].sort() - alt_id.sort() - if sorted_existing_gold_alt == alt_id: - update_alt= False - else: - print("gold alt ids exist but are not equal", doc["id"],doc["gold_sequencing_project_identifiers"]) - else: - alt_id=[] - alt_id.append.doc["id"] - update_alt=True + update_alt= False #deal with emsl omics identifiers elif (doc["id"].startswith(('emsl'))): alt_id_slot="alternative_identifiers" alt_id=[] - alt_id.append.doc["id"] - if(doc["alternative_identifiers"]): - sorted_existing_alt=doc["alternative_identifiers"].sort - alt_id.sort() - if sorted_existing_alt == alt_id: - update_alt=False - else: - print("emsl alt ids exist but are not equal", doc["id"],doc["alternative_identifiers"]) - else: - 
alt_id=[] - alt_id.append.doc["id"] - update_alt=True + alt_id.append(doc["id"]) + update_alt=True else: print("Not sure how to re-id omics_processing_set id ",doc["id"]) #set target update depending on if alt slot exists already or not if update_alt is True: - target_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":[study_napa_list], "has_input": [napa_biosample_inputs], alt_id_slot: [alt_id] }} + target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs, alt_id_slot: alt_id }} if update_alt is False: - target_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":[study_napa_list], "has_input": [napa_biosample_inputs]}} + target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs}} + omics_coll.update_one(target_omics,target_omics_update) class_legacy_napa="OmicsProcessing " + doc["id"] + " "+ omics_napa_ids[omics_counter] - print(class_legacy_napa) - print(target_update) - f_omics_id_mapping.write(class_legacy_napa) - #f_omics_set_operation.write(target_update) + #print(class_legacy_napa) + #print(target_update) + f_omics_id_mapping.write(class_legacy_napa + '\n') + # f_omics_set_operation.write(target_update + '\n') omics_counter=omics_counter+1 +f_omics_id_mapping.close() +f_omics_set_operation.close() + #f.close() #example regex #myquery = { "id": {"$regex" :"^gold*"}} @@ -295,5 +276,67 @@ def update_studies_to_napa_standards(): #collection =nmdc["study_set"] #study_count=nmdc.study_set.count() #print("The study count is:", study_count) +output_file=open("new_studies_brynn.txt","w") +for sty in studies: + get_sty=study_coll.find_one({"id":sty}) + print(get_sty) + +##get details on metap records with invalid urls +missing_urls=open('metap_missing_data_object_records.txt', 'r') +Lines = missing_urls.readlines() +missing_url_list=[] +for line in Lines: +# print(line.strip()) + select_dobj_target={"url":line.strip()} + print(select_dobj_target) + dobj_doc=dobj_coll.find_one(select_dobj_target) + print(dobj_doc) + missing_url_list.append(dobj_doc) +json_data = dumps(missing_url_list, indent = 2) + +with open('missing_data_objects.json', 'w') as file: + file.write(json_data) + +#update test biosample records to (Gs0114663) use prod minted IDs, not dev +with open("biosample_prod_Gs0114663.json", 'r') as j: + biosample_prod_napa_ids = json.loads(j.read()) +napa_study='nmdc:sty-12-85j6kq06' +f_biosample_prod_id_mapping = open("Gs0114663_biosample_reid.txt", "w") +Gs0114663_legacy_emsl_biosample={"part_of":"nmdc:sty-12-85j6kq06", "emsl_biosample_identifiers": { "$exists": True }} +Gs0114663_legacy_gold_biosample={"part_of":"nmdc:sty-12-85j6kq06", "emsl_biosample_identifiers": { "$exists": False }} +biosample_prod_counter=0 +for doc in biosample_coll.find(Gs0114663_legacy_emsl_biosample): + target_prod_update = { "$set": { "id": biosample_prod_napa_ids[biosample_prod_counter]}} + target_prod_biosample={"id": doc["id"]} + biosample_coll.update_one(target_prod_biosample,target_prod_update) + class_legacy_napa_Gs0114663="Biosample " + biosample_prod_napa_ids[biosample_prod_counter] + " "+ doc["emsl_biosample_identifiers"][0] + f_biosample_prod_id_mapping.write(class_legacy_napa_Gs0114663+ '\n') + biosample_prod_counter=biosample_prod_counter+1 + +for doc in biosample_coll.find(Gs0114663_legacy_gold_biosample): + target_prod_update = { "$set": { "id": biosample_prod_napa_ids[biosample_prod_counter]}} + 
target_prod_biosample={"id": doc["id"]} + biosample_coll.update_one(target_prod_biosample,target_prod_update) + class_legacy_napa="Biosample " + biosample_prod_napa_ids[biosample_prod_counter] + " "+ doc["gold_biosample_identifiers"][0] + f_biosample_prod_id_mapping.write(class_legacy_napa + '\n') + print(class_legacy_napa) + biosample_prod_counter=biosample_prod_counter+1 + +f_biosample_prod_id_mapping.close() +#end dev to prod ids for nmdc:sty-12-85j6kq06/gold:Gs0114663 + +#update nmdc:sty-12-85j6kq06 to a prod id +study_coll=mydb["study_set"] +select_dev_napa_study = {"id":"nmdc:sty-12-85j6kq06"} +napa_prod_study_update = { "$set": { "id": "nmdc:sty-11-aygzgv51" } } + + +study_coll.update_one(select_dev_napa_study, napa_prod_study_update) + +Gs0114663_dev_biosample={"part_of":"nmdc:sty-12-85j6kq06"} +for doc in biosample_coll.find(Gs0114663_dev_biosample): + target_prod_biosample={"id": doc["id"]} + fix_Gs0114663_biosample_part_of= { "$set": { "part_of": ["nmdc:sty-11-aygzgv51"]}} + biosample_coll.update_one(target_prod_biosample,fix_Gs0114663_biosample_part_of) From 8a118503619fdd692cde5fb132c28cd56aae76dc Mon Sep 17 00:00:00 2001 From: aclum Date: Tue, 24 Oct 2023 15:48:04 -0700 Subject: [PATCH 06/44] fix gold projects for Gs0114663 --- nmdc_schema/connect_napa_mongo.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index d59bfaa293..f0eb8a703b 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -267,6 +267,15 @@ def update_studies_to_napa_standards(): f_omics_id_mapping.close() f_omics_set_operation.close() +#update gold project ids for omics_records +Gs0114663_mgs=open("Gs0114663_mg_omics.txt", 'r') +for gold_sp_updates in Gs0114663_mgs: + gold_sp_info=gold_sp_updates.split() + gold_sp_list=[] + gold_sp_list.append(gold_sp_info[1]) + gold_sp_target_id={"id":gold_sp_info[2]} + gold_sp_target_update={ "$set": { "gold_sequencing_project_identifiers":gold_sp_list}} + omics_coll.update_one(gold_sp_target_id,gold_sp_target_update) #f.close() #example regex #myquery = { "id": {"$regex" :"^gold*"}} From afe423ef2e9f4e99ba1d8e30d004413e11e01fd3 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 26 Oct 2023 17:32:33 -0700 Subject: [PATCH 07/44] Adding code to remove WEA and data object records from projects that do not have upstream omics records --- nmdc_schema/connect_napa_mongo.py | 48 +++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index f0eb8a703b..c381f8d1c7 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -349,3 +349,51 @@ def update_studies_to_napa_standards(): fix_Gs0114663_biosample_part_of= { "$set": { "part_of": ["nmdc:sty-11-aygzgv51"]}} biosample_coll.update_one(target_prod_biosample,fix_Gs0114663_biosample_part_of) +#check lenght of gold project arrays +gold_project_array_lengths=[] +omics_with_gold_projects={'id':{'$regex':'^gold'},'gold_sequencing_project_identifiers':{'$exists':True}} +for doc in omics_coll.find(omics_with_gold_projects): + if (len(doc["gold_sequencing_project_identifiers"])) <1: + print(doc["id"] +" has an empty array: part of "+ doc["part_of"][0]) + elif (len(doc["gold_sequencing_project_identifiers"])) == 1: + len_array=1 + else: + print("length unclear") + +############### +#track down records WorkflowExecutionActivity (WEA) records that need to be deleted and their associated data objects + 
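The pass introduced in PATCH 07 below is destructive (it calls `delete_one`/`delete_many` on workflow activity and data object collections), so it can help to count what would be removed before running it. Here is a minimal dry-run sketch, separate from the committed code, assuming the same `mydb` pymongo database handle and the same `omics_records_to_delete.txt` file of GOLD project IDs used in the patch; it only counts matching documents and deletes nothing.

```python
# Dry-run: count what the deletion pass below would remove, without deleting.
# Assumes `mydb` is the pymongo Database handle used earlier in this script and
# that omics_records_to_delete.txt lists one GOLD project id per line.
seq_based_collection_list = [
    "read_qc_analysis_activity_set",
    "read_based_taxonomy_analysis_activity_set",
    "metagenome_assembly_set",
    "metagenome_annotation_activity_set",
    "mags_activity_set",
    "metatranscriptome_activity_set",
]

with open("omics_records_to_delete.txt") as fh:
    for line in fh:
        gold_project_id = line.strip()
        gold_proj_curie = "gold:" + gold_project_id
        # Workflow activity records are matched on was_informed_by.
        for collection_name in seq_based_collection_list:
            n_wea = mydb[collection_name].count_documents(
                {"was_informed_by": gold_proj_curie}
            )
            if n_wea:
                print(collection_name, gold_proj_curie, n_wea, "record(s)")
        # Data objects are matched by description, mirroring the delete_many below.
        n_dobj = mydb["data_object_set"].count_documents(
            {"description": {"$regex": gold_project_id}}
        )
        print("data_object_set", gold_project_id, n_dobj, "record(s)")
```

Running the counts first makes it easy to spot a project ID that would unexpectedly match nothing, or far more documents than expected, before any deletes are issued.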
+seq_based_collection_list=['read_qc_analysis_activity_set','read_based_taxonomy_analysis_activity_set','metagenome_assembly_set','metagenome_annotation_activity_set','mags_activity_set','metatranscriptome_activity_set'] + +#open file with list of omics records to delete, this list is derived from a rdf query to check for data refs +#that list was manually reviewed to determine which WEA to delete vs repair upstream records + +data_object_coll=mydb['data_object_set'] + +target_gp_for_del=open("omics_records_to_delete.txt", 'r') +for gp in target_gp_for_del: + gold_project_id=gp.strip() + gold_proj_curie='gold:'+gold_project_id + #check to make sure omics_processing_set record doesn't exist + if (omics_coll.find_one({'id':gold_proj_curie})): + print("omics processing set record exists for "+gold_proj_curie) + else: + for collection in seq_based_collection_list: + wea_coll=mydb[collection] + doc=wea_coll.find_one({'was_informed_by': gold_proj_curie}) + if (doc): + print("found "+doc["id"]+" in collection "+ collection) + #wea_to_delete.append(doc) + wea_coll.delete_one({'was_informed_by': gold_proj_curie}) +#this method should not be used as there are data objects that need to be removed that are not listed in has_output for the WEA records + #if "has_input" in doc.keys(): + # for input in doc["has_input"]: + # dobj_to_delete.append(input) + #if "has_output" in doc.keys(): + # for output in doc["has_output"]: + # dobj_to_delete.append(output) + else: + print("Could not find WEA records informed by "+gold_project_id+" in collection "+ collection) + data_object_coll.delete_many({'description':{'$regex':gold_project_id}}) +### +#end cleanup of omics records that don't exist From 316aca07fb44a894dd6b66a510dcc9f0bfbc55c8 Mon Sep 17 00:00:00 2001 From: aclum Date: Mon, 30 Oct 2023 13:02:43 -0700 Subject: [PATCH 08/44] refined updates to napa compliance biosample updates --- .../napa_study_biosample_omics_migration.py | 147 +++++++++++++----- 1 file changed, 108 insertions(+), 39 deletions(-) diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py index 9ff32701a2..3f5c5ee703 100644 --- a/nmdc_schema/napa_study_biosample_omics_migration.py +++ b/nmdc_schema/napa_study_biosample_omics_migration.py @@ -9,6 +9,11 @@ import pymongo from pymongo import MongoClient +#connect to napa mongo +napa_mongo_pw = os.environ['MONGO_NAPA_PW'] +napa_mongo='mongodb://root:'+napa_mongo_pw+'@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin' +client = MongoClient(napa_mongo) + #define variables for tables to update, assumes a mongo connection variable 'client' #set database name mydb =client['nmdc'] @@ -44,57 +49,121 @@ def update_studies_to_napa_standards(): print("Did not update issue updating ",sty_doc["id"]) ######################### -# -#function to update biosamples +##function to update biosamples #mint Class Study IDs using runtime API or manually using the minter endpoint #if reading minted IDs from a json file -sty_bsm_napa_json="XXXXXXXXXX" -with open(study_bsm_napa_json, 'r') as j: - bsm_napa_ids = json.loads(j.read()) def update_bsm_by_study(napa_sty_id): - bsm_reid_log=open(napa_sty_id"_bsm_update.txt","w") + bsm_reid_log=open(napa_sty_id + "_bsm_update.txt","w") bsm_counter=0 - legacy_sty=napa_sty_to_legacy(napa_sty_id) - - legacy_bsm={"part_of": legacy_sty, "id", {"$ne":"^nmdc:bsm"}} - for bsm_doc in bsm_coll.find(legacy_bsm): - #set value for part_of - sty_napa_list=[] - sty_napa_list.append(napa_sty_id) - 
target_bsm={"id": bsm_doc["id"]} - #alt id check function - #TODO - if update_alt is True: - bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":[sty_napa_list], alt_id_slot: [alt_id] }} - if update_alt is False: - bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":[sty_napa_list]}} - bsm_class_legacy_napa="Biosample " + bsm_doc["id"] + " "+ bsm_napa_ids[bsm_counter] - print(bsm_class_legacy_napa) - #perform biosample update - #bsm_coll.update_one(target_bsm,bsm_target_update) - bsm_reid_log.write(class_legacy_napa) - bsm_counter=bsm_counter+1 - else: - print("study id query returned something other than 1 document",napa_sty_id) - + bsm_alt_id_dict={'gold_biosample_identifiers':'gold:','igsn_biosample_identifiers':'igsn:','emsl_biosample_identifiers':'emsl:'} + legacy_sty=napa_sty_to_legacy(napa_sty_id) + with open(legacy_sty + "_bsm_napa.json", 'r') as j: + bsm_napa_ids = json.loads(j.read()) + legacy_bsm={"part_of": legacy_sty, "id": {"$ne":"^nmdc:bsm"}} + for bsm_doc in bsm_coll.find(legacy_bsm): + #set value for part_of + sty_napa_list=[] + sty_napa_list.append(napa_sty_id) + target_bsm={"id": bsm_doc["id"]} + #alt id check function + alt_id=[] + alt_id_slot_name='' + for alt_id_slot in bsm_alt_id_dict: + if bsm_doc["id"].startswith(bsm_alt_id_dict[alt_id_slot]): + alt_id_slot_name=alt_id_slot + if alt_id_slot_name in bsm_doc.keys(): + if len(bsm_doc[alt_id_slot_name]) == 0: + update_alt=True + alt_id.append(bsm_doc["id"]) + print ("will update alt id slot is empty"+alt_id_slot_name) + elif (len(bsm_doc[alt_id_slot_name]) == 1 and bsm_doc[alt_id_slot_name][0] == bsm_doc["id"]): + print(alt_id_slot+" already set for "+bsm_doc["id"]) + update_alt=False + else: + print("length of array for "+ alt_id_slot +"exists and is greater than 1") + update_alt=False + else: + update_alt=True + alt_id.append(bsm_doc["id"]) + print ("will update alt id b/c could not fine alt id") + break; + if update_alt: + bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":sty_napa_list, alt_id_slot_name: alt_id }} + elif not update_alt: + bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":sty_napa_list}} + else: + print("not sure how to make the biosample update for" + bsm_doc["id"]) + bsm_class_legacy_napa="Biosample " + bsm_doc["id"] + " "+ bsm_napa_ids[bsm_counter] + print(bsm_class_legacy_napa) + print(target_bsm) + print(bsm_target_update) + #perform biosample update + bsm_coll.update_one(target_bsm,bsm_target_update) + bsm_reid_log.write(bsm_class_legacy_napa) + bsm_counter=bsm_counter+1 + bsm_reid_log.close() ################ #function to get legacy study id from alt id slot def napa_sty_to_legacy(napa_sty_id): legacy_sty="" get_sty_record={"id":napa_sty_id} - target_sty=sty_coll.find(get_sty_record) - if target_sty == 1: - if (len(target_sty["gold_study_identifiers"]) ==1: - for alt_study in target_sty["gold_study_identifiers"]: - legacy_sty=alt_sty - return legacy_sty - else: - print("More than one GOLD study as alt id", target_sty["gold_study_identifiers"]) - + target_sty=sty_coll.find_one(get_sty_record) + if len(target_sty["gold_study_identifiers"]) ==1: + legacy_sty=target_sty["gold_study_identifiers"][0] + else: + print("More than one GOLD study as alt id", target_sty["gold_study_identifiers"]) + return legacy_sty ########################## #function to update omics records def update_omics_by_study(napa_sty_id): - + omics_coll=mydb["omics_processing_set"] + omics_counter=0 + 
legacy_sty=napa_sty_to_legacy(napa_sty_id) + legacy_omics={"part_of": legacy_sty, "id", {"$ne":"^nmdc:omprc"}} + with open(legacy_study"_omics_test.json", 'r') as j: + omics_napa_ids = json.loads(j.read()) + for doc in omics_coll.find(legacy_omics): + #set list with value of napa study for part_of + study_napa_list=[] + study_napa_list.append(napa_study) + #determine what has_input should be + if(isinstance(doc["has_input"],list)): + napa_biosample_inputs=[] + for biosample in doc["has_input"]: + if (biosample.startswith('GOLD')): + biosample=biosample.replace('GOLD','gold') + target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} + get_biosample=biosample_coll.find_one(target_has_input) + napa_biosample_inputs.append(get_biosample["id"]) + #set id and alternative ids + target_omics={"id": doc["id"]} + #deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot + if (doc["id"].startswith('gold')): + update_alt= False + #deal with emsl omics identifiers + elif (doc["id"].startswith(('emsl'))): + alt_id_slot="alternative_identifiers" + alt_id=[] + alt_id.append(doc["id"]) + update_alt=True + else: + print("Not sure how to re-id omics_processing_set id ",doc["id"]) + #set target update depending on if alt slot exists already or not + if update_alt is True: + target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs, alt_id_slot: alt_id }} + if update_alt is False: + target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs}} + omics_coll.update_one(target_omics,target_omics_update) + class_legacy_napa="OmicsProcessing " + doc["id"] + " "+ omics_napa_ids[omics_counter] + #print(class_legacy_napa) + #print(target_update) + f_omics_id_mapping.write(class_legacy_napa + '\n') + # f_omics_set_operation.write(target_update + '\n') + omics_counter=omics_counter+1 + f_omics_id_mapping.close() + f_omics_set_operation.close() + + From 8c937a446585be81548cdf404eac043afda28f79 Mon Sep 17 00:00:00 2001 From: aclum Date: Mon, 30 Oct 2023 17:20:11 -0700 Subject: [PATCH 09/44] working functions to reid biosample_set and omics_processing_set records from legacy to NMDC napa style identifiers --- .../napa_study_biosample_omics_migration.py | 70 +++++++++++-------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py index 3f5c5ee703..c578ad9834 100644 --- a/nmdc_schema/napa_study_biosample_omics_migration.py +++ b/nmdc_schema/napa_study_biosample_omics_migration.py @@ -55,10 +55,10 @@ def update_studies_to_napa_standards(): #if reading minted IDs from a json file def update_bsm_by_study(napa_sty_id): - bsm_reid_log=open(napa_sty_id + "_bsm_update.txt","w") bsm_counter=0 bsm_alt_id_dict={'gold_biosample_identifiers':'gold:','igsn_biosample_identifiers':'igsn:','emsl_biosample_identifiers':'emsl:'} legacy_sty=napa_sty_to_legacy(napa_sty_id) + bsm_reid_log=open(legacy_sty + "_bsm_reid.txt","w") with open(legacy_sty + "_bsm_napa.json", 'r') as j: bsm_napa_ids = json.loads(j.read()) legacy_bsm={"part_of": legacy_sty, "id": {"$ne":"^nmdc:bsm"}} @@ -101,7 +101,7 @@ def update_bsm_by_study(napa_sty_id): print(bsm_target_update) #perform biosample update 
bsm_coll.update_one(target_bsm,bsm_target_update) - bsm_reid_log.write(bsm_class_legacy_napa) + bsm_reid_log.write(bsm_class_legacy_napa + '\n') bsm_counter=bsm_counter+1 bsm_reid_log.close() ################ @@ -121,49 +121,59 @@ def napa_sty_to_legacy(napa_sty_id): def update_omics_by_study(napa_sty_id): omics_coll=mydb["omics_processing_set"] omics_counter=0 + omics_alt_id_dict={'gold_sequencing_project_identifiers':'gold:','alternative_identifiers':'emsl:'} legacy_sty=napa_sty_to_legacy(napa_sty_id) - legacy_omics={"part_of": legacy_sty, "id", {"$ne":"^nmdc:omprc"}} - with open(legacy_study"_omics_test.json", 'r') as j: + legacy_omics={"part_of": legacy_sty, "id": {"$ne":"^nmdc:omprc"}} + f_omics_id_mapping = open(legacy_sty+"_omics_reid.txt", "w") + with open(legacy_sty+"_omics_napa.json", 'r') as j: omics_napa_ids = json.loads(j.read()) - for doc in omics_coll.find(legacy_omics): + for omics_doc in omics_coll.find(legacy_omics): #set list with value of napa study for part_of study_napa_list=[] - study_napa_list.append(napa_study) + study_napa_list.append(napa_sty_id) #determine what has_input should be - if(isinstance(doc["has_input"],list)): + if(isinstance(omics_doc["has_input"],list)): napa_biosample_inputs=[] - for biosample in doc["has_input"]: - if (biosample.startswith('GOLD')): - biosample=biosample.replace('GOLD','gold') - target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} - get_biosample=biosample_coll.find_one(target_has_input) - napa_biosample_inputs.append(get_biosample["id"]) + for biosample in omics_doc["has_input"]: + biosample=biosample.replace('GOLD','gold') + target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} + get_biosample=bsm_coll.find_one(target_has_input) + napa_biosample_inputs.append(get_biosample["id"]) #set id and alternative ids - target_omics={"id": doc["id"]} + target_omics={"id": omics_doc["id"]} #deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot - if (doc["id"].startswith('gold')): - update_alt= False - #deal with emsl omics identifiers - elif (doc["id"].startswith(('emsl'))): - alt_id_slot="alternative_identifiers" - alt_id=[] - alt_id.append(doc["id"]) - update_alt=True - else: - print("Not sure how to re-id omics_processing_set id ",doc["id"]) + alt_omics_id=[] + for alt_omics_id_slot in omics_alt_id_dict: + if omics_doc["id"].startswith(omics_alt_id_dict[alt_omics_id_slot]): + if alt_omics_id_slot in omics_doc.keys(): + if len(omics_doc[alt_omics_id_slot]) == 0: + update_alt_omics=True + alt_omics_id.append(omics_doc["id"]) + target_alt_omics_slot=alt_omics_id_slot + print ("will update alt id slot is empty"+alt_id_slot_name) + elif (len(omics_doc[alt_omics_id_slot]) == 1 and omics_doc[alt_omics_id_slot][0] == omics_doc["id"]): + print(alt_omcs_id_slot+" already set for "+omics_doc["id"]) + update_alt_omics=False + else: + print("length of array for "+ alt_omics_id_slot +"exists and is greater than 1") + update_alt_omics=False + else: + update_alt_omics=True + alt_omics_id.append(omics_doc["id"]) + target_alt_omics_slot=alt_omics_id_slot + print ("will update alt id b/c could not find alt id") #set target update depending on if alt slot exists already or not - if update_alt is True: - target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], 
"part_of":study_napa_list, "has_input": napa_biosample_inputs, alt_id_slot: alt_id }} - if update_alt is False: + if update_alt_omics is True: + target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs, target_alt_omics_slot: alt_omics_id }} + if update_alt_omics is False: target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs}} + print(target_omics_update) omics_coll.update_one(target_omics,target_omics_update) - class_legacy_napa="OmicsProcessing " + doc["id"] + " "+ omics_napa_ids[omics_counter] + class_legacy_napa="OmicsProcessing " + omics_doc["id"] + " "+ omics_napa_ids[omics_counter] #print(class_legacy_napa) #print(target_update) f_omics_id_mapping.write(class_legacy_napa + '\n') - # f_omics_set_operation.write(target_update + '\n') omics_counter=omics_counter+1 f_omics_id_mapping.close() - f_omics_set_operation.close() From 37e84d39997bb8ddbc0362efe508e15963ddc1d8 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 17 Nov 2023 16:12:18 -0800 Subject: [PATCH 10/44] Fix typo in filename and format as Markdown --- nmdc_schema/napa_complaince.README | 6 ------ nmdc_schema/napa_compliance.README.md | 11 +++++++++++ 2 files changed, 11 insertions(+), 6 deletions(-) delete mode 100644 nmdc_schema/napa_complaince.README create mode 100644 nmdc_schema/napa_compliance.README.md diff --git a/nmdc_schema/napa_complaince.README b/nmdc_schema/napa_complaince.README deleted file mode 100644 index bd8e83193b..0000000000 --- a/nmdc_schema/napa_complaince.README +++ /dev/null @@ -1,6 +0,0 @@ -#install notes - -#for generating tokens needed to run -pip install python-dotenv -#use pip or conda to install pymong -pip install pymongo diff --git a/nmdc_schema/napa_compliance.README.md b/nmdc_schema/napa_compliance.README.md new file mode 100644 index 0000000000..be92b3af13 --- /dev/null +++ b/nmdc_schema/napa_compliance.README.md @@ -0,0 +1,11 @@ +# Napa Compliance + +## Installing Python packages + +```shell +# So Python scripts can read `.env` files. +pip install python-dotenv + +# So Python scripts can access Mongo databases. 
+pip install pymongo +``` From 045b2b0f931881435acb4eea6b2fd7e1cf2d587b Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 17 Nov 2023 16:15:06 -0800 Subject: [PATCH 11/44] Use `black` to format existing Python source code --- nmdc_schema/connect_napa_mongo.py | 663 ++++++++++-------- nmdc_schema/metab_id_refactor.py | 291 ++++---- .../napa_study_biosample_omics_migration.py | 362 ++++++---- nmdc_schema/runtime_api_operations.py | 76 +- 4 files changed, 793 insertions(+), 599 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index c381f8d1c7..77759fefbb 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -17,383 +17,474 @@ load_dotenv(envfile_path) -#nersc ssh tunnel required to connect to mongo -#ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov - -napa_mongo_pw = os.environ['MONGO_NAPA_PW'] -#print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) - -napa_mongo='mongodb://root:'+napa_mongo_pw+'@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin' -#connection = MongoClient() -#db = connection.napa_mongo +# nersc ssh tunnel required to connect to mongo +# ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov + +napa_mongo_pw = os.environ["MONGO_NAPA_PW"] +# print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) + +napa_mongo = ( + "mongodb://root:" + + napa_mongo_pw + + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin" +) +# connection = MongoClient() +# db = connection.napa_mongo print(napa_mongo) -#connect to mongo +# connect to mongo client = MongoClient(napa_mongo) -#set mongo database name to nmdc' -mydb =client['nmdc'] +# set mongo database name to nmdc' +mydb = client["nmdc"] -#list database names +# list database names for db in client.list_database_names(): - print(db) + print(db) -#list collections -#for coll in mydb.list_collection_names(): +# list collections +# for coll in mydb.list_collection_names(): # print(coll) -study_coll=mydb["study_set"] +study_coll = mydb["study_set"] -select_legacy_study = {"id":"gold:Gs0114663"} -napa_study_update = { "$set": { "id": "nmdc:sty-12-85j6kq06" } } +select_legacy_study = {"id": "gold:Gs0114663"} +napa_study_update = {"$set": {"id": "nmdc:sty-12-85j6kq06"}} study_coll.update_one(select_legacy_study, napa_study_update) -check_study_update = { "id":"nmdc:sty-12-85j6kq06"} -mydoc=study_coll.find(check_study_update) +check_study_update = {"id": "nmdc:sty-12-85j6kq06"} +mydoc = study_coll.find(check_study_update) ######################### -#generalized function to update study identifiers to napa format -#alt slots are already populated in all cases so logic for that is not needed +# generalized function to update study identifiers to napa format +# alt slots are already populated in all cases so logic for that is not needed + +# mint Class Study IDs using runtime API or manually using the minter endpoint +# if reading minted IDs from a json file +study_napa_json = "XXXXXXXXXX" +with open(study_napa_json, "r") as j: + study_napa_ids = json.loads(j.read()) -#mint Class Study IDs using runtime API or manually using the minter endpoint -#if reading minted IDs from a json file -study_napa_json="XXXXXXXXXX" -with open(study_napa_json, 'r') as j: - study_napa_ids = json.loads(j.read()) +# update_studies_to_napa_standards 
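The minted-ID files loaded with `json.loads` here (and the per-study `*_bsm_napa.json` / `*_omics_napa.json` files used by the later migration functions) are assumed to be flat JSON arrays of Napa identifiers that the re-ID loops consume positionally via a counter. A minimal sketch under that assumption; the filename below is hypothetical, since the real path is elided in the script:

```python
import json

# Hypothetical pre-minted ID file, e.g. ["nmdc:sty-11-abc12345", "nmdc:sty-11-xyz67890"],
# in the shape the re-ID loops expect: one new Napa ID per legacy record, indexed by a counter.
with open("study_napa_ids.json", "r") as f:
    study_napa_ids = json.loads(f.read())

# The loops that follow index this list and sanity-check the prefix before updating Mongo.
assert all(new_id.startswith("nmdc:sty") for new_id in study_napa_ids)
```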
-#update_studies_to_napa_standards def update_studies_to_napa_standards(): - study_reid_log=open("napa_study_update".txt,"w") - napa_study_counter=0 - get_legacy_studies={ "id" : {"$regex":"^gold" } } - for doc in study_coll.find(get_legacy_studies): - select_legacy_study = {"id": doc["id"]} - study_target_update = {"$set": { "id": napa_study_ids[napa_study_count] } } - if (napa_study_ids[napa_study_count].startswith('nmdc:sty')): - #study_coll.update_one(select_legacy_study,study_target_update) - study_class_legacy_napa="Study "+ doc["id"] + " " + napa_study_ids[napa_study_count] - print(study_class_legacy_napa) - study_reid_log.write(napa_study_update.txt) - napa_study_counter=napa_study_counter+1 - else: - print("Did not update issue updating ",doc["id"]) + study_reid_log = open("napa_study_update".txt, "w") + napa_study_counter = 0 + get_legacy_studies = {"id": {"$regex": "^gold"}} + for doc in study_coll.find(get_legacy_studies): + select_legacy_study = {"id": doc["id"]} + study_target_update = {"$set": {"id": napa_study_ids[napa_study_count]}} + if napa_study_ids[napa_study_count].startswith("nmdc:sty"): + # study_coll.update_one(select_legacy_study,study_target_update) + study_class_legacy_napa = ( + "Study " + doc["id"] + " " + napa_study_ids[napa_study_count] + ) + print(study_class_legacy_napa) + study_reid_log.write(napa_study_update.txt) + napa_study_counter = napa_study_counter + 1 + else: + print("Did not update issue updating ", doc["id"]) -######################### - +######################### -biosample_coll=mydb["biosample_set"] +biosample_coll = mydb["biosample_set"] ##needed only in testing, update values in gold alt id slots to lowercase. This has been updated in the production version of the schema already -biosample_fix_gold_case={"gold_biosample_identifiers":{"$regex":"^GOLD*"}} +biosample_fix_gold_case = {"gold_biosample_identifiers": {"$regex": "^GOLD*"}} for doc in biosample_coll.find(biosample_fix_gold_case): - if (len(doc["gold_biosample_identifiers"]) == 1): - gold_id_list=doc["gold_biosample_identifiers"] - case_fixed=gold_id_list[0].replace('GOLD','gold') - gold_fix_target_biosample={"id": doc["id"]} - gold_case_update={ "$set": { "gold_biosample_identifiers": case_fixed}} - #print("Set operation on ",gold_fix_target_biosample,gold_case_update) - biosample_coll.update_one(gold_fix_target_biosample,gold_case_update) - else: - print("There is more than one gold biosample for",doc["id"]) - -#fix the fact you did updated the gold biosample ids as a string and not a list -biosample_fix_gold_type={"gold_biosample_identifiers":{"$regex":"^gold*"}} + if len(doc["gold_biosample_identifiers"]) == 1: + gold_id_list = doc["gold_biosample_identifiers"] + case_fixed = gold_id_list[0].replace("GOLD", "gold") + gold_fix_target_biosample = {"id": doc["id"]} + gold_case_update = {"$set": {"gold_biosample_identifiers": case_fixed}} + # print("Set operation on ",gold_fix_target_biosample,gold_case_update) + biosample_coll.update_one(gold_fix_target_biosample, gold_case_update) + else: + print("There is more than one gold biosample for", doc["id"]) + +# fix the fact you did updated the gold biosample ids as a string and not a list +biosample_fix_gold_type = {"gold_biosample_identifiers": {"$regex": "^gold*"}} for doc in biosample_coll.find(biosample_fix_gold_type): - if(isinstance(doc["gold_biosample_identifiers"],str)): -# print("need to fix", doc["gold_biosample_identifiers"]) - update_gold_biosample=[] - update_gold_biosample.append(doc["gold_biosample_identifiers"]) - 
fix_gold_biosample_type_target={"id": doc["id"]} - gold_biosample_type_update={"$set": { "gold_biosample_identifiers": update_gold_biosample}} - biosample_coll.update_one(fix_gold_biosample_type_target,gold_biosample_type_update) - elif(isinstance(doc["gold_biosample_identifiers"],list)): - print("already the correct type ",doc["gold_biosample_identifiers"]) - else: - print("this record is type ", doc["gold_biosample_identifiers"],type(doc["gold_biosample_identifiers"])) - -study_fix_gold_case={"gold_study_identifiers":{"$regex":"^GOLD*"}} + if isinstance(doc["gold_biosample_identifiers"], str): + # print("need to fix", doc["gold_biosample_identifiers"]) + update_gold_biosample = [] + update_gold_biosample.append(doc["gold_biosample_identifiers"]) + fix_gold_biosample_type_target = {"id": doc["id"]} + gold_biosample_type_update = { + "$set": {"gold_biosample_identifiers": update_gold_biosample} + } + biosample_coll.update_one( + fix_gold_biosample_type_target, gold_biosample_type_update + ) + elif isinstance(doc["gold_biosample_identifiers"], list): + print("already the correct type ", doc["gold_biosample_identifiers"]) + else: + print( + "this record is type ", + doc["gold_biosample_identifiers"], + type(doc["gold_biosample_identifiers"]), + ) + +study_fix_gold_case = {"gold_study_identifiers": {"$regex": "^GOLD*"}} for doc in study_coll.find(study_fix_gold_case): - update_gold_study=[] - for gold_study in doc["gold_study_identifiers"]: - gold_study=gold_study.replace('GOLD','gold') - gold_study_target={"id": doc["id"]} - update_gold_study.append(gold_study) - #print(update_gold_study) - gold_fix_target_study={ "$set": {"gold_study_identifiers": update_gold_study}} - #print(gold_fix_target_study) - study_coll.update_one(gold_study_target,gold_fix_target_study) - -omics_coll=mydb["omics_processing_set"] -omics_processing_fix_gold_case={"gold_sequencing_project_identifiers":{"$regex":"^GOLD*"}} + update_gold_study = [] + for gold_study in doc["gold_study_identifiers"]: + gold_study = gold_study.replace("GOLD", "gold") + gold_study_target = {"id": doc["id"]} + update_gold_study.append(gold_study) + # print(update_gold_study) + gold_fix_target_study = {"$set": {"gold_study_identifiers": update_gold_study}} + # print(gold_fix_target_study) + study_coll.update_one(gold_study_target, gold_fix_target_study) + +omics_coll = mydb["omics_processing_set"] +omics_processing_fix_gold_case = { + "gold_sequencing_project_identifiers": {"$regex": "^GOLD*"} +} for doc in omics_coll.find(omics_processing_fix_gold_case): - update_gold_project=[] + update_gold_project = [] for gold_project in doc["gold_sequencing_project_identifiers"]: - gold_project=gold_project.replace('GOLD','gold') + gold_project = gold_project.replace("GOLD", "gold") update_gold_project.append(gold_project) - gold_project_target={"id": doc["id"]} - gold_fix_target_omics={ "$set": {"gold_sequencing_project_identifiers":update_gold_project}} - #print(gold_project_target,gold_fix_target_omics) - omics_coll.update_one(gold_project_target,gold_fix_target_omics) - -#end section only needed for testing + gold_project_target = {"id": doc["id"]} + gold_fix_target_omics = { + "$set": {"gold_sequencing_project_identifiers": update_gold_project} + } + # print(gold_project_target,gold_fix_target_omics) + omics_coll.update_one(gold_project_target, gold_fix_target_omics) + +# end section only needed for testing ################################################################################## -#for x in mydoc: +# for x in mydoc: # print(x) 
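The same repair — a bare string stored where the schema expects a list of identifiers — recurs below for `emsl_biosample_identifiers`. A hedged, generic sketch of that pattern (the helper name is not part of the original scripts, and it uses a server-side `$type` filter in place of the `isinstance` checks above):

```python
from pymongo.collection import Collection


def listify_identifier_slot(coll: Collection, slot: str) -> int:
    """Wrap scalar string values of `slot` in a single-element list."""
    fixed = 0
    # Match documents where the slot was written as a plain string rather than an array.
    for doc in coll.find({slot: {"$type": "string"}}):
        coll.update_one({"id": doc["id"]}, {"$set": {slot: [doc[slot]]}})
        fixed += 1
    return fixed


# e.g. listify_identifier_slot(biosample_coll, "gold_biosample_identifiers")
# or   listify_identifier_slot(biosample_coll, "emsl_biosample_identifiers")
```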
-select_biosample_part_of = {"part_of": {"$regex" :"Gs0114663$"}} -napa_biosample_part_of = { "$set": { "part_of": "nmdc:sty-12-85j6kq06"}} -part_of_biosample_update=biosample_coll.update_many(select_biosample_part_of,napa_biosample_part_of) +select_biosample_part_of = {"part_of": {"$regex": "Gs0114663$"}} +napa_biosample_part_of = {"$set": {"part_of": "nmdc:sty-12-85j6kq06"}} +part_of_biosample_update = biosample_coll.update_many( + select_biosample_part_of, napa_biosample_part_of +) print(part_of_biosample_update.modified_count, "documents_updated.") -#fix part_of for study Gs0114663, this needs to be an array -fix_select_biosample_part_of ={"part_of":"nmdc:sty-12-85j6kq06"} -fix_napa_biosample_part_of= { "$set": { "part_of": ["nmdc:sty-12-85j6kq06"]}} -fix_part_of_biosample_update=biosample_coll.update_many(fix_select_biosample_part_of,fix_napa_biosample_part_of) +# fix part_of for study Gs0114663, this needs to be an array +fix_select_biosample_part_of = {"part_of": "nmdc:sty-12-85j6kq06"} +fix_napa_biosample_part_of = {"$set": {"part_of": ["nmdc:sty-12-85j6kq06"]}} +fix_part_of_biosample_update = biosample_coll.update_many( + fix_select_biosample_part_of, fix_napa_biosample_part_of +) print(fix_part_of_biosample_update.modified_count, "documents_updated.") -#mint 85 biosample identifiers -#manually created this file when testing for Gs0114663 - -#update alt biosample ids -biosample_alt_emsl= {"part_of": "nmdc:sty-12-85j6kq06","emsl_biosample_identifiers":{"$exists":False},"id":{"$regex":"^emsl*"}} - +# mint 85 biosample identifiers +# manually created this file when testing for Gs0114663 - +# update alt biosample ids +biosample_alt_emsl = { + "part_of": "nmdc:sty-12-85j6kq06", + "emsl_biosample_identifiers": {"$exists": False}, + "id": {"$regex": "^emsl*"}, +} for doc in biosample_coll.find(biosample_alt_emsl): -# print(doc["id"]) - target_biosample={"id": doc["id"]} - target_update = { "$set": { "emsl_biosample_identifiers": doc["id"] } } - biosample_coll.update_one(target_biosample,target_update) + # print(doc["id"]) + target_biosample = {"id": doc["id"]} + target_update = {"$set": {"emsl_biosample_identifiers": doc["id"]}} + biosample_coll.update_one(target_biosample, target_update) -#fix updates to setting emsl biosample id slot as string instead of list -fix_type_emsl_alt_biosample={"part_of": "nmdc:sty-12-85j6kq06","emsl_biosample_identifiers":{"$exists":True}} +# fix updates to setting emsl biosample id slot as string instead of list +fix_type_emsl_alt_biosample = { + "part_of": "nmdc:sty-12-85j6kq06", + "emsl_biosample_identifiers": {"$exists": True}, +} for doc in biosample_coll.find(fix_type_emsl_alt_biosample): - if(isinstance(doc["emsl_biosample_identifiers"],str)): -# print("need to fix", doc["emsl_biosample_identifiers"]) - update_emsl_biosample=[] - update_emsl_biosample.append(doc["emsl_biosample_identifiers"]) - fix_emsl_biosample_type_target={"id": doc["id"]} - emsl_biosample_type_update={"$set": { "emsl_biosample_identifiers": update_emsl_biosample}} - biosample_coll.update_one(fix_emsl_biosample_type_target,emsl_biosample_type_update) - #print(fix_emsl_biosample_type_target,emsl_biosample_type_update) + if isinstance(doc["emsl_biosample_identifiers"], str): + # print("need to fix", doc["emsl_biosample_identifiers"]) + update_emsl_biosample = [] + update_emsl_biosample.append(doc["emsl_biosample_identifiers"]) + fix_emsl_biosample_type_target = {"id": doc["id"]} + emsl_biosample_type_update = { + "$set": {"emsl_biosample_identifiers": update_emsl_biosample} + } + 
biosample_coll.update_one( + fix_emsl_biosample_type_target, emsl_biosample_type_update + ) + # print(fix_emsl_biosample_type_target,emsl_biosample_type_update) else: - print("this record is type ", doc["emsl_biosample_identifiers"],type(doc["emsl_biosample_identifiers"])) + print( + "this record is type ", + doc["emsl_biosample_identifiers"], + type(doc["emsl_biosample_identifiers"]), + ) -with open("napa_biosample_test.json", 'r') as j: - biosample_napa_ids = json.loads(j.read()) +with open("napa_biosample_test.json", "r") as j: + biosample_napa_ids = json.loads(j.read()) +Gs0114663_legacy_biosamples = {"part_of": "nmdc:sty-12-85j6kq06"} +# f = open("Gs0114663_reid.txt", "a") -Gs0114663_legacy_biosamples={"part_of": "nmdc:sty-12-85j6kq06"} -#f = open("Gs0114663_reid.txt", "a") - -biosample_counter=0 +biosample_counter = 0 for doc in biosample_coll.find(Gs0114663_legacy_biosamples): - target_biosample={"id": doc["id"]} - target_update = { "$set": { "id": biosample_napa_ids[biosample_counter]}} - print("Biosample ",target_biosample,target_update) - biosample_coll.update_one(target_biosample,target_update) - biosample_counter=biosample_counter+1 -# f.write("Biosample "+ doc["id"]+ " "+ biosample_napa_ids[biosample_counter]) + target_biosample = {"id": doc["id"]} + target_update = {"$set": {"id": biosample_napa_ids[biosample_counter]}} + print("Biosample ", target_biosample, target_update) + biosample_coll.update_one(target_biosample, target_update) + biosample_counter = biosample_counter + 1 +# f.write("Biosample "+ doc["id"]+ " "+ biosample_napa_ids[biosample_counter]) -#update omics_processing_set records for gold:Gs0114663 -omics_coll=mydb["omics_processing_set"] +# update omics_processing_set records for gold:Gs0114663 +omics_coll = mydb["omics_processing_set"] -Gs0114663_legacy_omics={"part_of":"gold:Gs0114663"} +Gs0114663_legacy_omics = {"part_of": "gold:Gs0114663"} -#mint 479 napa omics ids manually +# mint 479 napa omics ids manually -#read omics_ids into a python list -with open("napa_omics_test.json", 'r') as j: - omics_napa_ids = json.loads(j.read()) +# read omics_ids into a python list +with open("napa_omics_test.json", "r") as j: + omics_napa_ids = json.loads(j.read()) -omics_counter=0 +omics_counter = 0 f_omics_id_mapping = open("Gs0114663_omics_reid.txt", "w") -f_omics_set_operation =open("Gs0114663_omics_set","w") +f_omics_set_operation = open("Gs0114663_omics_set", "w") -napa_study='nmdc:sty-11-aygzgv51' +napa_study = "nmdc:sty-11-aygzgv51" for doc in omics_coll.find(Gs0114663_legacy_omics): - #set list with value of napa study for part_of - study_napa_list=[] - study_napa_list.append(napa_study) - #determine what has_input should be - if(isinstance(doc["has_input"],list)): - napa_biosample_inputs=[] - for biosample in doc["has_input"]: - if (biosample.startswith('GOLD')): - biosample=biosample.replace('GOLD','gold') - target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} - get_biosample=biosample_coll.find_one(target_has_input) - napa_biosample_inputs.append(get_biosample["id"]) - #set id and alternative ids - target_omics={"id": doc["id"]} - #deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot - if (doc["id"].startswith('gold')): - update_alt= False - #deal with emsl omics identifiers - elif (doc["id"].startswith(('emsl'))): - alt_id_slot="alternative_identifiers" - alt_id=[] - 
alt_id.append(doc["id"]) - update_alt=True - else: - print("Not sure how to re-id omics_processing_set id ",doc["id"]) - #set target update depending on if alt slot exists already or not - if update_alt is True: - target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs, alt_id_slot: alt_id }} - if update_alt is False: - target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs}} - omics_coll.update_one(target_omics,target_omics_update) - class_legacy_napa="OmicsProcessing " + doc["id"] + " "+ omics_napa_ids[omics_counter] - #print(class_legacy_napa) - #print(target_update) - f_omics_id_mapping.write(class_legacy_napa + '\n') - # f_omics_set_operation.write(target_update + '\n') - omics_counter=omics_counter+1 + # set list with value of napa study for part_of + study_napa_list = [] + study_napa_list.append(napa_study) + # determine what has_input should be + if isinstance(doc["has_input"], list): + napa_biosample_inputs = [] + for biosample in doc["has_input"]: + if biosample.startswith("GOLD"): + biosample = biosample.replace("GOLD", "gold") + target_has_input = { + "$or": [ + {"emsl_biosample_identifiers": biosample}, + {"gold_biosample_identifiers": biosample}, + {"insdc_biosample_identifiers": biosample}, + ] + } + get_biosample = biosample_coll.find_one(target_has_input) + napa_biosample_inputs.append(get_biosample["id"]) + # set id and alternative ids + target_omics = {"id": doc["id"]} + # deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot + if doc["id"].startswith("gold"): + update_alt = False + # deal with emsl omics identifiers + elif doc["id"].startswith(("emsl")): + alt_id_slot = "alternative_identifiers" + alt_id = [] + alt_id.append(doc["id"]) + update_alt = True + else: + print("Not sure how to re-id omics_processing_set id ", doc["id"]) + # set target update depending on if alt slot exists already or not + if update_alt is True: + target_omics_update = { + "$set": { + "id": omics_napa_ids[omics_counter], + "part_of": study_napa_list, + "has_input": napa_biosample_inputs, + alt_id_slot: alt_id, + } + } + if update_alt is False: + target_omics_update = { + "$set": { + "id": omics_napa_ids[omics_counter], + "part_of": study_napa_list, + "has_input": napa_biosample_inputs, + } + } + omics_coll.update_one(target_omics, target_omics_update) + class_legacy_napa = ( + "OmicsProcessing " + doc["id"] + " " + omics_napa_ids[omics_counter] + ) + # print(class_legacy_napa) + # print(target_update) + f_omics_id_mapping.write(class_legacy_napa + "\n") + # f_omics_set_operation.write(target_update + '\n') + omics_counter = omics_counter + 1 f_omics_id_mapping.close() f_omics_set_operation.close() -#update gold project ids for omics_records -Gs0114663_mgs=open("Gs0114663_mg_omics.txt", 'r') +# update gold project ids for omics_records +Gs0114663_mgs = open("Gs0114663_mg_omics.txt", "r") for gold_sp_updates in Gs0114663_mgs: - gold_sp_info=gold_sp_updates.split() - gold_sp_list=[] - gold_sp_list.append(gold_sp_info[1]) - gold_sp_target_id={"id":gold_sp_info[2]} - gold_sp_target_update={ "$set": { "gold_sequencing_project_identifiers":gold_sp_list}} - omics_coll.update_one(gold_sp_target_id,gold_sp_target_update) -#f.close() -#example regex -#myquery = { "id": {"$regex" :"^gold*"}} - -#mydatabase = client.nmdc -#print(mydatabase) -#collection 
=nmdc["study_set"] -#study_count=nmdc.study_set.count() -#print("The study count is:", study_count) -output_file=open("new_studies_brynn.txt","w") + gold_sp_info = gold_sp_updates.split() + gold_sp_list = [] + gold_sp_list.append(gold_sp_info[1]) + gold_sp_target_id = {"id": gold_sp_info[2]} + gold_sp_target_update = { + "$set": {"gold_sequencing_project_identifiers": gold_sp_list} + } + omics_coll.update_one(gold_sp_target_id, gold_sp_target_update) +# f.close() +# example regex +# myquery = { "id": {"$regex" :"^gold*"}} + +# mydatabase = client.nmdc +# print(mydatabase) +# collection =nmdc["study_set"] +# study_count=nmdc.study_set.count() +# print("The study count is:", study_count) +output_file = open("new_studies_brynn.txt", "w") for sty in studies: - get_sty=study_coll.find_one({"id":sty}) - print(get_sty) + get_sty = study_coll.find_one({"id": sty}) + print(get_sty) ##get details on metap records with invalid urls -missing_urls=open('metap_missing_data_object_records.txt', 'r') +missing_urls = open("metap_missing_data_object_records.txt", "r") Lines = missing_urls.readlines() -missing_url_list=[] +missing_url_list = [] for line in Lines: -# print(line.strip()) - select_dobj_target={"url":line.strip()} - print(select_dobj_target) - dobj_doc=dobj_coll.find_one(select_dobj_target) - print(dobj_doc) - missing_url_list.append(dobj_doc) -json_data = dumps(missing_url_list, indent = 2) - -with open('missing_data_objects.json', 'w') as file: - file.write(json_data) - -#update test biosample records to (Gs0114663) use prod minted IDs, not dev -with open("biosample_prod_Gs0114663.json", 'r') as j: - biosample_prod_napa_ids = json.loads(j.read()) -napa_study='nmdc:sty-12-85j6kq06' + # print(line.strip()) + select_dobj_target = {"url": line.strip()} + print(select_dobj_target) + dobj_doc = dobj_coll.find_one(select_dobj_target) + print(dobj_doc) + missing_url_list.append(dobj_doc) +json_data = dumps(missing_url_list, indent=2) + +with open("missing_data_objects.json", "w") as file: + file.write(json_data) + +# update test biosample records to (Gs0114663) use prod minted IDs, not dev +with open("biosample_prod_Gs0114663.json", "r") as j: + biosample_prod_napa_ids = json.loads(j.read()) +napa_study = "nmdc:sty-12-85j6kq06" f_biosample_prod_id_mapping = open("Gs0114663_biosample_reid.txt", "w") -Gs0114663_legacy_emsl_biosample={"part_of":"nmdc:sty-12-85j6kq06", "emsl_biosample_identifiers": { "$exists": True }} -Gs0114663_legacy_gold_biosample={"part_of":"nmdc:sty-12-85j6kq06", "emsl_biosample_identifiers": { "$exists": False }} -biosample_prod_counter=0 +Gs0114663_legacy_emsl_biosample = { + "part_of": "nmdc:sty-12-85j6kq06", + "emsl_biosample_identifiers": {"$exists": True}, +} +Gs0114663_legacy_gold_biosample = { + "part_of": "nmdc:sty-12-85j6kq06", + "emsl_biosample_identifiers": {"$exists": False}, +} +biosample_prod_counter = 0 for doc in biosample_coll.find(Gs0114663_legacy_emsl_biosample): - target_prod_update = { "$set": { "id": biosample_prod_napa_ids[biosample_prod_counter]}} - target_prod_biosample={"id": doc["id"]} - biosample_coll.update_one(target_prod_biosample,target_prod_update) - class_legacy_napa_Gs0114663="Biosample " + biosample_prod_napa_ids[biosample_prod_counter] + " "+ doc["emsl_biosample_identifiers"][0] - f_biosample_prod_id_mapping.write(class_legacy_napa_Gs0114663+ '\n') - biosample_prod_counter=biosample_prod_counter+1 + target_prod_update = { + "$set": {"id": biosample_prod_napa_ids[biosample_prod_counter]} + } + target_prod_biosample = {"id": doc["id"]} + 
biosample_coll.update_one(target_prod_biosample, target_prod_update) + class_legacy_napa_Gs0114663 = ( + "Biosample " + + biosample_prod_napa_ids[biosample_prod_counter] + + " " + + doc["emsl_biosample_identifiers"][0] + ) + f_biosample_prod_id_mapping.write(class_legacy_napa_Gs0114663 + "\n") + biosample_prod_counter = biosample_prod_counter + 1 for doc in biosample_coll.find(Gs0114663_legacy_gold_biosample): - target_prod_update = { "$set": { "id": biosample_prod_napa_ids[biosample_prod_counter]}} - target_prod_biosample={"id": doc["id"]} - biosample_coll.update_one(target_prod_biosample,target_prod_update) - class_legacy_napa="Biosample " + biosample_prod_napa_ids[biosample_prod_counter] + " "+ doc["gold_biosample_identifiers"][0] - f_biosample_prod_id_mapping.write(class_legacy_napa + '\n') - print(class_legacy_napa) - biosample_prod_counter=biosample_prod_counter+1 - -f_biosample_prod_id_mapping.close() -#end dev to prod ids for nmdc:sty-12-85j6kq06/gold:Gs0114663 - -#update nmdc:sty-12-85j6kq06 to a prod id -study_coll=mydb["study_set"] -select_dev_napa_study = {"id":"nmdc:sty-12-85j6kq06"} -napa_prod_study_update = { "$set": { "id": "nmdc:sty-11-aygzgv51" } } + target_prod_update = { + "$set": {"id": biosample_prod_napa_ids[biosample_prod_counter]} + } + target_prod_biosample = {"id": doc["id"]} + biosample_coll.update_one(target_prod_biosample, target_prod_update) + class_legacy_napa = ( + "Biosample " + + biosample_prod_napa_ids[biosample_prod_counter] + + " " + + doc["gold_biosample_identifiers"][0] + ) + f_biosample_prod_id_mapping.write(class_legacy_napa + "\n") + print(class_legacy_napa) + biosample_prod_counter = biosample_prod_counter + 1 + +f_biosample_prod_id_mapping.close() +# end dev to prod ids for nmdc:sty-12-85j6kq06/gold:Gs0114663 + +# update nmdc:sty-12-85j6kq06 to a prod id +study_coll = mydb["study_set"] +select_dev_napa_study = {"id": "nmdc:sty-12-85j6kq06"} +napa_prod_study_update = {"$set": {"id": "nmdc:sty-11-aygzgv51"}} study_coll.update_one(select_dev_napa_study, napa_prod_study_update) -Gs0114663_dev_biosample={"part_of":"nmdc:sty-12-85j6kq06"} +Gs0114663_dev_biosample = {"part_of": "nmdc:sty-12-85j6kq06"} for doc in biosample_coll.find(Gs0114663_dev_biosample): - target_prod_biosample={"id": doc["id"]} - fix_Gs0114663_biosample_part_of= { "$set": { "part_of": ["nmdc:sty-11-aygzgv51"]}} - biosample_coll.update_one(target_prod_biosample,fix_Gs0114663_biosample_part_of) - -#check lenght of gold project arrays -gold_project_array_lengths=[] -omics_with_gold_projects={'id':{'$regex':'^gold'},'gold_sequencing_project_identifiers':{'$exists':True}} + target_prod_biosample = {"id": doc["id"]} + fix_Gs0114663_biosample_part_of = {"$set": {"part_of": ["nmdc:sty-11-aygzgv51"]}} + biosample_coll.update_one(target_prod_biosample, fix_Gs0114663_biosample_part_of) + +# check lenght of gold project arrays +gold_project_array_lengths = [] +omics_with_gold_projects = { + "id": {"$regex": "^gold"}, + "gold_sequencing_project_identifiers": {"$exists": True}, +} for doc in omics_coll.find(omics_with_gold_projects): - if (len(doc["gold_sequencing_project_identifiers"])) <1: - print(doc["id"] +" has an empty array: part of "+ doc["part_of"][0]) - elif (len(doc["gold_sequencing_project_identifiers"])) == 1: - len_array=1 - else: - print("length unclear") + if (len(doc["gold_sequencing_project_identifiers"])) < 1: + print(doc["id"] + " has an empty array: part of " + doc["part_of"][0]) + elif (len(doc["gold_sequencing_project_identifiers"])) == 1: + len_array = 1 + else: + 
print("length unclear") ############### -#track down records WorkflowExecutionActivity (WEA) records that need to be deleted and their associated data objects +# track down records WorkflowExecutionActivity (WEA) records that need to be deleted and their associated data objects -seq_based_collection_list=['read_qc_analysis_activity_set','read_based_taxonomy_analysis_activity_set','metagenome_assembly_set','metagenome_annotation_activity_set','mags_activity_set','metatranscriptome_activity_set'] +seq_based_collection_list = [ + "read_qc_analysis_activity_set", + "read_based_taxonomy_analysis_activity_set", + "metagenome_assembly_set", + "metagenome_annotation_activity_set", + "mags_activity_set", + "metatranscriptome_activity_set", +] -#open file with list of omics records to delete, this list is derived from a rdf query to check for data refs -#that list was manually reviewed to determine which WEA to delete vs repair upstream records +# open file with list of omics records to delete, this list is derived from a rdf query to check for data refs +# that list was manually reviewed to determine which WEA to delete vs repair upstream records -data_object_coll=mydb['data_object_set'] +data_object_coll = mydb["data_object_set"] -target_gp_for_del=open("omics_records_to_delete.txt", 'r') +target_gp_for_del = open("omics_records_to_delete.txt", "r") for gp in target_gp_for_del: - gold_project_id=gp.strip() - gold_proj_curie='gold:'+gold_project_id - #check to make sure omics_processing_set record doesn't exist - if (omics_coll.find_one({'id':gold_proj_curie})): - print("omics processing set record exists for "+gold_proj_curie) - else: - for collection in seq_based_collection_list: - wea_coll=mydb[collection] - doc=wea_coll.find_one({'was_informed_by': gold_proj_curie}) - if (doc): - print("found "+doc["id"]+" in collection "+ collection) - #wea_to_delete.append(doc) - wea_coll.delete_one({'was_informed_by': gold_proj_curie}) -#this method should not be used as there are data objects that need to be removed that are not listed in has_output for the WEA records - #if "has_input" in doc.keys(): - # for input in doc["has_input"]: - # dobj_to_delete.append(input) - #if "has_output" in doc.keys(): - # for output in doc["has_output"]: - # dobj_to_delete.append(output) - else: - print("Could not find WEA records informed by "+gold_project_id+" in collection "+ collection) - data_object_coll.delete_many({'description':{'$regex':gold_project_id}}) + gold_project_id = gp.strip() + gold_proj_curie = "gold:" + gold_project_id + # check to make sure omics_processing_set record doesn't exist + if omics_coll.find_one({"id": gold_proj_curie}): + print("omics processing set record exists for " + gold_proj_curie) + else: + for collection in seq_based_collection_list: + wea_coll = mydb[collection] + doc = wea_coll.find_one({"was_informed_by": gold_proj_curie}) + if doc: + print("found " + doc["id"] + " in collection " + collection) + # wea_to_delete.append(doc) + wea_coll.delete_one({"was_informed_by": gold_proj_curie}) + # this method should not be used as there are data objects that need to be removed that are not listed in has_output for the WEA records + # if "has_input" in doc.keys(): + # for input in doc["has_input"]: + # dobj_to_delete.append(input) + # if "has_output" in doc.keys(): + # for output in doc["has_output"]: + # dobj_to_delete.append(output) + else: + print( + "Could not find WEA records informed by " + + gold_project_id + + " in collection " + + collection + ) + 
data_object_coll.delete_many({"description": {"$regex": gold_project_id}}) ### -#end cleanup of omics records that don't exist +# end cleanup of omics records that don't exist diff --git a/nmdc_schema/metab_id_refactor.py b/nmdc_schema/metab_id_refactor.py index d58836cd35..0cc8c2ff6c 100644 --- a/nmdc_schema/metab_id_refactor.py +++ b/nmdc_schema/metab_id_refactor.py @@ -21,46 +21,52 @@ envfile_path = "../../.env.client" load_dotenv(envfile_path) -#nersc ssh tunnel required to connect to mongo -#ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov +# nersc ssh tunnel required to connect to mongo +# ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov -napa_mongo_pw = os.environ.get('MONGO_NAPA_PW') or "safeguard-wipe-scanner-78" -#print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) +napa_mongo_pw = os.environ.get("MONGO_NAPA_PW") or "safeguard-wipe-scanner-78" +# print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) print(napa_mongo_pw) -napa_mongo='mongodb://root:'+napa_mongo_pw+'@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin' -#connection = MongoClient() -#db = connection.napa_mongo +napa_mongo = ( + "mongodb://root:" + + napa_mongo_pw + + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin" +) +# connection = MongoClient() +# db = connection.napa_mongo print(napa_mongo) -#connect to mongo +# connect to mongo client = MongoClient(napa_mongo) -#set mongo database name to nmdc' -mydb =client['nmdc'] +# set mongo database name to nmdc' +mydb = client["nmdc"] -#list database names -#for db in client.list_database_names(): +# list database names +# for db in client.list_database_names(): # print(db) -#list collections -#for coll in mydb.list_collection_names(): +# list collections +# for coll in mydb.list_collection_names(): # print(coll) -# omicsProcessing update, has_output --> raw data +# omicsProcessing update, has_output --> raw data # omicsProcessing update, alternative_identifier --> nom_analysis_activity.was_informed_by -# nom_analysis_activity --> has_input (new raw file or update ID) + +# nom_analysis_activity --> has_input (new raw file or update ID) # nom_analysis_activity --> has_output (data product file, update ID) # nom_analysis_activity --> replace ids # nom_analysis_activity --> was_informed_by -- id from alternative indetifier omics Processing # dataObject --> replace id, and add alternative identifier, emsl:60592345 @dataclass class NMDC_Mint: - - schema_class: dict = field(default_factory= lambda: { - 'schema': None, - }) - how_many:int = 1 + schema_class: dict = field( + default_factory=lambda: { + "schema": None, + } + ) + how_many: int = 1 @property def __dict__(self): @@ -69,124 +75,159 @@ def __dict__(self): @property def json(self): return dumps(self.__dict__) - -@dataclass -class DataObject: - nom_raw_data_object_type:str = "Direct Infusion FT ICR-MS Raw Data" - nom_raw_data_object_description:str = "Raw 21T Direct Infusion Data" - nom_dp_data_object_type:str = "FT ICR-MS Analysis Results" - nom_dp_data_object_description:str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details" - -@dataclass -class NMDC_Types: - - BioSample:str = "nmdc:Biosample" - OmicsProcessing:str = "nmdc:OmicsProcessing" - NomAnalysisActivity:str = "nmdc:NomAnalysisActivity" - DataObject:str = 
"nmdc:DataObject" - -def update_data_products(nom_activities_doc, new_raw_file_id:str, - new_data_product_id:str, omics_prcessing_id:str, raw_file_path:Path=None): - - raw_file_id = nom_activities_doc.has_input[0] - - dataproduct_id = nom_activities_doc.has_input[0] - - data_object_set = mydb['data_object_set'] - - get_raw_file_data_object = { "id" : raw_file_id } - get_data_product_data_object = { "id" : dataproduct_id } - - raw_object_docs = [raw_objectdata_doc for raw_objectdata_doc in data_object_set.find(get_raw_file_data_object)] - - if raw_object_docs: - - raw_object_update = { "$set": { "id": new_raw_file_id, 'alternative_identifier': [omics_prcessing_id]} } - - data_object_set.update_one(raw_object_docs[0], raw_object_update ) - - else: - - new_raw_data_object = get_raw_data_object(raw_file_path, - was_generated_by=omics_prcessing_id, - data_object_type =DataObject.nom_raw_data_object_type, - description =DataObject.nom_raw_data_object_description) - - data_object_set.insert_one(new_raw_data_object) - - for data_product_objectdata_doc in data_object_set.find(get_data_product_data_object): - - data_product_object_update = { "$set": { "id": new_data_product_id}} - - data_object_set.update_one(data_product_objectdata_doc, data_product_object_update ) -def update_omics_processing(nom_new_id, new_data_product_id, new_raw_file_id, raw_file_path=None): - omics_processing_set = mydb['omics_processing_set'] - - nom_activities_set = mydb['nom_analysis_activity_set'] - - get_old_activities={ "id" : {"$regex":"^emsl" } } - - for nom_activities_doc in nom_activities_set.find(get_old_activities): - - get_parent_omics_processing ={ "has_output" : nom_activities_doc["has_input"] } - - '''always going to be one omics processing''' - for omics_processing_doc in omics_processing_set.find(get_parent_omics_processing): - - omics_processing_update = { "$set": { "has_output": [new_raw_file_id]} } - - omics_processing_set.update_one(omics_processing_doc, omics_processing_update) - - new_omics_processing_id = omics_processing_doc['id'] +@dataclass +class DataObject: + nom_raw_data_object_type: str = "Direct Infusion FT ICR-MS Raw Data" + nom_raw_data_object_description: str = "Raw 21T Direct Infusion Data" + nom_dp_data_object_type: str = "FT ICR-MS Analysis Results" + nom_dp_data_object_description: str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details" - update_data_products( nom_activities_doc, new_data_product_id, new_data_product_id, - new_omics_processing_id, raw_file_path) - - nom_activity_update = { "$set": { "id": nom_new_id , "has_output":[new_data_product_id], - "has_input":[new_raw_file_id], "was_informed_by": [new_omics_processing_id]} } - - nom_activities_set.update_one(nom_activities_doc, nom_activity_update) -def mint_nmdc_id(type:NMDC_Types, how_many:int = 1) -> List[str]: - - config = yaml.safe_load(open('./config.yaml','r')) - client = oauthlib.oauth2.BackendApplicationClient(client_id=config['client_id']) +@dataclass +class NMDC_Types: + BioSample: str = "nmdc:Biosample" + OmicsProcessing: str = "nmdc:OmicsProcessing" + NomAnalysisActivity: str = "nmdc:NomAnalysisActivity" + DataObject: str = "nmdc:DataObject" + + +def update_data_products( + nom_activities_doc, + new_raw_file_id: str, + new_data_product_id: str, + omics_prcessing_id: str, + raw_file_path: Path = None, +): + raw_file_id = nom_activities_doc.has_input[0] + + dataproduct_id = nom_activities_doc.has_input[0] + + data_object_set = mydb["data_object_set"] + + 
get_raw_file_data_object = {"id": raw_file_id} + get_data_product_data_object = {"id": dataproduct_id} + + raw_object_docs = [ + raw_objectdata_doc + for raw_objectdata_doc in data_object_set.find(get_raw_file_data_object) + ] + + if raw_object_docs: + raw_object_update = { + "$set": { + "id": new_raw_file_id, + "alternative_identifier": [omics_prcessing_id], + } + } + + data_object_set.update_one(raw_object_docs[0], raw_object_update) + + else: + new_raw_data_object = get_raw_data_object( + raw_file_path, + was_generated_by=omics_prcessing_id, + data_object_type=DataObject.nom_raw_data_object_type, + description=DataObject.nom_raw_data_object_description, + ) + + data_object_set.insert_one(new_raw_data_object) + + for data_product_objectdata_doc in data_object_set.find( + get_data_product_data_object + ): + data_product_object_update = {"$set": {"id": new_data_product_id}} + + data_object_set.update_one( + data_product_objectdata_doc, data_product_object_update + ) + + +def update_omics_processing( + nom_new_id, new_data_product_id, new_raw_file_id, raw_file_path=None +): + omics_processing_set = mydb["omics_processing_set"] + + nom_activities_set = mydb["nom_analysis_activity_set"] + + get_old_activities = {"id": {"$regex": "^emsl"}} + + for nom_activities_doc in nom_activities_set.find(get_old_activities): + get_parent_omics_processing = {"has_output": nom_activities_doc["has_input"]} + + """always going to be one omics processing""" + for omics_processing_doc in omics_processing_set.find( + get_parent_omics_processing + ): + omics_processing_update = {"$set": {"has_output": [new_raw_file_id]}} + + omics_processing_set.update_one( + omics_processing_doc, omics_processing_update + ) + + new_omics_processing_id = omics_processing_doc["id"] + + update_data_products( + nom_activities_doc, + new_data_product_id, + new_data_product_id, + new_omics_processing_id, + raw_file_path, + ) + + nom_activity_update = { + "$set": { + "id": nom_new_id, + "has_output": [new_data_product_id], + "has_input": [new_raw_file_id], + "was_informed_by": [new_omics_processing_id], + } + } + + nom_activities_set.update_one(nom_activities_doc, nom_activity_update) + + +def mint_nmdc_id(type: NMDC_Types, how_many: int = 1) -> List[str]: + config = yaml.safe_load(open("./config.yaml", "r")) + client = oauthlib.oauth2.BackendApplicationClient(client_id=config["client_id"]) oauth = requests_oauthlib.OAuth2Session(client=client) - - token = oauth.fetch_token(token_url='https://api.microbiomedata.org/token', - client_id=config['client_id'], - client_secret=config['client_secret']) + + token = oauth.fetch_token( + token_url="https://api.microbiomedata.org/token", + client_id=config["client_id"], + client_secret=config["client_secret"], + ) nmdc_mint_url = "https://api.microbiomedata.org/pids/mint" - + payload = NMDC_Mint(type, how_many) - - #response = s.post(nmdc_mint_url, data=payload.json, ) - #list_ids = response.json() + + # response = s.post(nmdc_mint_url, data=payload.json, ) + # list_ids = response.json() print(payload.json) response = oauth.post(nmdc_mint_url, data=payload.json) list_ids = response.json() print(list_ids) return list_ids - -def get_raw_data_object(file_path:Path, was_generated_by:str, - data_object_type:str, description:str) -> nmdc.DataObject: - - nmdc_id = mint_nmdc_id({'id': NMDC_Types.DataObject})[0] + + +def get_raw_data_object( + file_path: Path, was_generated_by: str, data_object_type: str, description: str +) -> nmdc.DataObject: + nmdc_id = mint_nmdc_id({"id": NMDC_Types.DataObject})[0] 
data_dict = { - 'id': nmdc_id, - "name": file_path.name, - "file_size_bytes": file_path.stat().st_size, - "md5_checksum": hashlib.md5(file_path.open('rb').read()).hexdigest(), - "was_generated_by": was_generated_by, #omics processing id - "data_object_type": data_object_type, - "description": description, - "type": "nmdc:DataObject" - } - + "id": nmdc_id, + "name": file_path.name, + "file_size_bytes": file_path.stat().st_size, + "md5_checksum": hashlib.md5(file_path.open("rb").read()).hexdigest(), + "was_generated_by": was_generated_by, # omics processing id + "data_object_type": data_object_type, + "description": description, + "type": "nmdc:DataObject", + } + data_object = nmdc.DataObject(**data_dict) - return data_object \ No newline at end of file + return data_object diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py index c578ad9834..db80f5c8af 100644 --- a/nmdc_schema/napa_study_biosample_omics_migration.py +++ b/nmdc_schema/napa_study_biosample_omics_migration.py @@ -9,171 +9,237 @@ import pymongo from pymongo import MongoClient -#connect to napa mongo -napa_mongo_pw = os.environ['MONGO_NAPA_PW'] -napa_mongo='mongodb://root:'+napa_mongo_pw+'@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin' +# connect to napa mongo +napa_mongo_pw = os.environ["MONGO_NAPA_PW"] +napa_mongo = ( + "mongodb://root:" + + napa_mongo_pw + + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin" +) client = MongoClient(napa_mongo) -#define variables for tables to update, assumes a mongo connection variable 'client' -#set database name -mydb =client['nmdc'] -sty_coll=mydb["study_set"] -bsm_coll=mydb["biosample_set"] +# define variables for tables to update, assumes a mongo connection variable 'client' +# set database name +mydb = client["nmdc"] +sty_coll = mydb["study_set"] +bsm_coll = mydb["biosample_set"] ######################### -#generalized function to update study identifiers to napa format -#alt slots are already populated in all cases so logic for that is not needed +# generalized function to update study identifiers to napa format +# alt slots are already populated in all cases so logic for that is not needed -#mint Class Study IDs using runtime API or manually using the minter endpoint -#if reading minted IDs from a json file -sty_napa_json="XXXXXXXXXX" -with open(sty_napa_json, 'r') as j: - sty_napa_ids = json.loads(j.read()) +# mint Class Study IDs using runtime API or manually using the minter endpoint +# if reading minted IDs from a json file +sty_napa_json = "XXXXXXXXXX" +with open(sty_napa_json, "r") as j: + sty_napa_ids = json.loads(j.read()) + +# update_studies_to_napa_standards -#update_studies_to_napa_standards def update_studies_to_napa_standards(): - study_reid_log=open("napa_sty_update.txt","w") - napa_sty_counter=0 - get_legacy_sty={ "id" : {"$regex":"^gold" } } - for sty_doc in sty_coll.find(get_legacy_studies): - select_legacy_sty = {"id": sty_doc["id"]} - sty_target_update = {"$set": { "id": napa_sty_ids[napa_sty_count] } } - if (napa_sty_ids[napa_sty_count].startswith('nmdc:sty')): - #sty_coll.update_one(select_legacy_sty,sty_target_update) - sty_class_legacy_napa="Study "+ sty_doc["id"] + " " + napa_sty_ids[napa_study_count] - print(sty_class_legacy_napa) - sty_reid_log.write(napa_sty_update.txt) - napa_sty_counter=napa_sty_counter+1 - else: - print("Did not update issue updating ",sty_doc["id"]) + study_reid_log = open("napa_sty_update.txt", "w") 
+ napa_sty_counter = 0 + get_legacy_sty = {"id": {"$regex": "^gold"}} + for sty_doc in sty_coll.find(get_legacy_studies): + select_legacy_sty = {"id": sty_doc["id"]} + sty_target_update = {"$set": {"id": napa_sty_ids[napa_sty_count]}} + if napa_sty_ids[napa_sty_count].startswith("nmdc:sty"): + # sty_coll.update_one(select_legacy_sty,sty_target_update) + sty_class_legacy_napa = ( + "Study " + sty_doc["id"] + " " + napa_sty_ids[napa_study_count] + ) + print(sty_class_legacy_napa) + sty_reid_log.write(napa_sty_update.txt) + napa_sty_counter = napa_sty_counter + 1 + else: + print("Did not update issue updating ", sty_doc["id"]) + ######################### ##function to update biosamples -#mint Class Study IDs using runtime API or manually using the minter endpoint -#if reading minted IDs from a json file +# mint Class Study IDs using runtime API or manually using the minter endpoint +# if reading minted IDs from a json file + def update_bsm_by_study(napa_sty_id): - bsm_counter=0 - bsm_alt_id_dict={'gold_biosample_identifiers':'gold:','igsn_biosample_identifiers':'igsn:','emsl_biosample_identifiers':'emsl:'} - legacy_sty=napa_sty_to_legacy(napa_sty_id) - bsm_reid_log=open(legacy_sty + "_bsm_reid.txt","w") - with open(legacy_sty + "_bsm_napa.json", 'r') as j: - bsm_napa_ids = json.loads(j.read()) - legacy_bsm={"part_of": legacy_sty, "id": {"$ne":"^nmdc:bsm"}} - for bsm_doc in bsm_coll.find(legacy_bsm): - #set value for part_of - sty_napa_list=[] - sty_napa_list.append(napa_sty_id) - target_bsm={"id": bsm_doc["id"]} - #alt id check function - alt_id=[] - alt_id_slot_name='' - for alt_id_slot in bsm_alt_id_dict: - if bsm_doc["id"].startswith(bsm_alt_id_dict[alt_id_slot]): - alt_id_slot_name=alt_id_slot - if alt_id_slot_name in bsm_doc.keys(): - if len(bsm_doc[alt_id_slot_name]) == 0: - update_alt=True - alt_id.append(bsm_doc["id"]) - print ("will update alt id slot is empty"+alt_id_slot_name) - elif (len(bsm_doc[alt_id_slot_name]) == 1 and bsm_doc[alt_id_slot_name][0] == bsm_doc["id"]): - print(alt_id_slot+" already set for "+bsm_doc["id"]) - update_alt=False - else: - print("length of array for "+ alt_id_slot +"exists and is greater than 1") - update_alt=False + bsm_counter = 0 + bsm_alt_id_dict = { + "gold_biosample_identifiers": "gold:", + "igsn_biosample_identifiers": "igsn:", + "emsl_biosample_identifiers": "emsl:", + } + legacy_sty = napa_sty_to_legacy(napa_sty_id) + bsm_reid_log = open(legacy_sty + "_bsm_reid.txt", "w") + with open(legacy_sty + "_bsm_napa.json", "r") as j: + bsm_napa_ids = json.loads(j.read()) + legacy_bsm = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:bsm"}} + for bsm_doc in bsm_coll.find(legacy_bsm): + # set value for part_of + sty_napa_list = [] + sty_napa_list.append(napa_sty_id) + target_bsm = {"id": bsm_doc["id"]} + # alt id check function + alt_id = [] + alt_id_slot_name = "" + for alt_id_slot in bsm_alt_id_dict: + if bsm_doc["id"].startswith(bsm_alt_id_dict[alt_id_slot]): + alt_id_slot_name = alt_id_slot + if alt_id_slot_name in bsm_doc.keys(): + if len(bsm_doc[alt_id_slot_name]) == 0: + update_alt = True + alt_id.append(bsm_doc["id"]) + print("will update alt id slot is empty" + alt_id_slot_name) + elif ( + len(bsm_doc[alt_id_slot_name]) == 1 + and bsm_doc[alt_id_slot_name][0] == bsm_doc["id"] + ): + print(alt_id_slot + " already set for " + bsm_doc["id"]) + update_alt = False + else: + print( + "length of array for " + + alt_id_slot + + "exists and is greater than 1" + ) + update_alt = False + else: + update_alt = True + alt_id.append(bsm_doc["id"]) + print("will 
update alt id b/c could not fine alt id") + break + if update_alt: + bsm_target_update = { + "$set": { + "id": bsm_napa_ids[bsm_counter], + "part_of": sty_napa_list, + alt_id_slot_name: alt_id, + } + } + elif not update_alt: + bsm_target_update = { + "$set": {"id": bsm_napa_ids[bsm_counter], "part_of": sty_napa_list} + } else: - update_alt=True - alt_id.append(bsm_doc["id"]) - print ("will update alt id b/c could not fine alt id") - break; - if update_alt: - bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":sty_napa_list, alt_id_slot_name: alt_id }} - elif not update_alt: - bsm_target_update = { "$set": { "id": bsm_napa_ids[bsm_counter], "part_of":sty_napa_list}} - else: - print("not sure how to make the biosample update for" + bsm_doc["id"]) - bsm_class_legacy_napa="Biosample " + bsm_doc["id"] + " "+ bsm_napa_ids[bsm_counter] - print(bsm_class_legacy_napa) - print(target_bsm) - print(bsm_target_update) - #perform biosample update - bsm_coll.update_one(target_bsm,bsm_target_update) - bsm_reid_log.write(bsm_class_legacy_napa + '\n') - bsm_counter=bsm_counter+1 - bsm_reid_log.close() + print("not sure how to make the biosample update for" + bsm_doc["id"]) + bsm_class_legacy_napa = ( + "Biosample " + bsm_doc["id"] + " " + bsm_napa_ids[bsm_counter] + ) + print(bsm_class_legacy_napa) + print(target_bsm) + print(bsm_target_update) + # perform biosample update + bsm_coll.update_one(target_bsm, bsm_target_update) + bsm_reid_log.write(bsm_class_legacy_napa + "\n") + bsm_counter = bsm_counter + 1 + bsm_reid_log.close() + + ################ -#function to get legacy study id from alt id slot + +# function to get legacy study id from alt id slot def napa_sty_to_legacy(napa_sty_id): - legacy_sty="" - get_sty_record={"id":napa_sty_id} - target_sty=sty_coll.find_one(get_sty_record) - if len(target_sty["gold_study_identifiers"]) ==1: - legacy_sty=target_sty["gold_study_identifiers"][0] - else: - print("More than one GOLD study as alt id", target_sty["gold_study_identifiers"]) - return legacy_sty -########################## -#function to update omics records -def update_omics_by_study(napa_sty_id): - omics_coll=mydb["omics_processing_set"] - omics_counter=0 - omics_alt_id_dict={'gold_sequencing_project_identifiers':'gold:','alternative_identifiers':'emsl:'} - legacy_sty=napa_sty_to_legacy(napa_sty_id) - legacy_omics={"part_of": legacy_sty, "id": {"$ne":"^nmdc:omprc"}} - f_omics_id_mapping = open(legacy_sty+"_omics_reid.txt", "w") - with open(legacy_sty+"_omics_napa.json", 'r') as j: - omics_napa_ids = json.loads(j.read()) - for omics_doc in omics_coll.find(legacy_omics): - #set list with value of napa study for part_of - study_napa_list=[] - study_napa_list.append(napa_sty_id) - #determine what has_input should be - if(isinstance(omics_doc["has_input"],list)): - napa_biosample_inputs=[] - for biosample in omics_doc["has_input"]: - biosample=biosample.replace('GOLD','gold') - target_has_input={"$or":[ {"emsl_biosample_identifiers":biosample}, {"gold_biosample_identifiers":biosample},{"insdc_biosample_identifiers":biosample}]} - get_biosample=bsm_coll.find_one(target_has_input) - napa_biosample_inputs.append(get_biosample["id"]) - #set id and alternative ids - target_omics={"id": omics_doc["id"]} - #deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot - alt_omics_id=[] - for alt_omics_id_slot in omics_alt_id_dict: - if omics_doc["id"].startswith(omics_alt_id_dict[alt_omics_id_slot]): - if 
alt_omics_id_slot in omics_doc.keys(): - if len(omics_doc[alt_omics_id_slot]) == 0: - update_alt_omics=True - alt_omics_id.append(omics_doc["id"]) - target_alt_omics_slot=alt_omics_id_slot - print ("will update alt id slot is empty"+alt_id_slot_name) - elif (len(omics_doc[alt_omics_id_slot]) == 1 and omics_doc[alt_omics_id_slot][0] == omics_doc["id"]): - print(alt_omcs_id_slot+" already set for "+omics_doc["id"]) - update_alt_omics=False - else: - print("length of array for "+ alt_omics_id_slot +"exists and is greater than 1") - update_alt_omics=False - else: - update_alt_omics=True - alt_omics_id.append(omics_doc["id"]) - target_alt_omics_slot=alt_omics_id_slot - print ("will update alt id b/c could not find alt id") - #set target update depending on if alt slot exists already or not - if update_alt_omics is True: - target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs, target_alt_omics_slot: alt_omics_id }} - if update_alt_omics is False: - target_omics_update = { "$set": { "id": omics_napa_ids[omics_counter], "part_of":study_napa_list, "has_input": napa_biosample_inputs}} - print(target_omics_update) - omics_coll.update_one(target_omics,target_omics_update) - class_legacy_napa="OmicsProcessing " + omics_doc["id"] + " "+ omics_napa_ids[omics_counter] - #print(class_legacy_napa) - #print(target_update) - f_omics_id_mapping.write(class_legacy_napa + '\n') - omics_counter=omics_counter+1 - f_omics_id_mapping.close() + legacy_sty = "" + get_sty_record = {"id": napa_sty_id} + target_sty = sty_coll.find_one(get_sty_record) + if len(target_sty["gold_study_identifiers"]) == 1: + legacy_sty = target_sty["gold_study_identifiers"][0] + else: + print( + "More than one GOLD study as alt id", target_sty["gold_study_identifiers"] + ) + return legacy_sty +########################## +# function to update omics records +def update_omics_by_study(napa_sty_id): + omics_coll = mydb["omics_processing_set"] + omics_counter = 0 + omics_alt_id_dict = { + "gold_sequencing_project_identifiers": "gold:", + "alternative_identifiers": "emsl:", + } + legacy_sty = napa_sty_to_legacy(napa_sty_id) + legacy_omics = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:omprc"}} + f_omics_id_mapping = open(legacy_sty + "_omics_reid.txt", "w") + with open(legacy_sty + "_omics_napa.json", "r") as j: + omics_napa_ids = json.loads(j.read()) + for omics_doc in omics_coll.find(legacy_omics): + # set list with value of napa study for part_of + study_napa_list = [] + study_napa_list.append(napa_sty_id) + # determine what has_input should be + if isinstance(omics_doc["has_input"], list): + napa_biosample_inputs = [] + for biosample in omics_doc["has_input"]: + biosample = biosample.replace("GOLD", "gold") + target_has_input = { + "$or": [ + {"emsl_biosample_identifiers": biosample}, + {"gold_biosample_identifiers": biosample}, + {"insdc_biosample_identifiers": biosample}, + ] + } + get_biosample = bsm_coll.find_one(target_has_input) + napa_biosample_inputs.append(get_biosample["id"]) + # set id and alternative ids + target_omics = {"id": omics_doc["id"]} + # deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot + alt_omics_id = [] + for alt_omics_id_slot in omics_alt_id_dict: + if omics_doc["id"].startswith(omics_alt_id_dict[alt_omics_id_slot]): + if alt_omics_id_slot in omics_doc.keys(): + if len(omics_doc[alt_omics_id_slot]) == 0: + update_alt_omics = True + 
alt_omics_id.append(omics_doc["id"]) + target_alt_omics_slot = alt_omics_id_slot + print("will update alt id slot is empty" + alt_id_slot_name) + elif ( + len(omics_doc[alt_omics_id_slot]) == 1 + and omics_doc[alt_omics_id_slot][0] == omics_doc["id"] + ): + print(alt_omcs_id_slot + " already set for " + omics_doc["id"]) + update_alt_omics = False + else: + print( + "length of array for " + + alt_omics_id_slot + + "exists and is greater than 1" + ) + update_alt_omics = False + else: + update_alt_omics = True + alt_omics_id.append(omics_doc["id"]) + target_alt_omics_slot = alt_omics_id_slot + print("will update alt id b/c could not find alt id") + # set target update depending on if alt slot exists already or not + if update_alt_omics is True: + target_omics_update = { + "$set": { + "id": omics_napa_ids[omics_counter], + "part_of": study_napa_list, + "has_input": napa_biosample_inputs, + target_alt_omics_slot: alt_omics_id, + } + } + if update_alt_omics is False: + target_omics_update = { + "$set": { + "id": omics_napa_ids[omics_counter], + "part_of": study_napa_list, + "has_input": napa_biosample_inputs, + } + } + print(target_omics_update) + omics_coll.update_one(target_omics, target_omics_update) + class_legacy_napa = ( + "OmicsProcessing " + omics_doc["id"] + " " + omics_napa_ids[omics_counter] + ) + # print(class_legacy_napa) + # print(target_update) + f_omics_id_mapping.write(class_legacy_napa + "\n") + omics_counter = omics_counter + 1 + f_omics_id_mapping.close() diff --git a/nmdc_schema/runtime_api_operations.py b/nmdc_schema/runtime_api_operations.py index be0964274d..315b65475e 100644 --- a/nmdc_schema/runtime_api_operations.py +++ b/nmdc_schema/runtime_api_operations.py @@ -8,7 +8,7 @@ from dotenv import load_dotenv import requests -#modified from nmdc-runtime how-to guide https://microbiomedata.github.io/nmdc-runtime/nb/queue_and_trigger_data_jobs/ +# modified from nmdc-runtime how-to guide https://microbiomedata.github.io/nmdc-runtime/nb/queue_and_trigger_data_jobs/ # relative path to file with format # ``` @@ -23,105 +23,101 @@ load_dotenv(envfile_path) -ENV = { - k: v for k, v in os.environ.items() - if k.startswith("NMDC_RUNTIME_") -} +ENV = {k: v for k, v in os.environ.items() if k.startswith("NMDC_RUNTIME_")} -assert ( - ENV["NMDC_RUNTIME_HOST"] == - "https://api.microbiomedata.org" -) +assert ENV["NMDC_RUNTIME_HOST"] == "https://api.microbiomedata.org" HOST = ENV["NMDC_RUNTIME_HOST"] + def request_and_return_json(method, path, host=HOST, **kwargs): r = requests.request(method, host + path, **kwargs) r.raise_for_status() return r.json() + def get_json(path, host=HOST, **kwargs): return request_and_return_json("GET", path, host=host, **kwargs) + def post_and_return_json(path, host=HOST, **kwargs): - return request_and_return_json("POST", path, host=host, **kwargs) + return request_and_return_json("POST", path, host=host, **kwargs) + def patch_and_return_json(path, host=HOST, **kwargs): - return request_and_return_json("PATCH", path, host=host, **kwargs) + return request_and_return_json("PATCH", path, host=host, **kwargs) + def put_and_return_json(path, host=HOST, **kwargs): - return request_and_return_json("PUT", path, host=host, **kwargs) + return request_and_return_json("PUT", path, host=host, **kwargs) + def auth_header(bearer_token): return {"Authorization": f"Bearer {bearer_token}"} + def get_token_for_user(): response = post_and_return_json( "/token", data={ "grant_type": "password", "username": ENV["NMDC_RUNTIME_USER"], - "password": ENV["NMDC_RUNTIME_PASS"] - } + 
"password": ENV["NMDC_RUNTIME_PASS"], + }, ) - expires_minutes = response['expires']['minutes'] + expires_minutes = response["expires"]["minutes"] print(f"Bearer token expires in {expires_minutes} minutes") return response["access_token"] + def get_token_for_site_client(): response = post_and_return_json( "/token", data={ "grant_type": "client_credentials", "client_id": ENV["NMDC_RUNTIME_SITE_CLIENT_ID"], - "client_secret": ENV["NMDC_RUNTIME_SITE_CLIENT_SECRET"] - } + "client_secret": ENV["NMDC_RUNTIME_SITE_CLIENT_SECRET"], + }, ) - expires_minutes = response['expires']['minutes'] + expires_minutes = response["expires"]["minutes"] print(f"Bearer token expires in {expires_minutes} minutes") return response["access_token"] -def mint_ids(schema_class,how_many,formatted_token): - url=HOST + "/pids/mint" - data={ - "schema_class": {"id": schema_class}, - "how_many": how_many - } - headers = formatted_token -# print(headers) - response = requests.post(url,headers=headers,json=data) - print("JSON Response ", response.json()) +def mint_ids(schema_class, how_many, formatted_token): + url = HOST + "/pids/mint" + data = {"schema_class": {"id": schema_class}, "how_many": how_many} + headers = formatted_token + # print(headers) + response = requests.post(url, headers=headers, json=data) + print("JSON Response ", response.json()) - minted_ids=response.json() - return minted_ids - #print(minted_ids) + minted_ids = response.json() + return minted_ids + # print(minted_ids) -#def mint_ids(schema_class,how_many,TOKEN_C): + +# def mint_ids(schema_class,how_many,TOKEN_C): # response = post_and_return_json( # "/pids/mint", # data={ # "schema_class": {"id": schema_class}, # "how_many": how_many -# } +# } # headers = TOKEN_C # return response # ) - - def now(as_str=False): dt = datetime.now(timezone.utc) return dt.isoformat() if as_str else dt + TOKEN_C = get_token_for_site_client() print(TOKEN_C) -formatted_token=auth_header(TOKEN_C) -napa_ids=mint_ids("nmdc:Study",2,formatted_token) +formatted_token = auth_header(TOKEN_C) +napa_ids = mint_ids("nmdc:Study", 2, formatted_token) print(napa_ids) - - - From 01d973e43bf76f9b1405aead6a287154383409f3 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Fri, 17 Nov 2023 16:19:07 -0800 Subject: [PATCH 12/44] Document usage of `black` code formatter --- nmdc_schema/napa_compliance.README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nmdc_schema/napa_compliance.README.md b/nmdc_schema/napa_compliance.README.md index be92b3af13..f4e9ec7480 100644 --- a/nmdc_schema/napa_compliance.README.md +++ b/nmdc_schema/napa_compliance.README.md @@ -8,4 +8,24 @@ pip install python-dotenv # So Python scripts can access Mongo databases. pip install pymongo + +# So Python code is formatted in a standard way. 
+pip install black +``` + +## Formatting source code + +You can use [`black`](https://black.readthedocs.io/en/stable/) to format the Python code you write, by running: + +```shell +python -m black /path/to/python/code.py +``` + +For example: + +```shell +python -m black nmdc_schema/connect_napa_mongo.py +python -m black nmdc_schema/metab_id_refactor.py +python -m black nmdc_schema/napa_study_biosample_omics_migration.py +python -m black nmdc_schema/runtime_api_operations.py ``` From a2da3682620e231b55e486662e6843b0f9c0b302 Mon Sep 17 00:00:00 2001 From: aclum Date: Fri, 19 Jan 2024 08:52:46 -0800 Subject: [PATCH 13/44] fixed issues with biosample function and target biosample slot name in omics function for re-iding --- .../napa_study_biosample_omics_migration.py | 63 ++++++++++++------- 1 file changed, 39 insertions(+), 24 deletions(-) diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py index db80f5c8af..8bbe829564 100644 --- a/nmdc_schema/napa_study_biosample_omics_migration.py +++ b/nmdc_schema/napa_study_biosample_omics_migration.py @@ -1,4 +1,5 @@ import json +m import os from pprint import pprint import secrets @@ -51,7 +52,7 @@ def update_studies_to_napa_standards(): ) print(sty_class_legacy_napa) sty_reid_log.write(napa_sty_update.txt) - napa_sty_counter = napa_sty_counter + 1 + napa_sty_counte)r = napa_sty_counter + 1 else: print("Did not update issue updating ", sty_doc["id"]) @@ -68,7 +69,7 @@ def update_bsm_by_study(napa_sty_id): bsm_alt_id_dict = { "gold_biosample_identifiers": "gold:", "igsn_biosample_identifiers": "igsn:", - "emsl_biosample_identifiers": "emsl:", + "emsl_biosample_identifiers": "emsl:" } legacy_sty = napa_sty_to_legacy(napa_sty_id) bsm_reid_log = open(legacy_sty + "_bsm_reid.txt", "w") @@ -76,6 +77,8 @@ def update_bsm_by_study(napa_sty_id): bsm_napa_ids = json.loads(j.read()) legacy_bsm = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:bsm"}} for bsm_doc in bsm_coll.find(legacy_bsm): + bsm_target_update = "" + #print(bsm_doc["id"]) # set value for part_of sty_napa_list = [] sty_napa_list.append(napa_sty_id) @@ -84,45 +87,53 @@ def update_bsm_by_study(napa_sty_id): alt_id = [] alt_id_slot_name = "" for alt_id_slot in bsm_alt_id_dict: + #print(bsm_alt_id_dict[alt_id_slot]) if bsm_doc["id"].startswith(bsm_alt_id_dict[alt_id_slot]): + print(bsm_doc["id"] + "starts with"+ bsm_alt_id_dict[alt_id_slot]) alt_id_slot_name = alt_id_slot if alt_id_slot_name in bsm_doc.keys(): if len(bsm_doc[alt_id_slot_name]) == 0: update_alt = True alt_id.append(bsm_doc["id"]) print("will update alt id slot is empty" + alt_id_slot_name) + bsm_target_update = { + "$set": { + "id": bsm_napa_ids[bsm_counter], + "part_of": sty_napa_list, + alt_id_slot_name: alt_id, + } + } elif ( len(bsm_doc[alt_id_slot_name]) == 1 and bsm_doc[alt_id_slot_name][0] == bsm_doc["id"] ): print(alt_id_slot + " already set for " + bsm_doc["id"]) - update_alt = False + bsm_target_update = { + "$set": {"id": bsm_napa_ids[bsm_counter], "part_of": sty_napa_list} + } else: print( "length of array for " + alt_id_slot + "exists and is greater than 1" ) - update_alt = False + bsm_target_update = { + "$set": {"id": bsm_napa_ids[bsm_counter], "part_of": sty_napa_list} + } else: - update_alt = True - alt_id.append(bsm_doc["id"]) - print("will update alt id b/c could not fine alt id") - break - if update_alt: - bsm_target_update = { - "$set": { + alt_id.append(bsm_doc["id"]) + print("will update alt id b/c could not find alt id") + bsm_target_update = { + 
"$set": { "id": bsm_napa_ids[bsm_counter], - "part_of": sty_napa_list, - alt_id_slot_name: alt_id, - } - } - elif not update_alt: - bsm_target_update = { - "$set": {"id": bsm_napa_ids[bsm_counter], "part_of": sty_napa_list} - } - else: - print("not sure how to make the biosample update for" + bsm_doc["id"]) + "part_of": sty_napa_list, + alt_id_slot_name: alt_id, + } + } + #else: + # print(bsm_doc["id"] + "does not start with prefix"+ bsm_alt_id_dict[alt_id_slot]) + #else: + # print("not sure how to make the biosample update for" + bsm_doc["id"]) bsm_class_legacy_napa = ( "Biosample " + bsm_doc["id"] + " " + bsm_napa_ids[bsm_counter] ) @@ -139,8 +150,7 @@ def update_bsm_by_study(napa_sty_id): ################ -# function to get legacy study id from alt id slot -def napa_sty_to_legacy(napa_sty_id): + legacy_sty = "" get_sty_record = {"id": napa_sty_id} target_sty = sty_coll.find_one(get_sty_record) @@ -163,7 +173,10 @@ def update_omics_by_study(napa_sty_id): "alternative_identifiers": "emsl:", } legacy_sty = napa_sty_to_legacy(napa_sty_id) + #commented out only until we get SPRUCE fixed legacy_omics = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:omprc"}} + # test only serach for NOM data so Yuri can test + #legacy_omics = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:omprc"}, "omics_type.has_raw_value":"Organic Matter Characterization"} f_omics_id_mapping = open(legacy_sty + "_omics_reid.txt", "w") with open(legacy_sty + "_omics_napa.json", "r") as j: omics_napa_ids = json.loads(j.read()) @@ -180,10 +193,12 @@ def update_omics_by_study(napa_sty_id): "$or": [ {"emsl_biosample_identifiers": biosample}, {"gold_biosample_identifiers": biosample}, - {"insdc_biosample_identifiers": biosample}, + {"igsn_biosample_identifiers": biosample}, ] } get_biosample = bsm_coll.find_one(target_has_input) + print(omics_doc) + print(get_biosample["id"]) napa_biosample_inputs.append(get_biosample["id"]) # set id and alternative ids target_omics = {"id": omics_doc["id"]} From de6086986a016716203c287a64fa76a0f96a1fed Mon Sep 17 00:00:00 2001 From: aclum Date: Tue, 23 Jan 2024 16:22:23 -0800 Subject: [PATCH 14/44] add insert_many_pymongo.py as a workaround to json:submit endpoint --- nmdc_schema/insert_many_pymongo.py | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 nmdc_schema/insert_many_pymongo.py diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/insert_many_pymongo.py new file mode 100644 index 0000000000..cb4c267709 --- /dev/null +++ b/nmdc_schema/insert_many_pymongo.py @@ -0,0 +1,29 @@ +import os +from pprint import pprint +import secrets +import time +import pymongo +from pymongo import MongoClient + +#connect to napa mongo +#set value of napa_mongo_pw manually interactively +napa_mongo = ( + "mongodb://root:" + + napa_mongo_pw + + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin") +client = MongoClient(napa_mongo) +mydb = client["nmdc"] + + +#workaround example for json:submit endpoint +#modified from https://stackoverflow.com/questions/49510049/how-to-import-json-file-to-mongodb-using-python +#read in json file which has target collections and documents to insert +with open('20240118.stegen.metap.SOP01.json') as f: + file_data = json.load(f) + +#loop through target collections and insert documents for each +for collection,documents in file_data.items(): + target_coll=mydb[collection] + target_coll.insert_many(documents) + + From f85d6c6fffd4944d521b0fe3011419ba44220356 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 24 
Jan 2024 16:18:58 -0800 Subject: [PATCH 15/44] Use `click` framework and specify options via CLI --- nmdc_schema/insert_many_pymongo.py | 146 +++++++++++++++++++++++------ 1 file changed, 119 insertions(+), 27 deletions(-) diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/insert_many_pymongo.py index cb4c267709..17259378ef 100644 --- a/nmdc_schema/insert_many_pymongo.py +++ b/nmdc_schema/insert_many_pymongo.py @@ -1,29 +1,121 @@ -import os -from pprint import pprint -import secrets -import time -import pymongo -from pymongo import MongoClient - -#connect to napa mongo -#set value of napa_mongo_pw manually interactively -napa_mongo = ( - "mongodb://root:" - + napa_mongo_pw - + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin") -client = MongoClient(napa_mongo) -mydb = client["nmdc"] - - -#workaround example for json:submit endpoint -#modified from https://stackoverflow.com/questions/49510049/how-to-import-json-file-to-mongodb-using-python -#read in json file which has target collections and documents to insert -with open('20240118.stegen.metap.SOP01.json') as f: - file_data = json.load(f) - -#loop through target collections and insert documents for each -for collection,documents in file_data.items(): - target_coll=mydb[collection] - target_coll.insert_many(documents) +import json +import logging +import click_log +import click # not currently a top-level dependency of `nmdc-schema` +import pymongo # not currently a top-level dependency of `nmdc-schema` +import requests # not currently a top-level dependency of `nmdc-schema` + +logger = logging.getLogger(__name__) +click_log.basic_config(logger) + + + +@click.command() +@click_log.simple_verbosity_option(logger) +@click.option( + "--input-file", + "--in", + type=click.File(), + required=True, + help=r"Path to JSON file containing input data", + prompt=r"Path to JSON file", +) +@click.option( + "--mongo-uri", + type=str, + required=True, + envvar="TEMP_MONGO_URI", + help=r"MongoDB connection string (can be specified via an environment variable named `TEMP_MONGO_URI`)", + prompt=r"MongoDB connection string", +) +@click.option( + "--is-direct-connection", + type=bool, + required=False, + default=True, + help=f"Whether to use the `directConnection` parameter for the MongoDB connection", + prompt=f"Whether to use the `directConnection` parameter for the MongoDB connection", +) +@click.option( + "--database-name", + type=str, + required=False, + default="nmdc", + help=f"MongoDB database name", + prompt=f"MongoDB database name", +) +@click.option( + "--validator-uri", + type=str, + required=False, + default="https://api.microbiomedata.org/metadata/json:validate", + help=f"URI of NMDC Schema-based validator", + prompt=f"URI of NMDC Schema-based validator", +) +def insert_many_pymongo( + input_file, + mongo_uri: str, + is_direct_connection: bool, + database_name: str, + validator_uri: str, +) -> None: + r""" + Reads data from a JSON file and inserts that data into a MongoDB database. + + The JSON file consists of a top-level object whose property names corresponding to MongoDB collection names. + The value of each of those properties is an array of objects. Each object corresponding to a document you want + to insert into that collection. + """ + r""" + References: + - Topic: Specifying a file path via a CLI option (and `click` providing a file handle to the function). 
+ https://click.palletsprojects.com/en/8.1.x/api/#click.File + - Topic: `click` populating function parameters from environment variables. + https://click.palletsprojects.com/en/8.1.x/options/#values-from-environment-variables + - Topic: `click_log`. + https://click-log.readthedocs.io/en/stable/ + """ + + # Validate the JSON data with respect to the NMDC schema. + # + # Note: The validation endpoint currently returns `{"result": "All Okay!"}` + # when data is valid. + # + logger.debug(f"Validating the JSON data.") + json_data = json.load(input_file) + response = requests.post(validator_uri, json=json_data) + assert response.status_code == 200, f"Failed to access validator at {validator_uri}" + validation_result = response.json() + if validation_result.get("result") == "All Okay!": + logger.debug(f"The JSON data is valid.") + else: + logger.error(f"Validation result: {validation_result}") + raise ValueError(f"The JSON data is not valid.") + + # Validate the MongoDB connection string and database name. + mongo_client = pymongo.MongoClient(host=mongo_uri, directConnection=is_direct_connection) + with pymongo.timeout(5): # stop trying after 5 seconds + assert (database_name in mongo_client.list_database_names()), f'The database named "{database_name}" does not exist.' + + # Insert the JSON data into the MongoDB database. + db = mongo_client[database_name] + logger.info(f'Processing the {len(json_data.keys())} collection(s) provided.') + for collection_name, documents in json_data.items(): + if len(documents) > 0: + logger.debug(f'Inserting {len(documents)} documents into the "{collection_name}" collection.') + result = db[collection_name].insert_many(documents) + num_documents_inserted = len(result.inserted_ids) + num_documents_provided = len(documents) + logger.info(f"Inserted {num_documents_inserted} of {num_documents_provided} documents.") + if num_documents_inserted < num_documents_provided: + logger.warning(f"Not all of the provided documents were inserted.") + else: + logger.warning(f'Skipping collection "{collection_name}" because no documents were provided for it.') + + return None + + +if __name__ == "__main__": + insert_many_pymongo() # `click` will prompt the user for options From 6d2c920d4871e0dafc2532a0937c456cfd068052 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 24 Jan 2024 16:25:02 -0800 Subject: [PATCH 16/44] Refine CLI command description --- nmdc_schema/insert_many_pymongo.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/insert_many_pymongo.py index 17259378ef..a5903ad505 100644 --- a/nmdc_schema/insert_many_pymongo.py +++ b/nmdc_schema/insert_many_pymongo.py @@ -64,9 +64,7 @@ def insert_many_pymongo( r""" Reads data from a JSON file and inserts that data into a MongoDB database. - The JSON file consists of a top-level object whose property names corresponding to MongoDB collection names. - The value of each of those properties is an array of objects. Each object corresponding to a document you want - to insert into that collection. + The contents of the JSON file conform to the NMDC Schema. 
""" r""" References: From 6b37ac7c0c6c204ecf663cba7701d008c3541458 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 24 Jan 2024 17:19:57 -0800 Subject: [PATCH 17/44] Display collection name during insertion --- nmdc_schema/insert_many_pymongo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/insert_many_pymongo.py index a5903ad505..74579349aa 100644 --- a/nmdc_schema/insert_many_pymongo.py +++ b/nmdc_schema/insert_many_pymongo.py @@ -102,7 +102,7 @@ def insert_many_pymongo( logger.info(f'Processing the {len(json_data.keys())} collection(s) provided.') for collection_name, documents in json_data.items(): if len(documents) > 0: - logger.debug(f'Inserting {len(documents)} documents into the "{collection_name}" collection.') + logger.info(f'Inserting {len(documents)} documents into the "{collection_name}" collection.') result = db[collection_name].insert_many(documents) num_documents_inserted = len(result.inserted_ids) num_documents_provided = len(documents) From a815cc19bbfdeb237e65c6e32cd02b330fb603a4 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 24 Jan 2024 17:24:58 -0800 Subject: [PATCH 18/44] Clarify command description --- nmdc_schema/insert_many_pymongo.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/insert_many_pymongo.py index 74579349aa..e462ad64bf 100644 --- a/nmdc_schema/insert_many_pymongo.py +++ b/nmdc_schema/insert_many_pymongo.py @@ -62,9 +62,7 @@ def insert_many_pymongo( validator_uri: str, ) -> None: r""" - Reads data from a JSON file and inserts that data into a MongoDB database. - - The contents of the JSON file conform to the NMDC Schema. + Reads data from an NMDC Schema-conformant JSON file and inserts that data into a MongoDB database. """ r""" References: From 6e82a0c357016c5643d9a4f32737330917f5b353 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 24 Jan 2024 17:38:15 -0800 Subject: [PATCH 19/44] Refine descriptions of CLI options --- nmdc_schema/insert_many_pymongo.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/insert_many_pymongo.py index e462ad64bf..bc146c8c30 100644 --- a/nmdc_schema/insert_many_pymongo.py +++ b/nmdc_schema/insert_many_pymongo.py @@ -16,10 +16,10 @@ @click_log.simple_verbosity_option(logger) @click.option( "--input-file", - "--in", type=click.File(), required=True, - help=r"Path to JSON file containing input data", + help=r"Path to a JSON file containing the data you want to insert. " + r"The JSON file must conform to the NMDC Schema.", prompt=r"Path to JSON file", ) @click.option( @@ -27,7 +27,9 @@ type=str, required=True, envvar="TEMP_MONGO_URI", - help=r"MongoDB connection string (can be specified via an environment variable named `TEMP_MONGO_URI`)", + help=r"MongoDB connection string. Note: Some connection strings include a password. " + r"To avoid putting your password on the command line, you can specify the connection string " + r"via an environment variable named `TEMP_MONGO_URI`.", prompt=r"MongoDB connection string", ) @click.option( @@ -35,24 +37,25 @@ type=bool, required=False, default=True, - help=f"Whether to use the `directConnection` parameter for the MongoDB connection", - prompt=f"Whether to use the `directConnection` parameter for the MongoDB connection", + show_default=True, + help=f"Whether you want the script to set the `directConnection` flag when connecting to the MongoDB server. 
" + f"That is required by some MongoDB servers that belong to a replica set. ", ) @click.option( "--database-name", type=str, required=False, default="nmdc", + show_default=True, help=f"MongoDB database name", - prompt=f"MongoDB database name", ) @click.option( "--validator-uri", type=str, required=False, default="https://api.microbiomedata.org/metadata/json:validate", + show_default=True, help=f"URI of NMDC Schema-based validator", - prompt=f"URI of NMDC Schema-based validator", ) def insert_many_pymongo( input_file, From 03c2125cc133258cb9e38e9a4c24ac3da7596500 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 25 Jan 2024 16:16:38 -0800 Subject: [PATCH 20/44] fix deletion of start of study lookup function, fix typo in omics re-id function --- nmdc_schema/napa_study_biosample_omics_migration.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py index 8bbe829564..20a8da7c42 100644 --- a/nmdc_schema/napa_study_biosample_omics_migration.py +++ b/nmdc_schema/napa_study_biosample_omics_migration.py @@ -1,5 +1,4 @@ import json -m import os from pprint import pprint import secrets @@ -149,8 +148,8 @@ def update_bsm_by_study(napa_sty_id): ################ - - +# function to get legacy study id from alt id slot +def napa_sty_to_legacy(napa_sty_id): legacy_sty = "" get_sty_record = {"id": napa_sty_id} target_sty = sty_coll.find_one(get_sty_record) @@ -216,7 +215,7 @@ def update_omics_by_study(napa_sty_id): len(omics_doc[alt_omics_id_slot]) == 1 and omics_doc[alt_omics_id_slot][0] == omics_doc["id"] ): - print(alt_omcs_id_slot + " already set for " + omics_doc["id"]) + print(alt_omics_id_slot + " already set for " + omics_doc["id"]) update_alt_omics = False else: print( From c2ac49ef1f086ccad7ee99595dd11082208ae9a7 Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Thu, 4 Apr 2024 13:27:39 -0700 Subject: [PATCH 21/44] Update `connect_napa_mongo.py` to output tsv with deleted record identifiers --- nmdc_schema/connect_napa_mongo.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 77759fefbb..30a09f0c76 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -1,4 +1,5 @@ from datetime import datetime, timezone +import csv import json import os from pprint import pprint @@ -457,6 +458,8 @@ def update_studies_to_napa_standards(): data_object_coll = mydb["data_object_set"] target_gp_for_del = open("omics_records_to_delete.txt", "r") +# Track deleted record identifiers as (collection, id) tuples +deleted_record_identifiers = [] for gp in target_gp_for_del: gold_project_id = gp.strip() gold_proj_curie = "gold:" + gold_project_id @@ -469,6 +472,7 @@ def update_studies_to_napa_standards(): doc = wea_coll.find_one({"was_informed_by": gold_proj_curie}) if doc: print("found " + doc["id"] + " in collection " + collection) + deleted_record_identifiers.append((collection, doc["id"])) # wea_to_delete.append(doc) wea_coll.delete_one({"was_informed_by": gold_proj_curie}) # this method should not be used as there are data objects that need to be removed that are not listed in has_output for the WEA records @@ -485,6 +489,19 @@ def update_studies_to_napa_standards(): + " in collection " + collection ) - data_object_coll.delete_many({"description": {"$regex": gold_project_id}}) + # Fetch documents matching the regex pattern + matching_docs = 
data_object_coll.find({"description": {"$regex": gold_project_id}}) + # Delete each matching document + for doc in matching_docs: + deleted_record_identifiers.append(("data_object_set", doc["id"])) + data_object_coll.delete_one({"_id": doc["_id"]}) + +# Print the list of deleted record identifiers to a tsv file +with open("deleted_record_identifiers.tsv", "w") as f: + writer = csv.writer(f, delimiter="\t") + writer.writerow(["collection", "id"]) + for record in deleted_record_identifiers: + writer.writerow(record) + ### # end cleanup of omics records that don't exist From 2145167d565932da3e82b2b297f5cf38935e131a Mon Sep 17 00:00:00 2001 From: "Giberson, Cameron" Date: Thu, 4 Apr 2024 15:20:51 -0700 Subject: [PATCH 22/44] add script for deleting metaPro records --- nmdc_schema/metap_records_delete.py | 266 ++++++++++++++++++++++++++++ 1 file changed, 266 insertions(+) create mode 100644 nmdc_schema/metap_records_delete.py diff --git a/nmdc_schema/metap_records_delete.py b/nmdc_schema/metap_records_delete.py new file mode 100644 index 0000000000..613f35e3b6 --- /dev/null +++ b/nmdc_schema/metap_records_delete.py @@ -0,0 +1,266 @@ +import json +import pymongo +import argparse +from pathlib import Path +from typing import List, Any, Dict, Tuple +from collections import defaultdict, Counter +from itertools import chain + + +def args() -> Tuple[str]: + parser = argparse.ArgumentParser() + + parser.add_argument( + "--output_dir", + type=str, + help="The output directory to save results", + required=False, + ) + parser.add_argument( + "-d", help="Delete all MetaP proteomics records", action="store_true" + ) + parser.add_argument("--username", type=str, help="MongoDB username", required=True) + parser.add_argument("--password", type=str, help="MongoDB password", required=True) + return parser.parse_args() + + +class NMDCAccessor: + def __init__(self, db): + self.db = db + + def get_metaproteomics_analysis_activity_set_documents(self) -> Any: + collection = self.db["metaproteomics_analysis_activity_set"] + documents = collection.find({}) + + return list(documents) + + def get_has_outputs_map(self) -> Dict[str, List[str]]: + documents = self.get_metaproteomics_analysis_activity_set_documents() + has_outputs_map = defaultdict(list) + + for document in documents: + has_outputs_map[document["id"]].extend(document["has_output"]) + + return dict(has_outputs_map) + + def get_data_object_set_documents(self, ids: List[str]) -> Any: + collection = self.db["data_object_set"] + query = {"id": {"$in": ids}} + documents = collection.find(query) + + return list(documents) + + def get_data_objects_from_activity_set(self) -> Any: + ids = self.get_has_outputs_map() + flattened_ids = list(chain(*ids.values())) + + return self.get_data_object_set_documents(flattened_ids) + + def get_matching_msgf_data_objects_records(self) -> Any: + collection = self.db["data_object_set"] + query = {"description": {"$regex": "MSGF"}} + documents = collection.find(query) + + return list(documents) + + def get_matching_msgf_data_object_ids(self) -> List[str]: + records = self.get_matching_msgf_data_objects_records() + + return [record["id"] for record in records] + + def get_metap_gene_function_aggregation_documents(self) -> Any: + collection = self.db["metap_gene_function_aggregation"] + documents = collection.find({}) + + return list(documents) + + def get_metaproteomics_collection_ids_to_delete_map(self) -> Dict[str, List[str]]: + metap_analy_documents = ( + self.get_metaproteomics_analysis_activity_set_documents() + ) + 
data_objects_documents = self.get_matching_msgf_data_objects_records() + + metap_ids = [ + metap_analy_document["id"] for metap_analy_document in metap_analy_documents + ] + data_objects_ids = [ + data_object_document["id"] + for data_object_document in data_objects_documents + ] + + return { + "metaproteomics_analysis_activity_set": metap_ids, + "data_object_set": data_objects_ids, + } + + def delete_matching_records_from_ids( + self, collection_name: str, ids: List[str] + ) -> None: + collection = self.db[collection_name] + filter = {"id": {"$in": ids}} + + result = collection.delete_many(filter) + print(f"Deleted {result.deleted_count} documents") + + def delete_matching_record_from_id(self, collection_name: str, id: str) -> None: + collection = self.db[collection_name] + filter = {"id": id} + + result = collection.delete_one(filter) + print(f"Deleted {result.deleted_count} documents") + + def delete_all_records_from_collection(self, collection_name: str) -> Any: + """ + A terrifying function for deleting ALL documents in a collection + """ + + collection = self.db[collection_name] + + result = collection.delete_many({}) + + print(f"Deleted {result.deleted_count} documents") + + def delete_all_metaproteomics_records(self) -> None: + metap_collection_name = "metap_gene_function_aggregation" + metaproteomics_analy_collection_name = "metaproteomics_analysis_activity_set" + data_objects_set_name = "data_object_set" + + # Drop all all from metap gene function collection. + self.delete_all_records_from_collection(metap_collection_name) + + # Drop all from metaproteomics analysis activity set collection. + self.delete_all_records_from_collection(metaproteomics_analy_collection_name) + + # Get all IDs associated with protemics job outputs. + # This search is broader than tracing down the outputs of the metaproteomics analysis activity set records' data objects + # since there appear to be dangling data objects that are not associated with any metaproteomics analysis activity records, + # but "MSGF" is in their description and absolutely associated with the proteomics pipeline + ids = self.get_matching_msgf_data_object_ids() + + for id in ids: + self.delete_matching_record_from_id(data_objects_set_name, id) + + def save_metaproteomics_analysis_activity_set(self, output_dir: Path) -> None: + output_file = output_dir.joinpath( + Path("metaproteomics_analysis_activity_set.json") + ) + documents = self.get_metaproteomics_analysis_activity_set_documents() + + with open(str(output_file), "w+") as fp: + json.dump(documents, fp, default=str, indent=2) + + def save_has_outputs_map(self, output_dir: Path) -> None: + output_file = output_dir.joinpath(Path("has_outputs_map.json")) + has_outputs_map = self.get_has_outputs_map() + + with open(str(output_file), "w+") as fp: + json.dump(has_outputs_map, fp, default=str, indent=2) + + def save_data_object_set(self, output_dir: Path) -> None: + output_file = output_dir.joinpath(Path("matching_data_objects.json")) + data_objects = self.get_data_objects_from_activity_set() + + with open(str(output_file), "w+") as fp: + json.dump(data_objects, fp, default=str, indent=2) + + def save_all_metaproteomics_ids(self, output_dir: Path) -> None: + output_file = output_dir.joinpath(Path("all_ids.json")) + id_map = self.get_has_outputs_map() + flattened_ids = list(chain(*id_map.values())) + flattened_ids.extend(id_map.keys()) + + with open(str(output_file), "w+") as fp: + json.dump(flattened_ids, fp, default=str, indent=2) + + def save_matching_msgf_data_objects(self, 
output_dir: Path) -> None: + output_file = output_dir.joinpath(Path("all_proteomics_data_objects.json")) + data_objects = self.get_matching_msgf_data_objects_records() + + with open(str(output_file), "w+") as fp: + json.dump(data_objects, fp, default=str, indent=2) + + def save_metap_gene_function_aggregation(self, output_dir: Path) -> None: + output_file = output_dir.joinpath(Path("metap_gene_function_aggregation.json")) + documents = self.get_metap_gene_function_aggregation_documents() + + with open(str(output_file), "w+") as fp: + json.dump(documents, fp, default=str, indent=2) + + def save_all_to_delete_by_ids_map(self, output_dir: Path) -> None: + output_file = output_dir.joinpath(Path("ids_to_delete.json")) + to_delete_ids_map = self.get_metaproteomics_collection_ids_to_delete_map() + + with open(str(output_file), "w+") as fp: + json.dump(to_delete_ids_map, fp, default=str, indent=2) + + def save_metap_gene_function_aggregation_stats(self, output_dir: Path) -> None: + output_file = output_dir.joinpath( + Path("metap_gene_function_aggregation_stats.json") + ) + documents_function_agg = self.get_metap_gene_function_aggregation_documents() + documents_metaproteomics_analy = ( + self.get_metaproteomics_analysis_activity_set_documents() + ) + + ids_function_agg: List[str] = [ + document["metaproteomic_analysis_id"] for document in documents_function_agg + ] + id_function_agg_counter: Counter = Counter(ids_function_agg) + ids_metaproteomics_analy: set[str] = { + document["id"] for document in documents_metaproteomics_analy + } + ids_in_metaproteomics_anly_set_map: Dict[str, int] = { + id: id in ids_metaproteomics_analy for id in id_function_agg_counter.keys() + } + + stats_json = { + "id_count": len(ids_function_agg), + "unique_id_count": len(id_function_agg_counter.keys()), + "id_frequency_map": id_function_agg_counter, + "id_found_in_metaproteomics_analysis_activity_set_map": ids_in_metaproteomics_anly_set_map, + } + + with open(str(output_file), "w+") as fp: + json.dump(stats_json, fp, default=str, indent=2) + + @staticmethod + def get_nmdc_db(username: str, password: str) -> "NMDCAccessor": + db = "nmdc" + + client = pymongo.MongoClient( + "localhost", + port=37020, + username=username, + password=password, + authSource="admin", + directConnection=True, + authMechanism="DEFAULT", + ) + + return NMDCAccessor(client[db]) + + +def main(): + args_map = args() + + accessor = NMDCAccessor.get_nmdc_db(args_map.username, args_map.password) + + if args_map.output_dir: + output = Path(args_map.output_dir) + print(f"Saving to {output}") + accessor.save_data_object_set(output) + accessor.save_metaproteomics_analysis_activity_set(output) + accessor.save_has_outputs_map(output) + accessor.save_all_metaproteomics_ids(output) + accessor.save_matching_msgf_data_objects(output) + accessor.save_metap_gene_function_aggregation_stats(output) + accessor.save_all_to_delete_by_ids_map(output) + if ( + args_map.d + ): + print("Deleting all records") + accessor.delete_all_metaproteomics_records() + + +if __name__ == "__main__": + main() From 55f087ad90982d647b82f42efd2662f8368740ba Mon Sep 17 00:00:00 2001 From: "Giberson, Cameron" Date: Thu, 11 Apr 2024 17:43:21 -0700 Subject: [PATCH 23/44] add logging to write ID of deleted record if one is available. Small refactor to clean up some messy functions. 
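Once this patch is applied, a dry run of the script might look roughly like the sketch below (the credential environment variables and the backup directory name are illustrative placeholders, not part of this change):

```shell
python nmdc_schema/metap_records_delete.py \
  --username "$MONGO_USER" --password "$MONGO_PASS" \
  --output-dir ./metap_backup \
  --dry-run
```

Dropping `--dry-run` performs the actual deletions; in either case the affected record identifiers are logged, where available, to the console and to a timestamped log file.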
--- nmdc_schema/metap_records_delete.py | 120 +++++++++++++++++++--------- 1 file changed, 83 insertions(+), 37 deletions(-) diff --git a/nmdc_schema/metap_records_delete.py b/nmdc_schema/metap_records_delete.py index 613f35e3b6..9d91da9c30 100644 --- a/nmdc_schema/metap_records_delete.py +++ b/nmdc_schema/metap_records_delete.py @@ -1,23 +1,34 @@ import json import pymongo import argparse +import logging +import time from pathlib import Path from typing import List, Any, Dict, Tuple from collections import defaultdict, Counter from itertools import chain +logger = logging.getLogger(Path(__file__).name) +logging.basicConfig( + level=logging.INFO, + handlers=[ + logging.FileHandler(f'{Path(__file__).stem}_{time.strftime("%Y%m%d-%H%M%S")}.log'), + logging.StreamHandler() + ] +) + def args() -> Tuple[str]: parser = argparse.ArgumentParser() parser.add_argument( - "--output_dir", + "--output-dir", type=str, - help="The output directory to save results", + help="Optionally write relevant records and stats in the specifiec directory. Occurs before deletion.", required=False, ) parser.add_argument( - "-d", help="Delete all MetaP proteomics records", action="store_true" + "--dry-run", help="Print, but not delete, all metaP proteomics records deletion records", action="store_true" ) parser.add_argument("--username", type=str, help="MongoDB username", required=True) parser.add_argument("--password", type=str, help="MongoDB password", required=True) @@ -28,14 +39,15 @@ class NMDCAccessor: def __init__(self, db): self.db = db - def get_metaproteomics_analysis_activity_set_documents(self) -> Any: - collection = self.db["metaproteomics_analysis_activity_set"] - documents = collection.find({}) + def get_documents_from_collection(self, collection_name: str) -> Any: + collection = self.db[collection_name] + documents = list(collection.find({})) - return list(documents) + logger.info(f"Found {len(documents)} documents in {collection_name}") + return documents def get_has_outputs_map(self) -> Dict[str, List[str]]: - documents = self.get_metaproteomics_analysis_activity_set_documents() + documents = self.get_records_from_collection("metaproteomics_analysis_activity_set") has_outputs_map = defaultdict(list) for document in documents: @@ -68,12 +80,6 @@ def get_matching_msgf_data_object_ids(self) -> List[str]: return [record["id"] for record in records] - def get_metap_gene_function_aggregation_documents(self) -> Any: - collection = self.db["metap_gene_function_aggregation"] - documents = collection.find({}) - - return list(documents) - def get_metaproteomics_collection_ids_to_delete_map(self) -> Dict[str, List[str]]: metap_analy_documents = ( self.get_metaproteomics_analysis_activity_set_documents() @@ -100,45 +106,72 @@ def delete_matching_records_from_ids( filter = {"id": {"$in": ids}} result = collection.delete_many(filter) - print(f"Deleted {result.deleted_count} documents") + logger.info(f"Deleted {result.deleted_count} documents") + + def delete_matching_record_from_id(self, collection_name: str, id: str, delete=False, should_log_id=True) -> None: + """ + Delete a record from a collection by ID. - def delete_matching_record_from_id(self, collection_name: str, id: str) -> None: + :param collection_name: The name of the collection to delete the record from. + :param id: The ID of the record to delete. + :param delete: If True, delete the record. If False, just log the record that would be deleted. + :param should_log_id: If True, log the ID of the record that would be deleted. 
If False, do not log the ID since not all records have IDs. + """ collection = self.db[collection_name] filter = {"id": id} - result = collection.delete_one(filter) - print(f"Deleted {result.deleted_count} documents") + if should_log_id: + self.__log_record_deletion_information(collection_name, id) + + if delete: + result = collection.delete_one(filter) + logger.info(f"Deleted {result.deleted_count} record(s) from {collection_name}") - def delete_all_records_from_collection(self, collection_name: str) -> Any: + def delete_all_records_from_collection(self, collection_name: str, delete=False, should_log_id=True) -> Any: """ - A terrifying function for deleting ALL documents in a collection + A terrifying function for deleting ALL records in a collection. + + :param collection_name: The name of the collection to delete all records from. + :param delete: If True, delete the records. If False, just log the records that would be deleted. + :param ided_records: If True, log the IDs of the records that would be deleted. If False, do not log the IDs since not all records have IDs. """ + logger.info(f"Deleting all records from {collection_name}") + to_delete = self.get_documents_from_collection(collection_name) collection = self.db[collection_name] - result = collection.delete_many({}) + if should_log_id: + ids = [doc["id"] for doc in to_delete] + self.__log_record_deletion_information_many(collection_name, ids) - print(f"Deleted {result.deleted_count} documents") + if delete: + result = collection.delete_many({}) + logger.info(f"Deleted {result.deleted_count} record(s) from {collection_name}") - def delete_all_metaproteomics_records(self) -> None: + def delete_all_metaproteomics_records(self, delete = False) -> None: + """ + Delete all metaproteomics records. + + :param delete: If True, delete the records. If False, just log the records that would be deleted. + """ metap_collection_name = "metap_gene_function_aggregation" metaproteomics_analy_collection_name = "metaproteomics_analysis_activity_set" data_objects_set_name = "data_object_set" - # Drop all all from metap gene function collection. - self.delete_all_records_from_collection(metap_collection_name) + # Drop all from metap gene function collection. + self.delete_all_records_from_collection(metap_collection_name, delete=delete, should_log_id=False) # Drop all from metaproteomics analysis activity set collection. - self.delete_all_records_from_collection(metaproteomics_analy_collection_name) + self.delete_all_records_from_collection(metaproteomics_analy_collection_name, delete=delete, should_log_id=True) # Get all IDs associated with protemics job outputs. 
# This search is broader than tracing down the outputs of the metaproteomics analysis activity set records' data objects # since there appear to be dangling data objects that are not associated with any metaproteomics analysis activity records, # but "MSGF" is in their description and absolutely associated with the proteomics pipeline ids = self.get_matching_msgf_data_object_ids() - + logger.info(f'Found {len(ids)} matching records in {data_objects_set_name}') for id in ids: - self.delete_matching_record_from_id(data_objects_set_name, id) + self.delete_matching_record_from_id(data_objects_set_name, id, delete=delete, should_log_id=True) def save_metaproteomics_analysis_activity_set(self, output_dir: Path) -> None: output_file = output_dir.joinpath( @@ -181,7 +214,7 @@ def save_matching_msgf_data_objects(self, output_dir: Path) -> None: def save_metap_gene_function_aggregation(self, output_dir: Path) -> None: output_file = output_dir.joinpath(Path("metap_gene_function_aggregation.json")) - documents = self.get_metap_gene_function_aggregation_documents() + documents = self.get_documents_from_collection("metap_gene_function_aggregation") with open(str(output_file), "w+") as fp: json.dump(documents, fp, default=str, indent=2) @@ -197,9 +230,9 @@ def save_metap_gene_function_aggregation_stats(self, output_dir: Path) -> None: output_file = output_dir.joinpath( Path("metap_gene_function_aggregation_stats.json") ) - documents_function_agg = self.get_metap_gene_function_aggregation_documents() + documents_function_agg = self.get_documents_from_collection("metap_gene_function_aggregation") documents_metaproteomics_analy = ( - self.get_metaproteomics_analysis_activity_set_documents() + self.get_documents_from_collection("metaproteomics_analysis_activity_set") ) ids_function_agg: List[str] = [ @@ -223,6 +256,14 @@ def save_metap_gene_function_aggregation_stats(self, output_dir: Path) -> None: with open(str(output_file), "w+") as fp: json.dump(stats_json, fp, default=str, indent=2) + def __log_record_deletion_information_many(self, collection_name: str, ids: List[str]) -> None: + for id in ids: + self.__log_record_deletion_information(collection_name, id) + + def __log_record_deletion_information(self, collection_name: str, id: str) -> None: + logger.info(f"Deleting record with ID: {id} from {collection_name}") + + @staticmethod def get_nmdc_db(username: str, password: str) -> "NMDCAccessor": db = "nmdc" @@ -247,7 +288,7 @@ def main(): if args_map.output_dir: output = Path(args_map.output_dir) - print(f"Saving to {output}") + logger.info(f"Saving to {output}") accessor.save_data_object_set(output) accessor.save_metaproteomics_analysis_activity_set(output) accessor.save_has_outputs_map(output) @@ -255,11 +296,16 @@ def main(): accessor.save_matching_msgf_data_objects(output) accessor.save_metap_gene_function_aggregation_stats(output) accessor.save_all_to_delete_by_ids_map(output) - if ( - args_map.d - ): - print("Deleting all records") - accessor.delete_all_metaproteomics_records() + + if args_map.dry_run: + logger.info("Dry run: no records will be deleted") + else: + logger.info("Deleting all records") + + # Being very explicit about the deletion of records + delete = not args_map.dry_run + + accessor.delete_all_metaproteomics_records(delete=delete) if __name__ == "__main__": From 4823fcbb61ceadcc58e62763310701e241a9e0e1 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 23 Apr 2024 18:09:23 -0700 Subject: [PATCH 24/44] Generate `/queries:run` request body for each collection --- 
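Each file written by this change holds one request body per collection, of the form `{"delete": "<collection_name>", "deletes": [{"q": {"id": "<document_id>"}, "limit": 1}, ...]}`. As a rough sketch of how such a file could later be submitted to the Runtime API (the file name, the token variable, and the use of curl are illustrative assumptions, not part of this change):

```shell
curl -X POST "https://api.microbiomedata.org/queries:run" \
  -H "Authorization: Bearer $NMDC_RUNTIME_TOKEN" \
  -H "Content-Type: application/json" \
  --data @data_object_set.deletion_api_request_body.json
```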
nmdc_schema/connect_napa_mongo.py | 51 +++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 30a09f0c76..3c9f65e1ae 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -503,5 +503,56 @@ def update_studies_to_napa_standards(): for record in deleted_record_identifiers: writer.writerow(record) + +def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: + r""" + Creates a deletion descriptor for each collection name-document ID pair in + the specified list. + + A deletion descriptor is a dictionary that, when converted into JSON, can + be used within the body of a request to the `/queries:run` endpoint of the + Runtime API. The deletion descriptors are grouped by collection, since the + `/queries:run` endpoint only processes documents in a single collection + per each HTTP request. + """ + + deletion_descriptors = dict() + for collection_name_and_document_id in collection_names_and_document_ids: + + # Extract the elements of the tuple. + (collection_name, document_id) = collection_name_and_document_id + + # Initialize this collection's list of deletion descriptors. + if collection_name not in deletion_descriptors: + deletion_descriptors[collection_name] = [] + + # Create and append a deletion descriptor for this item. + deletion_descriptor = dict(q=dict(id=document_id), limit=1) + deletion_descriptors[collection_name].append(deletion_descriptor) + + return deletion_descriptors + + +def dump_request_body(collection_name: str, its_deletion_descriptors: list) -> str: + r""" + Creates a request body into which the specified deletion descriptors are + incorporated, and writes them to a JSON file. That request body can be + submitted to the `/queries:run` endpoint of the Runtime API. + """ + + file_path = f"./{collection_name}.deletion_api_request_body.json" + with open(file_path, "w") as json_file: + api_request_body = dict(delete=collection_name, deletes=its_deletion_descriptors) + json.dump(api_request_body, json_file) + + return file_path + + +# Create JSON files, each of which contains a request body for the `/queries:json` endpoint. +deletion_descriptors = make_deletion_descriptors(deleted_record_identifiers) +for collection_name in deletion_descriptors.keys(): + dump_request_body(collection_name, deletion_descriptors[collection_name]) + + ### # end cleanup of omics records that don't exist From 33c1d8d1c0ccdb660f36a62db74d0b3e58077fa5 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 23 Apr 2024 18:12:15 -0700 Subject: [PATCH 25/44] Clarify comments --- nmdc_schema/connect_napa_mongo.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 3c9f65e1ae..387b731d3b 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -513,7 +513,7 @@ def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: be used within the body of a request to the `/queries:run` endpoint of the Runtime API. The deletion descriptors are grouped by collection, since the `/queries:run` endpoint only processes documents in a single collection - per each HTTP request. + per HTTP request. 
""" deletion_descriptors = dict() @@ -526,7 +526,7 @@ def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: if collection_name not in deletion_descriptors: deletion_descriptors[collection_name] = [] - # Create and append a deletion descriptor for this item. + # Create and append a deletion descriptor for this document. deletion_descriptor = dict(q=dict(id=document_id), limit=1) deletion_descriptors[collection_name].append(deletion_descriptor) @@ -536,7 +536,7 @@ def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: def dump_request_body(collection_name: str, its_deletion_descriptors: list) -> str: r""" Creates a request body into which the specified deletion descriptors are - incorporated, and writes them to a JSON file. That request body can be + incorporated, and writes them to a JSON file. That request body can then be submitted to the `/queries:run` endpoint of the Runtime API. """ From 1f115d7e15ee052158f82acebc7b3fa10a7184e5 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 23 Apr 2024 18:21:06 -0700 Subject: [PATCH 26/44] Add doctests --- nmdc_schema/connect_napa_mongo.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 387b731d3b..ba2c24c660 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -514,6 +514,18 @@ def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: Runtime API. The deletion descriptors are grouped by collection, since the `/queries:run` endpoint only processes documents in a single collection per HTTP request. + + Note: The remainder of this docstring consists of doctests. + Reference: https://docs.python.org/3/library/doctest.html + + >>> make_deletion_descriptors([]) + {} + >>> make_deletion_descriptors([("my_collection", "my_id")]) + {'my_collection': [{'q': {'id': 'my_id'}, 'limit': 1}]} + >>> make_deletion_descriptors([("my_collection", "my_id"), ("my_collection", "other_id")]) + {'my_collection': [{'q': {'id': 'my_id'}, 'limit': 1}, {'q': {'id': 'other_id'}, 'limit': 1}]} + >>> make_deletion_descriptors([("my_collection", "my_id"), ("other_collection", "other_id")]) + {'my_collection': [{'q': {'id': 'my_id'}, 'limit': 1}], 'other_collection': [{'q': {'id': 'other_id'}, 'limit': 1}]} """ deletion_descriptors = dict() From 4f2ce9a6fcd2bf5c98d4bf75323f487978b997d9 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 23 Apr 2024 19:34:37 -0700 Subject: [PATCH 27/44] Refactor code to facilitate testing --- nmdc_schema/connect_napa_mongo.py | 35 +++++++++++++++++++------------ 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index ba2c24c660..e2e9743ec9 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -539,32 +539,41 @@ def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: deletion_descriptors[collection_name] = [] # Create and append a deletion descriptor for this document. 
- deletion_descriptor = dict(q=dict(id=document_id), limit=1) + deletion_descriptor = {"q": {"id": document_id}, "limit": 1} deletion_descriptors[collection_name].append(deletion_descriptor) return deletion_descriptors -def dump_request_body(collection_name: str, its_deletion_descriptors: list) -> str: +def make_request_body(collection_name: str, its_deletion_descriptors: list) -> dict: r""" Creates a request body into which the specified deletion descriptors are - incorporated, and writes them to a JSON file. That request body can then be - submitted to the `/queries:run` endpoint of the Runtime API. - """ + incorporated. That request body can then be submitted to the `/queries:run` + endpoint of the Runtime API. - file_path = f"./{collection_name}.deletion_api_request_body.json" - with open(file_path, "w") as json_file: - api_request_body = dict(delete=collection_name, deletes=its_deletion_descriptors) - json.dump(api_request_body, json_file) + Note: The remainder of this docstring consists of doctests. - return file_path + >>> make_request_body("my_collection", []) + {'delete': 'my_collection', 'deletes': []} + >>> make_request_body("my_collection", [{'q': {'id': 'my_id'}, 'limit': 1}]) + {'delete': 'my_collection', 'deletes': [{'q': {'id': 'my_id'}, 'limit': 1}]} + >>> make_request_body("my_collection", [{'q': {'id': 'my_id'}, 'limit': 1}, {'q': {'id': 'other_id'}, 'limit': 1}]) + {'delete': 'my_collection', 'deletes': [{'q': {'id': 'my_id'}, 'limit': 1}, {'q': {'id': 'other_id'}, 'limit': 1}]} + """ + return {"delete": collection_name, "deletes": its_deletion_descriptors} -# Create JSON files, each of which contains a request body for the `/queries:json` endpoint. + +# For each collection that has any deletion descriptors, create a JSON file +# containing an HTTP request body compatible with the `/queries:json` endpoint. deletion_descriptors = make_deletion_descriptors(deleted_record_identifiers) for collection_name in deletion_descriptors.keys(): - dump_request_body(collection_name, deletion_descriptors[collection_name]) - + its_deletion_descriptors = deletion_descriptors[collection_name] + file_path = f"./{collection_name}.deletion_api_request_body.json" + with open(file_path, "w") as json_file: + request_body = make_request_body(collection_name, its_deletion_descriptors) + json.dump(request_body, json_file) + print(f"Created file: {file_path}") ### # end cleanup of omics records that don't exist From a116268e014f40bf1b1165e5c58bc21bbfc4481c Mon Sep 17 00:00:00 2001 From: eecavanna Date: Tue, 23 Apr 2024 21:10:23 -0700 Subject: [PATCH 28/44] Move statement closer to related statement --- nmdc_schema/connect_napa_mongo.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index e2e9743ec9..ca4b1f5fb2 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -530,7 +530,6 @@ def make_deletion_descriptors(collection_names_and_document_ids: list) -> dict: deletion_descriptors = dict() for collection_name_and_document_id in collection_names_and_document_ids: - # Extract the elements of the tuple. 
(collection_name, document_id) = collection_name_and_document_id @@ -569,9 +568,9 @@ def make_request_body(collection_name: str, its_deletion_descriptors: list) -> d deletion_descriptors = make_deletion_descriptors(deleted_record_identifiers) for collection_name in deletion_descriptors.keys(): its_deletion_descriptors = deletion_descriptors[collection_name] + request_body = make_request_body(collection_name, its_deletion_descriptors) file_path = f"./{collection_name}.deletion_api_request_body.json" with open(file_path, "w") as json_file: - request_body = make_request_body(collection_name, its_deletion_descriptors) json.dump(request_body, json_file) print(f"Created file: {file_path}") From 0694dfc24df6b83242a989f59779db7bb8b356f9 Mon Sep 17 00:00:00 2001 From: eecavanna Date: Wed, 24 Apr 2024 11:59:21 -0700 Subject: [PATCH 29/44] Add boolean flag preventing delete statements from being executed --- nmdc_schema/connect_napa_mongo.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index ca4b1f5fb2..4b617948d7 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -443,6 +443,10 @@ def update_studies_to_napa_standards(): ############### # track down records WorkflowExecutionActivity (WEA) records that need to be deleted and their associated data objects +# Flag you can use to control whether you want this script to also delete the documents it finds, or to only +# generate the `/queries:run` HTTP request bodies that can be used to delete them later via the Runtime API. +DELETE_DOCUMENTS_NOW: bool = False + seq_based_collection_list = [ "read_qc_analysis_activity_set", "read_based_taxonomy_analysis_activity_set", @@ -474,7 +478,8 @@ def update_studies_to_napa_standards(): print("found " + doc["id"] + " in collection " + collection) deleted_record_identifiers.append((collection, doc["id"])) # wea_to_delete.append(doc) - wea_coll.delete_one({"was_informed_by": gold_proj_curie}) + if DELETE_DOCUMENTS_NOW: + wea_coll.delete_one({"was_informed_by": gold_proj_curie}) # this method should not be used as there are data objects that need to be removed that are not listed in has_output for the WEA records # if "has_input" in doc.keys(): # for input in doc["has_input"]: @@ -494,7 +499,8 @@ def update_studies_to_napa_standards(): # Delete each matching document for doc in matching_docs: deleted_record_identifiers.append(("data_object_set", doc["id"])) - data_object_coll.delete_one({"_id": doc["_id"]}) + if DELETE_DOCUMENTS_NOW: + data_object_coll.delete_one({"_id": doc["_id"]}) # Print the list of deleted record identifiers to a tsv file with open("deleted_record_identifiers.tsv", "w") as f: From d1a79d1bbed264ae208574957ba954bd1c994cbf Mon Sep 17 00:00:00 2001 From: Michael Thornton Date: Mon, 29 Apr 2024 16:58:27 -0700 Subject: [PATCH 30/44] updates to command-line args --- nmdc_schema/metap_records_delete.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/nmdc_schema/metap_records_delete.py b/nmdc_schema/metap_records_delete.py index 9d91da9c30..236a570e23 100644 --- a/nmdc_schema/metap_records_delete.py +++ b/nmdc_schema/metap_records_delete.py @@ -30,8 +30,9 @@ def args() -> Tuple[str]: parser.add_argument( "--dry-run", help="Print, but not delete, all metaP proteomics records deletion records", action="store_true" ) - parser.add_argument("--username", type=str, help="MongoDB username", required=True) - 
parser.add_argument("--password", type=str, help="MongoDB password", required=True) + # parser.add_argument("--username", type=str, help="MongoDB username", required=True) + # parser.add_argument("--password", type=str, help="MongoDB password", required=True) + parser.add_argument("--mongo-uri", type=str, help="MongoDB URI", required=False, default="mongodb://localhost:27017",) return parser.parse_args() @@ -47,7 +48,7 @@ def get_documents_from_collection(self, collection_name: str) -> Any: return documents def get_has_outputs_map(self) -> Dict[str, List[str]]: - documents = self.get_records_from_collection("metaproteomics_analysis_activity_set") + documents = self.get_documents_from_collection("metaproteomics_analysis_activity_set") has_outputs_map = defaultdict(list) for document in documents: @@ -265,17 +266,12 @@ def __log_record_deletion_information(self, collection_name: str, id: str) -> No @staticmethod - def get_nmdc_db(username: str, password: str) -> "NMDCAccessor": + def get_nmdc_db(mongo_uri: str) -> "NMDCAccessor": db = "nmdc" client = pymongo.MongoClient( - "localhost", - port=37020, - username=username, - password=password, - authSource="admin", + mongo_uri, directConnection=True, - authMechanism="DEFAULT", ) return NMDCAccessor(client[db]) @@ -284,7 +280,7 @@ def get_nmdc_db(username: str, password: str) -> "NMDCAccessor": def main(): args_map = args() - accessor = NMDCAccessor.get_nmdc_db(args_map.username, args_map.password) + accessor = NMDCAccessor.get_nmdc_db(mongo_uri=args_map.mongo_uri) if args_map.output_dir: output = Path(args_map.output_dir) From b425f26fb9afd7dc21895223f1bde111b3158a03 Mon Sep 17 00:00:00 2001 From: "Giberson, Cameron" Date: Mon, 29 Apr 2024 17:36:11 -0700 Subject: [PATCH 31/44] delete unused functions and remove associated command-line args --- nmdc_schema/metap_records_delete.py | 156 +--------------------------- 1 file changed, 5 insertions(+), 151 deletions(-) diff --git a/nmdc_schema/metap_records_delete.py b/nmdc_schema/metap_records_delete.py index 236a570e23..38ab3f6a33 100644 --- a/nmdc_schema/metap_records_delete.py +++ b/nmdc_schema/metap_records_delete.py @@ -1,12 +1,10 @@ -import json -import pymongo import argparse import logging import time from pathlib import Path -from typing import List, Any, Dict, Tuple -from collections import defaultdict, Counter -from itertools import chain +from typing import List, Any, Tuple + +import pymongo logger = logging.getLogger(Path(__file__).name) @@ -22,16 +20,8 @@ def args() -> Tuple[str]: parser = argparse.ArgumentParser() parser.add_argument( - "--output-dir", - type=str, - help="Optionally write relevant records and stats in the specifiec directory. 
Occurs before deletion.", - required=False, - ) - parser.add_argument( - "--dry-run", help="Print, but not delete, all metaP proteomics records deletion records", action="store_true" + "--dry-run", help="Print, but not delete, all metaP proteomics records", action="store_true" ) - # parser.add_argument("--username", type=str, help="MongoDB username", required=True) - # parser.add_argument("--password", type=str, help="MongoDB password", required=True) parser.add_argument("--mongo-uri", type=str, help="MongoDB URI", required=False, default="mongodb://localhost:27017",) return parser.parse_args() @@ -47,28 +37,6 @@ def get_documents_from_collection(self, collection_name: str) -> Any: logger.info(f"Found {len(documents)} documents in {collection_name}") return documents - def get_has_outputs_map(self) -> Dict[str, List[str]]: - documents = self.get_documents_from_collection("metaproteomics_analysis_activity_set") - has_outputs_map = defaultdict(list) - - for document in documents: - has_outputs_map[document["id"]].extend(document["has_output"]) - - return dict(has_outputs_map) - - def get_data_object_set_documents(self, ids: List[str]) -> Any: - collection = self.db["data_object_set"] - query = {"id": {"$in": ids}} - documents = collection.find(query) - - return list(documents) - - def get_data_objects_from_activity_set(self) -> Any: - ids = self.get_has_outputs_map() - flattened_ids = list(chain(*ids.values())) - - return self.get_data_object_set_documents(flattened_ids) - def get_matching_msgf_data_objects_records(self) -> Any: collection = self.db["data_object_set"] query = {"description": {"$regex": "MSGF"}} @@ -81,25 +49,6 @@ def get_matching_msgf_data_object_ids(self) -> List[str]: return [record["id"] for record in records] - def get_metaproteomics_collection_ids_to_delete_map(self) -> Dict[str, List[str]]: - metap_analy_documents = ( - self.get_metaproteomics_analysis_activity_set_documents() - ) - data_objects_documents = self.get_matching_msgf_data_objects_records() - - metap_ids = [ - metap_analy_document["id"] for metap_analy_document in metap_analy_documents - ] - data_objects_ids = [ - data_object_document["id"] - for data_object_document in data_objects_documents - ] - - return { - "metaproteomics_analysis_activity_set": metap_ids, - "data_object_set": data_objects_ids, - } - def delete_matching_records_from_ids( self, collection_name: str, ids: List[str] ) -> None: @@ -134,7 +83,7 @@ def delete_all_records_from_collection(self, collection_name: str, delete=False, :param collection_name: The name of the collection to delete all records from. :param delete: If True, delete the records. If False, just log the records that would be deleted. - :param ided_records: If True, log the IDs of the records that would be deleted. If False, do not log the IDs since not all records have IDs. + :param should_log_id: If True, log the IDs of the records that would be deleted. If False, do not log the IDs since not all records have IDs. 
""" logger.info(f"Deleting all records from {collection_name}") @@ -174,89 +123,6 @@ def delete_all_metaproteomics_records(self, delete = False) -> None: for id in ids: self.delete_matching_record_from_id(data_objects_set_name, id, delete=delete, should_log_id=True) - def save_metaproteomics_analysis_activity_set(self, output_dir: Path) -> None: - output_file = output_dir.joinpath( - Path("metaproteomics_analysis_activity_set.json") - ) - documents = self.get_metaproteomics_analysis_activity_set_documents() - - with open(str(output_file), "w+") as fp: - json.dump(documents, fp, default=str, indent=2) - - def save_has_outputs_map(self, output_dir: Path) -> None: - output_file = output_dir.joinpath(Path("has_outputs_map.json")) - has_outputs_map = self.get_has_outputs_map() - - with open(str(output_file), "w+") as fp: - json.dump(has_outputs_map, fp, default=str, indent=2) - - def save_data_object_set(self, output_dir: Path) -> None: - output_file = output_dir.joinpath(Path("matching_data_objects.json")) - data_objects = self.get_data_objects_from_activity_set() - - with open(str(output_file), "w+") as fp: - json.dump(data_objects, fp, default=str, indent=2) - - def save_all_metaproteomics_ids(self, output_dir: Path) -> None: - output_file = output_dir.joinpath(Path("all_ids.json")) - id_map = self.get_has_outputs_map() - flattened_ids = list(chain(*id_map.values())) - flattened_ids.extend(id_map.keys()) - - with open(str(output_file), "w+") as fp: - json.dump(flattened_ids, fp, default=str, indent=2) - - def save_matching_msgf_data_objects(self, output_dir: Path) -> None: - output_file = output_dir.joinpath(Path("all_proteomics_data_objects.json")) - data_objects = self.get_matching_msgf_data_objects_records() - - with open(str(output_file), "w+") as fp: - json.dump(data_objects, fp, default=str, indent=2) - - def save_metap_gene_function_aggregation(self, output_dir: Path) -> None: - output_file = output_dir.joinpath(Path("metap_gene_function_aggregation.json")) - documents = self.get_documents_from_collection("metap_gene_function_aggregation") - - with open(str(output_file), "w+") as fp: - json.dump(documents, fp, default=str, indent=2) - - def save_all_to_delete_by_ids_map(self, output_dir: Path) -> None: - output_file = output_dir.joinpath(Path("ids_to_delete.json")) - to_delete_ids_map = self.get_metaproteomics_collection_ids_to_delete_map() - - with open(str(output_file), "w+") as fp: - json.dump(to_delete_ids_map, fp, default=str, indent=2) - - def save_metap_gene_function_aggregation_stats(self, output_dir: Path) -> None: - output_file = output_dir.joinpath( - Path("metap_gene_function_aggregation_stats.json") - ) - documents_function_agg = self.get_documents_from_collection("metap_gene_function_aggregation") - documents_metaproteomics_analy = ( - self.get_documents_from_collection("metaproteomics_analysis_activity_set") - ) - - ids_function_agg: List[str] = [ - document["metaproteomic_analysis_id"] for document in documents_function_agg - ] - id_function_agg_counter: Counter = Counter(ids_function_agg) - ids_metaproteomics_analy: set[str] = { - document["id"] for document in documents_metaproteomics_analy - } - ids_in_metaproteomics_anly_set_map: Dict[str, int] = { - id: id in ids_metaproteomics_analy for id in id_function_agg_counter.keys() - } - - stats_json = { - "id_count": len(ids_function_agg), - "unique_id_count": len(id_function_agg_counter.keys()), - "id_frequency_map": id_function_agg_counter, - "id_found_in_metaproteomics_analysis_activity_set_map": 
ids_in_metaproteomics_anly_set_map, - } - - with open(str(output_file), "w+") as fp: - json.dump(stats_json, fp, default=str, indent=2) - def __log_record_deletion_information_many(self, collection_name: str, ids: List[str]) -> None: for id in ids: self.__log_record_deletion_information(collection_name, id) @@ -264,7 +130,6 @@ def __log_record_deletion_information_many(self, collection_name: str, ids: List def __log_record_deletion_information(self, collection_name: str, id: str) -> None: logger.info(f"Deleting record with ID: {id} from {collection_name}") - @staticmethod def get_nmdc_db(mongo_uri: str) -> "NMDCAccessor": db = "nmdc" @@ -282,17 +147,6 @@ def main(): accessor = NMDCAccessor.get_nmdc_db(mongo_uri=args_map.mongo_uri) - if args_map.output_dir: - output = Path(args_map.output_dir) - logger.info(f"Saving to {output}") - accessor.save_data_object_set(output) - accessor.save_metaproteomics_analysis_activity_set(output) - accessor.save_has_outputs_map(output) - accessor.save_all_metaproteomics_ids(output) - accessor.save_matching_msgf_data_objects(output) - accessor.save_metap_gene_function_aggregation_stats(output) - accessor.save_all_to_delete_by_ids_map(output) - if args_map.dry_run: logger.info("Dry run: no records will be deleted") else: From 752ce8490bf72659c10c5761d1d02ba4d8afc2bb Mon Sep 17 00:00:00 2001 From: aclum Date: Fri, 17 May 2024 17:32:19 -0700 Subject: [PATCH 32/44] start at emp500 removing duplicate workflow runs --- nmdc_schema/fix_emp500.py | 58 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 nmdc_schema/fix_emp500.py diff --git a/nmdc_schema/fix_emp500.py b/nmdc_schema/fix_emp500.py new file mode 100644 index 0000000000..17d37de199 --- /dev/null +++ b/nmdc_schema/fix_emp500.py @@ -0,0 +1,58 @@ +import pymongo + +db="nmdc" + +study_id="" +def get_omics_sharing_was_informed_by(study_id): + pipeline =[ + { "$group": { "_id": "$was_informed_by", "count": { "$sum": 1 } } }, + { "$match": { "count": { "$gt": 1 } } }, + { "$lookup": { "from": "omics_processing_set", "localField": "_id", "foreignField": "id", "as": "omics_processing_set" } }, + { "$match": { "omics_processing_set.part_of": study_id } +] + + coll="metagenome_assembly_set" + multiple_workflow_results=db.coll.aggregate(pipeline) + return(multiple_workflow_results) + +def get_records_to_keep(multiple_workflow_results): + + + asm_coll="metagenome_assembly_set"" + qc_coll="read_qc_analysis_activity_set" + rbt_coll="read_based_taxonomy_analysis_activity_set" + omics_project="" + #look for the newest assembly record, look up upstream filtering record and reads based taxonomy analysis that uses the same record + #to do: add logic to make sure qc and reads based taxonomy exist + pick_records_pipeline = [ + { "$match": { + "was_informed_by": omics_project} + }, + { "$sort": { "_id": -1 } }, + { "$limit": 1 }, + { "$lookup": { + "from": "read_qc_analysis_activity_set", + "localField": "has_input", + "foreignField": "has_output", + "as": "upstream_filtering" + } + }, + { "$lookup": { + "from": "read_based_taxonomy_analysis_activity_set", + "localField": "has_input", + "foreignField": "has_input", + "as": "rbt_record_to_keep" } + } + ] + asm_keep=[] + qc_keep=[] + rbt_keep=[] + for doc in multiple_workflow_results: + omics_record=doc["_id"] + records=db.asm_coll.aggregate(pick_records_pipeline) + if len(records) == 1: + asm_keep.append(records["id"]) + qc_keep.append(records["upstream_filtering.id"]) + rbt_keep.append(records["rbt_record_to_keep.id"]) + + From 
7555e70b1a7eba903e85ed660586a2e727968ff1 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:02:42 -0700 Subject: [PATCH 33/44] making sure local and remote are the same --- nmdc_schema/fix_emp500.py | 58 -------- nmdc_schema/fix_emp_500.py | 3 + nmdc_schema/neon_fail_records.py | 133 ++++++++++++++++++ .../repair_metagenome_sequencing_set.py | 18 +++ 4 files changed, 154 insertions(+), 58 deletions(-) delete mode 100644 nmdc_schema/fix_emp500.py create mode 100644 nmdc_schema/fix_emp_500.py create mode 100644 nmdc_schema/neon_fail_records.py create mode 100644 nmdc_schema/repair_metagenome_sequencing_set.py diff --git a/nmdc_schema/fix_emp500.py b/nmdc_schema/fix_emp500.py deleted file mode 100644 index 17d37de199..0000000000 --- a/nmdc_schema/fix_emp500.py +++ /dev/null @@ -1,58 +0,0 @@ -import pymongo - -db="nmdc" - -study_id="" -def get_omics_sharing_was_informed_by(study_id): - pipeline =[ - { "$group": { "_id": "$was_informed_by", "count": { "$sum": 1 } } }, - { "$match": { "count": { "$gt": 1 } } }, - { "$lookup": { "from": "omics_processing_set", "localField": "_id", "foreignField": "id", "as": "omics_processing_set" } }, - { "$match": { "omics_processing_set.part_of": study_id } -] - - coll="metagenome_assembly_set" - multiple_workflow_results=db.coll.aggregate(pipeline) - return(multiple_workflow_results) - -def get_records_to_keep(multiple_workflow_results): - - - asm_coll="metagenome_assembly_set"" - qc_coll="read_qc_analysis_activity_set" - rbt_coll="read_based_taxonomy_analysis_activity_set" - omics_project="" - #look for the newest assembly record, look up upstream filtering record and reads based taxonomy analysis that uses the same record - #to do: add logic to make sure qc and reads based taxonomy exist - pick_records_pipeline = [ - { "$match": { - "was_informed_by": omics_project} - }, - { "$sort": { "_id": -1 } }, - { "$limit": 1 }, - { "$lookup": { - "from": "read_qc_analysis_activity_set", - "localField": "has_input", - "foreignField": "has_output", - "as": "upstream_filtering" - } - }, - { "$lookup": { - "from": "read_based_taxonomy_analysis_activity_set", - "localField": "has_input", - "foreignField": "has_input", - "as": "rbt_record_to_keep" } - } - ] - asm_keep=[] - qc_keep=[] - rbt_keep=[] - for doc in multiple_workflow_results: - omics_record=doc["_id"] - records=db.asm_coll.aggregate(pick_records_pipeline) - if len(records) == 1: - asm_keep.append(records["id"]) - qc_keep.append(records["upstream_filtering.id"]) - rbt_keep.append(records["rbt_record_to_keep.id"]) - - diff --git a/nmdc_schema/fix_emp_500.py b/nmdc_schema/fix_emp_500.py new file mode 100644 index 0000000000..70fe6181cb --- /dev/null +++ b/nmdc_schema/fix_emp_500.py @@ -0,0 +1,3 @@ + + +def diff --git a/nmdc_schema/neon_fail_records.py b/nmdc_schema/neon_fail_records.py new file mode 100644 index 0000000000..2ebd2beba3 --- /dev/null +++ b/nmdc_schema/neon_fail_records.py @@ -0,0 +1,133 @@ +import json +#from bson import json_util, ObjectId +import pymongo +from pymongo import MongoClient +from pymongo.errors import ConnectionFailure + +prod_mongo = "mongodb://aclum:wqNj7hWW*oWmzQ2FsL%40f@localhost:37019/?authMechanism=SCRAM-SHA-256&authSource=admin&directConnection=true" +client = MongoClient(prod_mongo) +mydb = client["nmdc"] + +#for db in client.list_database_names(): +# print(db) +#collection names +asm_coll= mydb["metagenome_assembly_set"] +qc_coll=mydb["read_qc_analysis_activity_set"] +rbt_coll=mydb["read_based_taxonomy_analysis_activity_set"] 
+ann_coll=mydb["metagenome_annotation_activity_set"] +mags_coll=mydb["mags_activity_set"] +omics_coll=mydb["omics_processing_set"] +ann_agg_coll=mydb["functional_annotation_agg"] +data_object_coll = mydb["data_object_set"] + +#asm_cursor=asm_coll.find({"was_informed_by":"nmdc:omprc-11-c82tqn53"}) +#print(len(list(asm_cursor))) + +del_asm_coll=[] +del_qc_coll=[] +del_rbt_coll=[] +del_ann_coll=[] +del_mags_coll=[] +del_do=[] +del_omics=[] +collections_dict={asm_coll:del_asm_coll, + qc_coll:del_qc_coll, + rbt_coll:del_rbt_coll, + ann_coll:del_ann_coll, + mags_coll:del_mags_coll +} + +del_dict={"metagenome_assembly_set":del_asm_coll, + "read_qc_analysis_activity_set":del_qc_coll, + "read_based_taxonomy_analysis_activity_set":del_rbt_coll, + "metagenome_annotation_activity_set": del_ann_coll, + "mags_activity_set":del_mags_coll, + "data_object_set":del_do, + "omics_processing_set":del_omics +} + +neon_omics = open("/Users/aclum/Downloads/neon_omics_to_fail.txt", "r") + + +img_list=[] +omics_list=[] +#get workflow activity ID and data object IDs to delete +for omics in neon_omics: + omics_list.append(omics.strip()) + for coll in collections_dict: + query=({"was_informed_by":omics.strip()}) + wxa_record=coll.find(query) + for doc in wxa_record: + collections_dict[coll].append(doc["id"]) + img_info=[doc["id"],doc["was_informed_by"],doc["type"]] + list = ",".join(img_info) + img_list.append(list) + for output in doc["has_output"]: + del_do.append(output) + +with open ("neon_delete_img.csv", 'w') as f: + for line in img_list: + f.write(f"{line}\n") + +#for coll in collections_dict: +# print(coll) +# print(collections_dict[coll]) +# + + +for omics in omics_list: + query=({"id":omics}) + omics_doc=omics_coll.find_one(query) + del_omics.append(omics_doc["id"]) + for output in omics_doc["has_output"]: + del_do.append(output) + print(data_object_coll.find_one({"id":output})) + +pipeline= [ + { "$match": { "id": omics } }, + { + "$lookup": { + "from": "data_object_set", + "localField": "has_output", + "foreignField": "id", + "as": "seq_files" + } + }, + { + "$project": { + "name": 1, + "id": 1, + "seq_files.id": 1, + "seq_files.url": 1 + } + } + ] + +omics_full_record=[] +for record in omics: + omics_agg_doc=omics_coll.aggregate(pipeline) + for doc in omics_agg_doc: + print(doc) + omics_full_record.append(doc) + +#json_object=json_util.dumps(omics_full_record) +#with open ("20240523_neon_soil_bad_pairs.json", 'w') as f: +# f.write(json_object) + +for del_coll,del_values in del_dict.items(): + request_body_file=del_coll + "request_body.json" + request_deletes=[] + for value in del_values: + request_deletes.append({"q": {"id": value}, "limit":1}) + request_body_json=({"delete": del_coll,"deletes": request_deletes}) + with open (request_body_file, 'w') as f: + json.dump(request_body_json,f) + +request_agg_del=[] +for ann in del_ann_coll: + request_agg_del.append({"q": {"metagenome_annotation_id": ann}, "limit":0}) +request_body_json=({"delete": "functional_annotation_agg","deletes": request_agg_del}) +with open ("agg_delete_request.json",'w') as f: + json.dump(request_body_json,f) + +neon_omics.close() diff --git a/nmdc_schema/repair_metagenome_sequencing_set.py b/nmdc_schema/repair_metagenome_sequencing_set.py new file mode 100644 index 0000000000..236c8a007a --- /dev/null +++ b/nmdc_schema/repair_metagenome_sequencing_set.py @@ -0,0 +1,18 @@ +import json +update_list=[] + +file= open ('/Users/aclum/Downloads/nmdc.metagenome_sequencing_activity_set.csv','r') + +for i in file.readlines(): + i=i.strip() + 
value_array=i.split(',') + update_list.append({"q": {"id": value_array[1] }, "u": {"$set": {"has_input": [value_array[2]]}}}) + +#print(update_list) +json_body=({ + "update": "metagenome_sequencing_activity_set", + "updates": update_list}) + +with open ("mg_seq_repair_inputs.json",'w') as f: + json.dump(json_body,f) + From db50a9252ac22c60da971588f7e2780b0c0e9a1c Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:03:59 -0700 Subject: [PATCH 34/44] running black --- nmdc_schema/fix_emp_500.py | 3 - nmdc_schema/neon_fail_records.py | 192 +++++++++--------- .../repair_metagenome_sequencing_set.py | 24 +-- 3 files changed, 106 insertions(+), 113 deletions(-) delete mode 100644 nmdc_schema/fix_emp_500.py diff --git a/nmdc_schema/fix_emp_500.py b/nmdc_schema/fix_emp_500.py deleted file mode 100644 index 70fe6181cb..0000000000 --- a/nmdc_schema/fix_emp_500.py +++ /dev/null @@ -1,3 +0,0 @@ - - -def diff --git a/nmdc_schema/neon_fail_records.py b/nmdc_schema/neon_fail_records.py index 2ebd2beba3..81fe2eb20f 100644 --- a/nmdc_schema/neon_fail_records.py +++ b/nmdc_schema/neon_fail_records.py @@ -1,5 +1,6 @@ import json -#from bson import json_util, ObjectId + +# from bson import json_util, ObjectId import pymongo from pymongo import MongoClient from pymongo.errors import ConnectionFailure @@ -8,126 +9,121 @@ client = MongoClient(prod_mongo) mydb = client["nmdc"] -#for db in client.list_database_names(): +# for db in client.list_database_names(): # print(db) -#collection names -asm_coll= mydb["metagenome_assembly_set"] -qc_coll=mydb["read_qc_analysis_activity_set"] -rbt_coll=mydb["read_based_taxonomy_analysis_activity_set"] -ann_coll=mydb["metagenome_annotation_activity_set"] -mags_coll=mydb["mags_activity_set"] -omics_coll=mydb["omics_processing_set"] -ann_agg_coll=mydb["functional_annotation_agg"] +# collection names +asm_coll = mydb["metagenome_assembly_set"] +qc_coll = mydb["read_qc_analysis_activity_set"] +rbt_coll = mydb["read_based_taxonomy_analysis_activity_set"] +ann_coll = mydb["metagenome_annotation_activity_set"] +mags_coll = mydb["mags_activity_set"] +omics_coll = mydb["omics_processing_set"] +ann_agg_coll = mydb["functional_annotation_agg"] data_object_coll = mydb["data_object_set"] -#asm_cursor=asm_coll.find({"was_informed_by":"nmdc:omprc-11-c82tqn53"}) -#print(len(list(asm_cursor))) - -del_asm_coll=[] -del_qc_coll=[] -del_rbt_coll=[] -del_ann_coll=[] -del_mags_coll=[] -del_do=[] -del_omics=[] -collections_dict={asm_coll:del_asm_coll, - qc_coll:del_qc_coll, - rbt_coll:del_rbt_coll, - ann_coll:del_ann_coll, - mags_coll:del_mags_coll +# asm_cursor=asm_coll.find({"was_informed_by":"nmdc:omprc-11-c82tqn53"}) +# print(len(list(asm_cursor))) + +del_asm_coll = [] +del_qc_coll = [] +del_rbt_coll = [] +del_ann_coll = [] +del_mags_coll = [] +del_do = [] +del_omics = [] +collections_dict = { + asm_coll: del_asm_coll, + qc_coll: del_qc_coll, + rbt_coll: del_rbt_coll, + ann_coll: del_ann_coll, + mags_coll: del_mags_coll, } -del_dict={"metagenome_assembly_set":del_asm_coll, - "read_qc_analysis_activity_set":del_qc_coll, - "read_based_taxonomy_analysis_activity_set":del_rbt_coll, - "metagenome_annotation_activity_set": del_ann_coll, - "mags_activity_set":del_mags_coll, - "data_object_set":del_do, - "omics_processing_set":del_omics +del_dict = { + "metagenome_assembly_set": del_asm_coll, + "read_qc_analysis_activity_set": del_qc_coll, + "read_based_taxonomy_analysis_activity_set": del_rbt_coll, + "metagenome_annotation_activity_set": del_ann_coll, + "mags_activity_set": del_mags_coll, + 
"data_object_set": del_do, + "omics_processing_set": del_omics, } neon_omics = open("/Users/aclum/Downloads/neon_omics_to_fail.txt", "r") -img_list=[] -omics_list=[] -#get workflow activity ID and data object IDs to delete +img_list = [] +omics_list = [] +# get workflow activity ID and data object IDs to delete for omics in neon_omics: - omics_list.append(omics.strip()) - for coll in collections_dict: - query=({"was_informed_by":omics.strip()}) - wxa_record=coll.find(query) - for doc in wxa_record: - collections_dict[coll].append(doc["id"]) - img_info=[doc["id"],doc["was_informed_by"],doc["type"]] - list = ",".join(img_info) - img_list.append(list) - for output in doc["has_output"]: - del_do.append(output) - -with open ("neon_delete_img.csv", 'w') as f: - for line in img_list: - f.write(f"{line}\n") - -#for coll in collections_dict: + omics_list.append(omics.strip()) + for coll in collections_dict: + query = {"was_informed_by": omics.strip()} + wxa_record = coll.find(query) + for doc in wxa_record: + collections_dict[coll].append(doc["id"]) + img_info = [doc["id"], doc["was_informed_by"], doc["type"]] + list = ",".join(img_info) + img_list.append(list) + for output in doc["has_output"]: + del_do.append(output) + +with open("neon_delete_img.csv", "w") as f: + for line in img_list: + f.write(f"{line}\n") + +# for coll in collections_dict: # print(coll) # print(collections_dict[coll]) # for omics in omics_list: - query=({"id":omics}) - omics_doc=omics_coll.find_one(query) - del_omics.append(omics_doc["id"]) - for output in omics_doc["has_output"]: - del_do.append(output) - print(data_object_coll.find_one({"id":output})) - -pipeline= [ - { "$match": { "id": omics } }, + query = {"id": omics} + omics_doc = omics_coll.find_one(query) + del_omics.append(omics_doc["id"]) + for output in omics_doc["has_output"]: + del_do.append(output) + print(data_object_coll.find_one({"id": output})) + +pipeline = [ + {"$match": {"id": omics}}, { - "$lookup": { - "from": "data_object_set", - "localField": "has_output", - "foreignField": "id", - "as": "seq_files" - } + "$lookup": { + "from": "data_object_set", + "localField": "has_output", + "foreignField": "id", + "as": "seq_files", + } }, - { - "$project": { - "name": 1, - "id": 1, - "seq_files.id": 1, - "seq_files.url": 1 - } - } - ] - -omics_full_record=[] + {"$project": {"name": 1, "id": 1, "seq_files.id": 1, "seq_files.url": 1}}, +] + +omics_full_record = [] for record in omics: - omics_agg_doc=omics_coll.aggregate(pipeline) - for doc in omics_agg_doc: - print(doc) - omics_full_record.append(doc) + omics_agg_doc = omics_coll.aggregate(pipeline) + for doc in omics_agg_doc: + print(doc) + omics_full_record.append(doc) -#json_object=json_util.dumps(omics_full_record) -#with open ("20240523_neon_soil_bad_pairs.json", 'w') as f: +# json_object=json_util.dumps(omics_full_record) +# with open ("20240523_neon_soil_bad_pairs.json", 'w') as f: # f.write(json_object) -for del_coll,del_values in del_dict.items(): - request_body_file=del_coll + "request_body.json" - request_deletes=[] - for value in del_values: - request_deletes.append({"q": {"id": value}, "limit":1}) - request_body_json=({"delete": del_coll,"deletes": request_deletes}) - with open (request_body_file, 'w') as f: - json.dump(request_body_json,f) +for del_coll, del_values in del_dict.items(): + request_body_file = del_coll + "request_body.json" + request_deletes = [] + for value in del_values: + request_deletes.append({"q": {"id": value}, "limit": 1}) + request_body_json = {"delete": del_coll, "deletes": 
request_deletes} + with open(request_body_file, "w") as f: + json.dump(request_body_json, f) -request_agg_del=[] +request_agg_del = [] for ann in del_ann_coll: - request_agg_del.append({"q": {"metagenome_annotation_id": ann}, "limit":0}) -request_body_json=({"delete": "functional_annotation_agg","deletes": request_agg_del}) -with open ("agg_delete_request.json",'w') as f: - json.dump(request_body_json,f) + request_agg_del.append({"q": {"metagenome_annotation_id": ann}, "limit": 0}) +request_body_json = {"delete": "functional_annotation_agg", "deletes": request_agg_del} +with open("agg_delete_request.json", "w") as f: + json.dump(request_body_json, f) -neon_omics.close() +neon_omics.close() diff --git a/nmdc_schema/repair_metagenome_sequencing_set.py b/nmdc_schema/repair_metagenome_sequencing_set.py index 236c8a007a..0e7ce40b7a 100644 --- a/nmdc_schema/repair_metagenome_sequencing_set.py +++ b/nmdc_schema/repair_metagenome_sequencing_set.py @@ -1,18 +1,18 @@ import json -update_list=[] -file= open ('/Users/aclum/Downloads/nmdc.metagenome_sequencing_activity_set.csv','r') +update_list = [] -for i in file.readlines(): - i=i.strip() - value_array=i.split(',') - update_list.append({"q": {"id": value_array[1] }, "u": {"$set": {"has_input": [value_array[2]]}}}) +file = open("/Users/aclum/Downloads/nmdc.metagenome_sequencing_activity_set.csv", "r") -#print(update_list) -json_body=({ - "update": "metagenome_sequencing_activity_set", - "updates": update_list}) +for i in file.readlines(): + i = i.strip() + value_array = i.split(",") + update_list.append( + {"q": {"id": value_array[1]}, "u": {"$set": {"has_input": [value_array[2]]}}} + ) -with open ("mg_seq_repair_inputs.json",'w') as f: - json.dump(json_body,f) +# print(update_list) +json_body = {"update": "metagenome_sequencing_activity_set", "updates": update_list} +with open("mg_seq_repair_inputs.json", "w") as f: + json.dump(json_body, f) From 236a299f9b004d4a0c34beda8d3f17f6d4ef4938 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:05:19 -0700 Subject: [PATCH 35/44] removing some comments --- nmdc_schema/connect_napa_mongo.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/connect_napa_mongo.py index 4b617948d7..f37813cdbc 100644 --- a/nmdc_schema/connect_napa_mongo.py +++ b/nmdc_schema/connect_napa_mongo.py @@ -18,8 +18,6 @@ load_dotenv(envfile_path) -# nersc ssh tunnel required to connect to mongo -# ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov napa_mongo_pw = os.environ["MONGO_NAPA_PW"] # print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) From ebb58d2aab21b80ee6c40c1ccc24f59ed0797ba5 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:10:23 -0700 Subject: [PATCH 36/44] removing napa_study_biosample_omics_migration.py since nmdc_automation code was used instead of this protoype code --- .../napa_study_biosample_omics_migration.py | 259 ------------------ 1 file changed, 259 deletions(-) delete mode 100644 nmdc_schema/napa_study_biosample_omics_migration.py diff --git a/nmdc_schema/napa_study_biosample_omics_migration.py b/nmdc_schema/napa_study_biosample_omics_migration.py deleted file mode 100644 index 20a8da7c42..0000000000 --- a/nmdc_schema/napa_study_biosample_omics_migration.py +++ /dev/null @@ -1,259 +0,0 @@ -import json -import os -from pprint import pprint -import secrets -import time - -import requests - -import pymongo -from pymongo import MongoClient - -# 
connect to napa mongo -napa_mongo_pw = os.environ["MONGO_NAPA_PW"] -napa_mongo = ( - "mongodb://root:" - + napa_mongo_pw - + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin" -) -client = MongoClient(napa_mongo) - -# define variables for tables to update, assumes a mongo connection variable 'client' -# set database name -mydb = client["nmdc"] -sty_coll = mydb["study_set"] -bsm_coll = mydb["biosample_set"] - -######################### -# generalized function to update study identifiers to napa format -# alt slots are already populated in all cases so logic for that is not needed - -# mint Class Study IDs using runtime API or manually using the minter endpoint -# if reading minted IDs from a json file -sty_napa_json = "XXXXXXXXXX" -with open(sty_napa_json, "r") as j: - sty_napa_ids = json.loads(j.read()) - -# update_studies_to_napa_standards - - -def update_studies_to_napa_standards(): - study_reid_log = open("napa_sty_update.txt", "w") - napa_sty_counter = 0 - get_legacy_sty = {"id": {"$regex": "^gold"}} - for sty_doc in sty_coll.find(get_legacy_studies): - select_legacy_sty = {"id": sty_doc["id"]} - sty_target_update = {"$set": {"id": napa_sty_ids[napa_sty_count]}} - if napa_sty_ids[napa_sty_count].startswith("nmdc:sty"): - # sty_coll.update_one(select_legacy_sty,sty_target_update) - sty_class_legacy_napa = ( - "Study " + sty_doc["id"] + " " + napa_sty_ids[napa_study_count] - ) - print(sty_class_legacy_napa) - sty_reid_log.write(napa_sty_update.txt) - napa_sty_counte)r = napa_sty_counter + 1 - else: - print("Did not update issue updating ", sty_doc["id"]) - - -######################### -##function to update biosamples - -# mint Class Study IDs using runtime API or manually using the minter endpoint -# if reading minted IDs from a json file - - -def update_bsm_by_study(napa_sty_id): - bsm_counter = 0 - bsm_alt_id_dict = { - "gold_biosample_identifiers": "gold:", - "igsn_biosample_identifiers": "igsn:", - "emsl_biosample_identifiers": "emsl:" - } - legacy_sty = napa_sty_to_legacy(napa_sty_id) - bsm_reid_log = open(legacy_sty + "_bsm_reid.txt", "w") - with open(legacy_sty + "_bsm_napa.json", "r") as j: - bsm_napa_ids = json.loads(j.read()) - legacy_bsm = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:bsm"}} - for bsm_doc in bsm_coll.find(legacy_bsm): - bsm_target_update = "" - #print(bsm_doc["id"]) - # set value for part_of - sty_napa_list = [] - sty_napa_list.append(napa_sty_id) - target_bsm = {"id": bsm_doc["id"]} - # alt id check function - alt_id = [] - alt_id_slot_name = "" - for alt_id_slot in bsm_alt_id_dict: - #print(bsm_alt_id_dict[alt_id_slot]) - if bsm_doc["id"].startswith(bsm_alt_id_dict[alt_id_slot]): - print(bsm_doc["id"] + "starts with"+ bsm_alt_id_dict[alt_id_slot]) - alt_id_slot_name = alt_id_slot - if alt_id_slot_name in bsm_doc.keys(): - if len(bsm_doc[alt_id_slot_name]) == 0: - update_alt = True - alt_id.append(bsm_doc["id"]) - print("will update alt id slot is empty" + alt_id_slot_name) - bsm_target_update = { - "$set": { - "id": bsm_napa_ids[bsm_counter], - "part_of": sty_napa_list, - alt_id_slot_name: alt_id, - } - } - elif ( - len(bsm_doc[alt_id_slot_name]) == 1 - and bsm_doc[alt_id_slot_name][0] == bsm_doc["id"] - ): - print(alt_id_slot + " already set for " + bsm_doc["id"]) - bsm_target_update = { - "$set": {"id": bsm_napa_ids[bsm_counter], "part_of": sty_napa_list} - } - else: - print( - "length of array for " - + alt_id_slot - + "exists and is greater than 1" - ) - bsm_target_update = { - "$set": {"id": bsm_napa_ids[bsm_counter], 
"part_of": sty_napa_list} - } - else: - alt_id.append(bsm_doc["id"]) - print("will update alt id b/c could not find alt id") - bsm_target_update = { - "$set": { - "id": bsm_napa_ids[bsm_counter], - "part_of": sty_napa_list, - alt_id_slot_name: alt_id, - } - } - #else: - # print(bsm_doc["id"] + "does not start with prefix"+ bsm_alt_id_dict[alt_id_slot]) - #else: - # print("not sure how to make the biosample update for" + bsm_doc["id"]) - bsm_class_legacy_napa = ( - "Biosample " + bsm_doc["id"] + " " + bsm_napa_ids[bsm_counter] - ) - print(bsm_class_legacy_napa) - print(target_bsm) - print(bsm_target_update) - # perform biosample update - bsm_coll.update_one(target_bsm, bsm_target_update) - bsm_reid_log.write(bsm_class_legacy_napa + "\n") - bsm_counter = bsm_counter + 1 - bsm_reid_log.close() - - -################ - -# function to get legacy study id from alt id slot -def napa_sty_to_legacy(napa_sty_id): - legacy_sty = "" - get_sty_record = {"id": napa_sty_id} - target_sty = sty_coll.find_one(get_sty_record) - if len(target_sty["gold_study_identifiers"]) == 1: - legacy_sty = target_sty["gold_study_identifiers"][0] - else: - print( - "More than one GOLD study as alt id", target_sty["gold_study_identifiers"] - ) - return legacy_sty - - -########################## -# function to update omics records -def update_omics_by_study(napa_sty_id): - omics_coll = mydb["omics_processing_set"] - omics_counter = 0 - omics_alt_id_dict = { - "gold_sequencing_project_identifiers": "gold:", - "alternative_identifiers": "emsl:", - } - legacy_sty = napa_sty_to_legacy(napa_sty_id) - #commented out only until we get SPRUCE fixed - legacy_omics = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:omprc"}} - # test only serach for NOM data so Yuri can test - #legacy_omics = {"part_of": legacy_sty, "id": {"$ne": "^nmdc:omprc"}, "omics_type.has_raw_value":"Organic Matter Characterization"} - f_omics_id_mapping = open(legacy_sty + "_omics_reid.txt", "w") - with open(legacy_sty + "_omics_napa.json", "r") as j: - omics_napa_ids = json.loads(j.read()) - for omics_doc in omics_coll.find(legacy_omics): - # set list with value of napa study for part_of - study_napa_list = [] - study_napa_list.append(napa_sty_id) - # determine what has_input should be - if isinstance(omics_doc["has_input"], list): - napa_biosample_inputs = [] - for biosample in omics_doc["has_input"]: - biosample = biosample.replace("GOLD", "gold") - target_has_input = { - "$or": [ - {"emsl_biosample_identifiers": biosample}, - {"gold_biosample_identifiers": biosample}, - {"igsn_biosample_identifiers": biosample}, - ] - } - get_biosample = bsm_coll.find_one(target_has_input) - print(omics_doc) - print(get_biosample["id"]) - napa_biosample_inputs.append(get_biosample["id"]) - # set id and alternative ids - target_omics = {"id": omics_doc["id"]} - # deal with gold omics identifiers, for all 485 legacy records all already list gold projects in the gold_sequencing_project_identifiers slot - alt_omics_id = [] - for alt_omics_id_slot in omics_alt_id_dict: - if omics_doc["id"].startswith(omics_alt_id_dict[alt_omics_id_slot]): - if alt_omics_id_slot in omics_doc.keys(): - if len(omics_doc[alt_omics_id_slot]) == 0: - update_alt_omics = True - alt_omics_id.append(omics_doc["id"]) - target_alt_omics_slot = alt_omics_id_slot - print("will update alt id slot is empty" + alt_id_slot_name) - elif ( - len(omics_doc[alt_omics_id_slot]) == 1 - and omics_doc[alt_omics_id_slot][0] == omics_doc["id"] - ): - print(alt_omics_id_slot + " already set for " + omics_doc["id"]) - 
update_alt_omics = False - else: - print( - "length of array for " - + alt_omics_id_slot - + "exists and is greater than 1" - ) - update_alt_omics = False - else: - update_alt_omics = True - alt_omics_id.append(omics_doc["id"]) - target_alt_omics_slot = alt_omics_id_slot - print("will update alt id b/c could not find alt id") - # set target update depending on if alt slot exists already or not - if update_alt_omics is True: - target_omics_update = { - "$set": { - "id": omics_napa_ids[omics_counter], - "part_of": study_napa_list, - "has_input": napa_biosample_inputs, - target_alt_omics_slot: alt_omics_id, - } - } - if update_alt_omics is False: - target_omics_update = { - "$set": { - "id": omics_napa_ids[omics_counter], - "part_of": study_napa_list, - "has_input": napa_biosample_inputs, - } - } - print(target_omics_update) - omics_coll.update_one(target_omics, target_omics_update) - class_legacy_napa = ( - "OmicsProcessing " + omics_doc["id"] + " " + omics_napa_ids[omics_counter] - ) - # print(class_legacy_napa) - # print(target_update) - f_omics_id_mapping.write(class_legacy_napa + "\n") - omics_counter = omics_counter + 1 - f_omics_id_mapping.close() From 9e0b8b273f6619907058c4cdcee35d4383476d3d Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:12:27 -0700 Subject: [PATCH 37/44] updating script title --- nmdc_schema/{connect_napa_mongo.py => misc_reid_code.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename nmdc_schema/{connect_napa_mongo.py => misc_reid_code.py} (100%) diff --git a/nmdc_schema/connect_napa_mongo.py b/nmdc_schema/misc_reid_code.py similarity index 100% rename from nmdc_schema/connect_napa_mongo.py rename to nmdc_schema/misc_reid_code.py From 5a87627c5ad66abac253f81d34b441042ea8f580 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:13:56 -0700 Subject: [PATCH 38/44] removing metab_id_refactor.py, code in nmdc_automation was used instead --- nmdc_schema/metab_id_refactor.py | 233 ------------------------------- 1 file changed, 233 deletions(-) delete mode 100644 nmdc_schema/metab_id_refactor.py diff --git a/nmdc_schema/metab_id_refactor.py b/nmdc_schema/metab_id_refactor.py deleted file mode 100644 index 0cc8c2ff6c..0000000000 --- a/nmdc_schema/metab_id_refactor.py +++ /dev/null @@ -1,233 +0,0 @@ -from dataclasses import dataclass, field, asdict -import hashlib -from pathlib import Path -import os -from pprint import pprint -from typing import List -from json import dumps - -from dotenv import load_dotenv -import pymongo -from pymongo import MongoClient -from pymongo.errors import ConnectionFailure -import oauthlib -import requests_oauthlib - -from linkml_runtime.dumpers import json_dumper -import yaml -import nmdc_schema.nmdc as nmdc - - -envfile_path = "../../.env.client" - -load_dotenv(envfile_path) -# nersc ssh tunnel required to connect to mongo -# ssh -L 37020:mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017 -o ServerAliveInterval=60 {YOUR_NERSC_USERNAME}@dtn01.nersc.gov - -napa_mongo_pw = os.environ.get("MONGO_NAPA_PW") or "safeguard-wipe-scanner-78" -# print("napa_mongo_pw:", os.environ['MONGO_NAPA_PW']) -print(napa_mongo_pw) -napa_mongo = ( - "mongodb://root:" - + napa_mongo_pw - + "@mongo-loadbalancer.nmdc-napa.production.svc.spin.nersc.org:27017/?authSource=admin" -) -# connection = MongoClient() -# db = connection.napa_mongo -print(napa_mongo) - -# connect to mongo -client = MongoClient(napa_mongo) - -# set mongo database name to nmdc' -mydb = client["nmdc"] - -# list database names -# for db in 
client.list_database_names(): -# print(db) - -# list collections -# for coll in mydb.list_collection_names(): -# print(coll) - -# omicsProcessing update, has_output --> raw data -# omicsProcessing update, alternative_identifier --> nom_analysis_activity.was_informed_by - - -# nom_analysis_activity --> has_input (new raw file or update ID) -# nom_analysis_activity --> has_output (data product file, update ID) -# nom_analysis_activity --> replace ids -# nom_analysis_activity --> was_informed_by -- id from alternative indetifier omics Processing -# dataObject --> replace id, and add alternative identifier, emsl:60592345 -@dataclass -class NMDC_Mint: - schema_class: dict = field( - default_factory=lambda: { - "schema": None, - } - ) - how_many: int = 1 - - @property - def __dict__(self): - return asdict(self) - - @property - def json(self): - return dumps(self.__dict__) - - -@dataclass -class DataObject: - nom_raw_data_object_type: str = "Direct Infusion FT ICR-MS Raw Data" - nom_raw_data_object_description: str = "Raw 21T Direct Infusion Data" - nom_dp_data_object_type: str = "FT ICR-MS Analysis Results" - nom_dp_data_object_description: str = "EnviroMS FT ICR-MS natural organic matter workflow molecular formula assignment output details" - - -@dataclass -class NMDC_Types: - BioSample: str = "nmdc:Biosample" - OmicsProcessing: str = "nmdc:OmicsProcessing" - NomAnalysisActivity: str = "nmdc:NomAnalysisActivity" - DataObject: str = "nmdc:DataObject" - - -def update_data_products( - nom_activities_doc, - new_raw_file_id: str, - new_data_product_id: str, - omics_prcessing_id: str, - raw_file_path: Path = None, -): - raw_file_id = nom_activities_doc.has_input[0] - - dataproduct_id = nom_activities_doc.has_input[0] - - data_object_set = mydb["data_object_set"] - - get_raw_file_data_object = {"id": raw_file_id} - get_data_product_data_object = {"id": dataproduct_id} - - raw_object_docs = [ - raw_objectdata_doc - for raw_objectdata_doc in data_object_set.find(get_raw_file_data_object) - ] - - if raw_object_docs: - raw_object_update = { - "$set": { - "id": new_raw_file_id, - "alternative_identifier": [omics_prcessing_id], - } - } - - data_object_set.update_one(raw_object_docs[0], raw_object_update) - - else: - new_raw_data_object = get_raw_data_object( - raw_file_path, - was_generated_by=omics_prcessing_id, - data_object_type=DataObject.nom_raw_data_object_type, - description=DataObject.nom_raw_data_object_description, - ) - - data_object_set.insert_one(new_raw_data_object) - - for data_product_objectdata_doc in data_object_set.find( - get_data_product_data_object - ): - data_product_object_update = {"$set": {"id": new_data_product_id}} - - data_object_set.update_one( - data_product_objectdata_doc, data_product_object_update - ) - - -def update_omics_processing( - nom_new_id, new_data_product_id, new_raw_file_id, raw_file_path=None -): - omics_processing_set = mydb["omics_processing_set"] - - nom_activities_set = mydb["nom_analysis_activity_set"] - - get_old_activities = {"id": {"$regex": "^emsl"}} - - for nom_activities_doc in nom_activities_set.find(get_old_activities): - get_parent_omics_processing = {"has_output": nom_activities_doc["has_input"]} - - """always going to be one omics processing""" - for omics_processing_doc in omics_processing_set.find( - get_parent_omics_processing - ): - omics_processing_update = {"$set": {"has_output": [new_raw_file_id]}} - - omics_processing_set.update_one( - omics_processing_doc, omics_processing_update - ) - - new_omics_processing_id = 
omics_processing_doc["id"] - - update_data_products( - nom_activities_doc, - new_data_product_id, - new_data_product_id, - new_omics_processing_id, - raw_file_path, - ) - - nom_activity_update = { - "$set": { - "id": nom_new_id, - "has_output": [new_data_product_id], - "has_input": [new_raw_file_id], - "was_informed_by": [new_omics_processing_id], - } - } - - nom_activities_set.update_one(nom_activities_doc, nom_activity_update) - - -def mint_nmdc_id(type: NMDC_Types, how_many: int = 1) -> List[str]: - config = yaml.safe_load(open("./config.yaml", "r")) - client = oauthlib.oauth2.BackendApplicationClient(client_id=config["client_id"]) - oauth = requests_oauthlib.OAuth2Session(client=client) - - token = oauth.fetch_token( - token_url="https://api.microbiomedata.org/token", - client_id=config["client_id"], - client_secret=config["client_secret"], - ) - - nmdc_mint_url = "https://api.microbiomedata.org/pids/mint" - - payload = NMDC_Mint(type, how_many) - - # response = s.post(nmdc_mint_url, data=payload.json, ) - # list_ids = response.json() - print(payload.json) - response = oauth.post(nmdc_mint_url, data=payload.json) - list_ids = response.json() - print(list_ids) - return list_ids - - -def get_raw_data_object( - file_path: Path, was_generated_by: str, data_object_type: str, description: str -) -> nmdc.DataObject: - nmdc_id = mint_nmdc_id({"id": NMDC_Types.DataObject})[0] - - data_dict = { - "id": nmdc_id, - "name": file_path.name, - "file_size_bytes": file_path.stat().st_size, - "md5_checksum": hashlib.md5(file_path.open("rb").read()).hexdigest(), - "was_generated_by": was_generated_by, # omics processing id - "data_object_type": data_object_type, - "description": description, - "type": "nmdc:DataObject", - } - - data_object = nmdc.DataObject(**data_dict) - - return data_object From ebf819c7ed70b17ee7af8e8edae66134e2c5440b Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:15:50 -0700 Subject: [PATCH 39/44] removing ad hoc files --- nmdc_schema/neon_fail_records.py | 129 ------------------------------- 1 file changed, 129 deletions(-) delete mode 100644 nmdc_schema/neon_fail_records.py diff --git a/nmdc_schema/neon_fail_records.py b/nmdc_schema/neon_fail_records.py deleted file mode 100644 index 81fe2eb20f..0000000000 --- a/nmdc_schema/neon_fail_records.py +++ /dev/null @@ -1,129 +0,0 @@ -import json - -# from bson import json_util, ObjectId -import pymongo -from pymongo import MongoClient -from pymongo.errors import ConnectionFailure - -prod_mongo = "mongodb://aclum:wqNj7hWW*oWmzQ2FsL%40f@localhost:37019/?authMechanism=SCRAM-SHA-256&authSource=admin&directConnection=true" -client = MongoClient(prod_mongo) -mydb = client["nmdc"] - -# for db in client.list_database_names(): -# print(db) -# collection names -asm_coll = mydb["metagenome_assembly_set"] -qc_coll = mydb["read_qc_analysis_activity_set"] -rbt_coll = mydb["read_based_taxonomy_analysis_activity_set"] -ann_coll = mydb["metagenome_annotation_activity_set"] -mags_coll = mydb["mags_activity_set"] -omics_coll = mydb["omics_processing_set"] -ann_agg_coll = mydb["functional_annotation_agg"] -data_object_coll = mydb["data_object_set"] - -# asm_cursor=asm_coll.find({"was_informed_by":"nmdc:omprc-11-c82tqn53"}) -# print(len(list(asm_cursor))) - -del_asm_coll = [] -del_qc_coll = [] -del_rbt_coll = [] -del_ann_coll = [] -del_mags_coll = [] -del_do = [] -del_omics = [] -collections_dict = { - asm_coll: del_asm_coll, - qc_coll: del_qc_coll, - rbt_coll: del_rbt_coll, - ann_coll: del_ann_coll, - mags_coll: del_mags_coll, -} - -del_dict 
= { - "metagenome_assembly_set": del_asm_coll, - "read_qc_analysis_activity_set": del_qc_coll, - "read_based_taxonomy_analysis_activity_set": del_rbt_coll, - "metagenome_annotation_activity_set": del_ann_coll, - "mags_activity_set": del_mags_coll, - "data_object_set": del_do, - "omics_processing_set": del_omics, -} - -neon_omics = open("/Users/aclum/Downloads/neon_omics_to_fail.txt", "r") - - -img_list = [] -omics_list = [] -# get workflow activity ID and data object IDs to delete -for omics in neon_omics: - omics_list.append(omics.strip()) - for coll in collections_dict: - query = {"was_informed_by": omics.strip()} - wxa_record = coll.find(query) - for doc in wxa_record: - collections_dict[coll].append(doc["id"]) - img_info = [doc["id"], doc["was_informed_by"], doc["type"]] - list = ",".join(img_info) - img_list.append(list) - for output in doc["has_output"]: - del_do.append(output) - -with open("neon_delete_img.csv", "w") as f: - for line in img_list: - f.write(f"{line}\n") - -# for coll in collections_dict: -# print(coll) -# print(collections_dict[coll]) -# - - -for omics in omics_list: - query = {"id": omics} - omics_doc = omics_coll.find_one(query) - del_omics.append(omics_doc["id"]) - for output in omics_doc["has_output"]: - del_do.append(output) - print(data_object_coll.find_one({"id": output})) - -pipeline = [ - {"$match": {"id": omics}}, - { - "$lookup": { - "from": "data_object_set", - "localField": "has_output", - "foreignField": "id", - "as": "seq_files", - } - }, - {"$project": {"name": 1, "id": 1, "seq_files.id": 1, "seq_files.url": 1}}, -] - -omics_full_record = [] -for record in omics: - omics_agg_doc = omics_coll.aggregate(pipeline) - for doc in omics_agg_doc: - print(doc) - omics_full_record.append(doc) - -# json_object=json_util.dumps(omics_full_record) -# with open ("20240523_neon_soil_bad_pairs.json", 'w') as f: -# f.write(json_object) - -for del_coll, del_values in del_dict.items(): - request_body_file = del_coll + "request_body.json" - request_deletes = [] - for value in del_values: - request_deletes.append({"q": {"id": value}, "limit": 1}) - request_body_json = {"delete": del_coll, "deletes": request_deletes} - with open(request_body_file, "w") as f: - json.dump(request_body_json, f) - -request_agg_del = [] -for ann in del_ann_coll: - request_agg_del.append({"q": {"metagenome_annotation_id": ann}, "limit": 0}) -request_body_json = {"delete": "functional_annotation_agg", "deletes": request_agg_del} -with open("agg_delete_request.json", "w") as f: - json.dump(request_body_json, f) - -neon_omics.close() From 90711f197547dac768d1c89506ec26f7c8ee1937 Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:18:14 -0700 Subject: [PATCH 40/44] removing unused python scripts --- nmdc_schema/runtime_api_operations.py | 123 -------------------------- 1 file changed, 123 deletions(-) delete mode 100644 nmdc_schema/runtime_api_operations.py diff --git a/nmdc_schema/runtime_api_operations.py b/nmdc_schema/runtime_api_operations.py deleted file mode 100644 index 315b65475e..0000000000 --- a/nmdc_schema/runtime_api_operations.py +++ /dev/null @@ -1,123 +0,0 @@ -from datetime import datetime, timezone -import json -import os -from pprint import pprint -import secrets -import time - -from dotenv import load_dotenv -import requests - -# modified from nmdc-runtime how-to guide https://microbiomedata.github.io/nmdc-runtime/nb/queue_and_trigger_data_jobs/ - -# relative path to file with format -# ``` -# NMDC_RUNTIME_HOST=fixme -# NMDC_RUNTIME_USER=fixme -# NMDC_RUNTIME_PASS=fixme 
-# NMDC_RUNTIME_SITE_ID=fixme # Okay if you don't have yet -# NMDC_RUNTIME_SITE_CLIENT_ID=fixme # Okay if you don't have yet -# NMDC_RUNTIME_SITE_CLIENT_SECRET=fixme # Okay if you don't have yet -# ``` -envfile_path = "../../.env.client" - -load_dotenv(envfile_path) - -ENV = {k: v for k, v in os.environ.items() if k.startswith("NMDC_RUNTIME_")} - -assert ENV["NMDC_RUNTIME_HOST"] == "https://api.microbiomedata.org" - -HOST = ENV["NMDC_RUNTIME_HOST"] - - -def request_and_return_json(method, path, host=HOST, **kwargs): - r = requests.request(method, host + path, **kwargs) - r.raise_for_status() - return r.json() - - -def get_json(path, host=HOST, **kwargs): - return request_and_return_json("GET", path, host=host, **kwargs) - - -def post_and_return_json(path, host=HOST, **kwargs): - return request_and_return_json("POST", path, host=host, **kwargs) - - -def patch_and_return_json(path, host=HOST, **kwargs): - return request_and_return_json("PATCH", path, host=host, **kwargs) - - -def put_and_return_json(path, host=HOST, **kwargs): - return request_and_return_json("PUT", path, host=host, **kwargs) - - -def auth_header(bearer_token): - return {"Authorization": f"Bearer {bearer_token}"} - - -def get_token_for_user(): - response = post_and_return_json( - "/token", - data={ - "grant_type": "password", - "username": ENV["NMDC_RUNTIME_USER"], - "password": ENV["NMDC_RUNTIME_PASS"], - }, - ) - expires_minutes = response["expires"]["minutes"] - print(f"Bearer token expires in {expires_minutes} minutes") - return response["access_token"] - - -def get_token_for_site_client(): - response = post_and_return_json( - "/token", - data={ - "grant_type": "client_credentials", - "client_id": ENV["NMDC_RUNTIME_SITE_CLIENT_ID"], - "client_secret": ENV["NMDC_RUNTIME_SITE_CLIENT_SECRET"], - }, - ) - expires_minutes = response["expires"]["minutes"] - print(f"Bearer token expires in {expires_minutes} minutes") - return response["access_token"] - - -def mint_ids(schema_class, how_many, formatted_token): - url = HOST + "/pids/mint" - data = {"schema_class": {"id": schema_class}, "how_many": how_many} - headers = formatted_token - # print(headers) - response = requests.post(url, headers=headers, json=data) - print("JSON Response ", response.json()) - - minted_ids = response.json() - return minted_ids - # print(minted_ids) - - -# def mint_ids(schema_class,how_many,TOKEN_C): -# response = post_and_return_json( -# "/pids/mint", -# data={ -# "schema_class": {"id": schema_class}, -# "how_many": how_many -# } -# headers = TOKEN_C -# return response -# ) - - -def now(as_str=False): - dt = datetime.now(timezone.utc) - return dt.isoformat() if as_str else dt - - -TOKEN_C = get_token_for_site_client() - - -print(TOKEN_C) -formatted_token = auth_header(TOKEN_C) -napa_ids = mint_ids("nmdc:Study", 2, formatted_token) -print(napa_ids) From 0b72e6041c3a49a47ef9b2fdaf89c7def9f9efad Mon Sep 17 00:00:00 2001 From: aclum Date: Thu, 11 Jul 2024 16:18:47 -0700 Subject: [PATCH 41/44] removing more ad hoc code --- .../repair_metagenome_sequencing_set.py | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 nmdc_schema/repair_metagenome_sequencing_set.py diff --git a/nmdc_schema/repair_metagenome_sequencing_set.py b/nmdc_schema/repair_metagenome_sequencing_set.py deleted file mode 100644 index 0e7ce40b7a..0000000000 --- a/nmdc_schema/repair_metagenome_sequencing_set.py +++ /dev/null @@ -1,18 +0,0 @@ -import json - -update_list = [] - -file = open("/Users/aclum/Downloads/nmdc.metagenome_sequencing_activity_set.csv", "r") - -for 
i in file.readlines(): - i = i.strip() - value_array = i.split(",") - update_list.append( - {"q": {"id": value_array[1]}, "u": {"$set": {"has_input": [value_array[2]]}}} - ) - -# print(update_list) -json_body = {"update": "metagenome_sequencing_activity_set", "updates": update_list} - -with open("mg_seq_repair_inputs.json", "w") as f: - json.dump(json_body, f) From 59e5e220a2bfe7b7c3a83aa6ac7271382d305eb3 Mon Sep 17 00:00:00 2001 From: "Mark A. Miller" Date: Tue, 16 Jul 2024 17:14:40 -0400 Subject: [PATCH 42/44] napa compliance code isolation and documentation --- .../{ => completed_napa_compliance}/insert_many_pymongo.py | 0 .../{ => completed_napa_compliance}/metap_records_delete.py | 0 .../{ => completed_napa_compliance}/misc_reid_code.py | 0 .../napa_compliance.README.md | 6 ++++++ 4 files changed, 6 insertions(+) rename nmdc_schema/{ => completed_napa_compliance}/insert_many_pymongo.py (100%) rename nmdc_schema/{ => completed_napa_compliance}/metap_records_delete.py (100%) rename nmdc_schema/{ => completed_napa_compliance}/misc_reid_code.py (100%) rename nmdc_schema/{ => completed_napa_compliance}/napa_compliance.README.md (64%) diff --git a/nmdc_schema/insert_many_pymongo.py b/nmdc_schema/completed_napa_compliance/insert_many_pymongo.py similarity index 100% rename from nmdc_schema/insert_many_pymongo.py rename to nmdc_schema/completed_napa_compliance/insert_many_pymongo.py diff --git a/nmdc_schema/metap_records_delete.py b/nmdc_schema/completed_napa_compliance/metap_records_delete.py similarity index 100% rename from nmdc_schema/metap_records_delete.py rename to nmdc_schema/completed_napa_compliance/metap_records_delete.py diff --git a/nmdc_schema/misc_reid_code.py b/nmdc_schema/completed_napa_compliance/misc_reid_code.py similarity index 100% rename from nmdc_schema/misc_reid_code.py rename to nmdc_schema/completed_napa_compliance/misc_reid_code.py diff --git a/nmdc_schema/napa_compliance.README.md b/nmdc_schema/completed_napa_compliance/napa_compliance.README.md similarity index 64% rename from nmdc_schema/napa_compliance.README.md rename to nmdc_schema/completed_napa_compliance/napa_compliance.README.md index f4e9ec7480..25bca25b80 100644 --- a/nmdc_schema/napa_compliance.README.md +++ b/nmdc_schema/completed_napa_compliance/napa_compliance.README.md @@ -1,5 +1,11 @@ # Napa Compliance +This code has been executed, the results were accepted, and it should not need to be run again. It is being included as +documentation. The Python files have been moved from `nmdc_schema/` to `nmdc_schema/completed_napa_compliance/` and they +shouldn't be expected to work from that location. + +One shouldn't assume that the installation notes below are intended for any other nmdc-schema development. + ## Installing Python packages ```shell From f7979c271fbf3dab4484000bea0e16db975668bd Mon Sep 17 00:00:00 2001 From: "Mark A. 
Miller" Date: Tue, 16 Jul 2024 17:36:14 -0400 Subject: [PATCH 43/44] generalized isolation --- .gitignore | 2 ++ .../scripts/task_specific_code/README.md | 9 +++++++-- .../scripts/task_specific_code}/insert_many_pymongo.py | 0 .../scripts/task_specific_code}/metap_records_delete.py | 0 .../scripts/task_specific_code}/misc_reid_code.py | 0 5 files changed, 9 insertions(+), 2 deletions(-) rename nmdc_schema/completed_napa_compliance/napa_compliance.README.md => src/scripts/task_specific_code/README.md (77%) rename {nmdc_schema/completed_napa_compliance => src/scripts/task_specific_code}/insert_many_pymongo.py (100%) rename {nmdc_schema/completed_napa_compliance => src/scripts/task_specific_code}/metap_records_delete.py (100%) rename {nmdc_schema/completed_napa_compliance => src/scripts/task_specific_code}/misc_reid_code.py (100%) diff --git a/.gitignore b/.gitignore index df97bfb564..0c617ae3b2 100644 --- a/.gitignore +++ b/.gitignore @@ -27,6 +27,8 @@ site/ tdbcontent tdbcontent/ +src/scripts/cachedir/ + # PyCharm project config .idea/ diff --git a/nmdc_schema/completed_napa_compliance/napa_compliance.README.md b/src/scripts/task_specific_code/README.md similarity index 77% rename from nmdc_schema/completed_napa_compliance/napa_compliance.README.md rename to src/scripts/task_specific_code/README.md index 25bca25b80..08d6f496d7 100644 --- a/nmdc_schema/completed_napa_compliance/napa_compliance.README.md +++ b/src/scripts/task_specific_code/README.md @@ -1,8 +1,13 @@ # Napa Compliance This code has been executed, the results were accepted, and it should not need to be run again. It is being included as -documentation. The Python files have been moved from `nmdc_schema/` to `nmdc_schema/completed_napa_compliance/` and they -shouldn't be expected to work from that location. +documentation. The Python files have been moved from `nmdc_schema/` to `src/scripts/task_specific_code/` and they +shouldn't be expected to work from that location. These scripts demonstrate many good design principles, but may not +meet all of nmdc-schema's current code quality standards. + +* insert_many_pymongo.py +* metap_records_delete.py +* misc_reid_code.py One shouldn't assume that the installation notes below are intended for any other nmdc-schema development. 
diff --git a/nmdc_schema/completed_napa_compliance/insert_many_pymongo.py b/src/scripts/task_specific_code/insert_many_pymongo.py similarity index 100% rename from nmdc_schema/completed_napa_compliance/insert_many_pymongo.py rename to src/scripts/task_specific_code/insert_many_pymongo.py diff --git a/nmdc_schema/completed_napa_compliance/metap_records_delete.py b/src/scripts/task_specific_code/metap_records_delete.py similarity index 100% rename from nmdc_schema/completed_napa_compliance/metap_records_delete.py rename to src/scripts/task_specific_code/metap_records_delete.py diff --git a/nmdc_schema/completed_napa_compliance/misc_reid_code.py b/src/scripts/task_specific_code/misc_reid_code.py similarity index 100% rename from nmdc_schema/completed_napa_compliance/misc_reid_code.py rename to src/scripts/task_specific_code/misc_reid_code.py From c42a0d6ce7d88b3bdb1a76f8689d45225a33ff89 Mon Sep 17 00:00:00 2001 From: aclum Date: Tue, 16 Jul 2024 15:14:06 -0700 Subject: [PATCH 44/44] Update README.md Updates paths in the README.md file for code in /src/scripts/task_specific_code --- src/scripts/task_specific_code/README.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/scripts/task_specific_code/README.md b/src/scripts/task_specific_code/README.md index 08d6f496d7..a367472919 100644 --- a/src/scripts/task_specific_code/README.md +++ b/src/scripts/task_specific_code/README.md @@ -35,8 +35,5 @@ python -m black /path/to/python/code.py For example: ```shell -python -m black nmdc_schema/connect_napa_mongo.py -python -m black nmdc_schema/metab_id_refactor.py -python -m black nmdc_schema/napa_study_biosample_omics_migration.py -python -m black nmdc_schema/runtime_api_operations.py +python -m black src/scripts/task_specific_code/metap_records_delete.py ```
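
Note on using the generated request bodies: several of the scripts above write files such as `./<collection_name>.deletion_api_request_body.json` (or `agg_delete_request.json`) that are intended for the Runtime `/queries:run` endpoint rather than direct pymongo deletes. The snippet below is a minimal sketch of submitting one of those files, reusing the `/token` flow and `NMDC_RUNTIME_*` environment variables from the (since removed) `runtime_api_operations.py`. The example file name, the exact `/queries:run` behavior, and the permissions required are assumptions to verify against the Runtime API documentation before running.

```python
import json
import os

import requests
from dotenv import load_dotenv

# Sketch only: assumes ../../.env.client defines NMDC_RUNTIME_HOST, NMDC_RUNTIME_USER,
# and NMDC_RUNTIME_PASS, and that a deletion request body file was generated earlier.
load_dotenv("../../.env.client")
HOST = os.environ["NMDC_RUNTIME_HOST"]  # e.g. https://api.microbiomedata.org

# Obtain a user bearer token (same flow as the removed runtime_api_operations.py).
token_response = requests.post(
    HOST + "/token",
    data={
        "grant_type": "password",
        "username": os.environ["NMDC_RUNTIME_USER"],
        "password": os.environ["NMDC_RUNTIME_PASS"],
    },
)
token_response.raise_for_status()
token = token_response.json()["access_token"]

# Load one of the generated request bodies (hypothetical example file name).
with open("./data_object_set.deletion_api_request_body.json") as f:
    request_body = json.load(f)

# Submit the prepared delete request to the Runtime /queries:run endpoint.
response = requests.post(
    HOST + "/queries:run",
    json=request_body,
    headers={"Authorization": f"Bearer {token}"},
)
response.raise_for_status()
print(response.json())
```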