-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmap2codelists.py
304 lines (271 loc) · 14.6 KB
/
map2codelists.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
import requests
import xlsxwriter as XLS
import os.path
import argparse
"""
map2codelists.py generates the codelist metadata based used to generate codelists in Define-XML v2.1. The codelists were
stripped from the SDTM mapping spreadsheet and added to this program. This program looks up the CDISC CT codelists in
the CDISC Library to generate the needed content including all the terms. There are some special cases that are also
addressed such as the domain abbreviation codelists and codelists subsets where a subset of the terms are used.
Example Cmd-line:
python map2codelists.py -a e9a7d1b9bf1a4036ae7b25533123456 -o ./path/to/codelists.xlsx
"""
# api key - put your own API key here
library_api_key = "e9a7d1b9bf1a4036ae7b25533a081565"
# identifying details of the CT package used for this study
package_date = "2021-12-17"
package_standard = "sdtmct"
# codelist OIDs created from c-codes referenced in the SDTM mapping spreadsheet
codelists = ["CL.C66731", "CL.C74457", "CL.C66790"]
# codelist subset definitions - codelist OID and c-codes and submission values for each term in the subset
codelist_subsets = [
{"oid": "CL.C66742", "terms": [{"c_code": "C49488", "sub_val": "Y"}]},
{"oid": "CL.C66728", "terms": [{"c_code": "C25629", "sub_val": "BEFORE"}, {"c_code": "C53279", "sub_val": "ONGOING"}]},
{"oid": "CL.C66781", "terms": [{"c_code": "C29848", "sub_val": "YEARS"}]},
{"oid": "CL.C71620", "terms": [{"c_code": "C44278", "sub_val": "U"}, {"c_code": "NA", "sub_val": "U/hr"},
{"c_code": "C25301", "sub_val": "DAYS"}, {"c_code": "C172604", "sub_val": "cup eq"},
{"c_code": "C161487", "sub_val": "DRINK"}, {"c_code": "C48155", "sub_val": "g"},
{"c_code": "C67194", "sub_val": "kcal"}, {"c_code": "C28253", "sub_val": "mg"},
{"c_code": "C172605", "sub_val": "oz eq"}, {"c_code": "C184720", "sub_val": "SERVING"},
{"c_code": "C25613", "sub_val": "%"}, {"c_code": "C67015", "sub_val": "mg/dL"},
{"c_code": "C170633", "sub_val": "days/wk"}, {"c_code": "C172603", "sub_val": "tsp eq"},
{"c_code": "C176381", "sub_val": "min/day"}]},
{"oid": "CL.C71113", "terms": [{"c_code": "C25473", "sub_val": "QD"}]},
{"oid": "CL.C74559", "terms": [{"c_code": "C17953", "sub_val": "EDULEVEL"}, {"c_code": "C154890", "sub_val": "INCMLVL"},
{"c_code": "NA", "sub_val": "HLTHINS"}]},
{"oid": "CL.C103330", "terms": [{"c_code": "C17953", "sub_val": "Level of Education Attained"},
{"c_code": "C154890", "sub_val": "Income Level"},
{"c_code": "NA", "sub_val": "Health Insurance"}]},
{"oid": "CL.C66741", "terms": [{"c_code": "C25347", "sub_val": "HEIGHT"}, {"c_code": "C25208", "sub_val": "WEIGHT"},
{"c_code": "C49677", "sub_val": "HR"}, {"c_code": "NA", "sub_val": "HRM"}]},
{"oid": "CL.C67153", "terms": [{"c_code": "C25347", "sub_val": "Height"}, {"c_code": "C25208", "sub_val": "Weight"},
{"c_code": "C49677", "sub_val": "Heart Rate"}, {"c_code": "NA", "sub_val": "Heart Rate, Mean"}]},
{"oid": "CL.C66770", "terms": [{"c_code": "C48500", "sub_val": "in"}, {"c_code": "C48531", "sub_val": "LB"},
{"c_code": "C49673", "sub_val": "beats/min"}]},
{"oid": "CL.C65047", "terms": [{"c_code": "C64849", "sub_val": "HBA1C"}, {"c_code": "C105585", "sub_val": "GLUC"}]},
{"oid": "CL.C67154", "terms": [{"c_code": "C64849", "sub_val": "Hemoglobin A1C"}, {"c_code": "C105585", "sub_val": "Glucose"}]},
{"oid": "CL.C67154", "terms": [{"c_code": "C64849", "sub_val": "Hemoglobin A1C"}, {"c_code": "C105585", "sub_val": "Glucose"}]}
]
# OIDs for codelist subsets for domain - each codelist includes the term for one domain
domain_codelists = ["CL.DOMAIN.VS", "CL.DOMAIN.SC", "CL.DOMAIN.QS", "CL.DOMAIN.RP", "CL.DOMAIN.PR", "CL.DOMAIN.ML",
"CL.DOMAIN.LB", "CL.DOMAIN.FA", "CL.DOMAIN.FACM", "CL.DOMAIN.FADX", "CL.DOMAIN.FAML_meal",
"CL.DOMAIN.FAML_daily", "CL.DOMAIN.FAML_item", "CL.DOMAIN.FAPR", "CL.DOMAIN.DX", "CL.DOMAIN.DM",
"CL.DOMAIN.DI", "CL.DOMAIN.NV", "CL.DOMAIN.CM", "CL.DOMAIN.RELREC", "CL.DOMAIN.SUPPDM"]
# codelist tab column headers that match the odmlib spreadsheet
header = ["OID", "Name", "NCI Codelist Code", "Data Type", "Order", "Term", "NCI Term Code", "Decoded Value",
"Comment", "IsNonStandard", "StandardOID"]
# the worksheets to skip when processing the SDTM mapping spreadsheet
worksheet_skip = ["T1Dexi Tables", "Domains", "Sheet1"]
# output spreadsheet with just the codelists tab that can be copied to the odmlib spreadsheet - assumes child data dir
excel_define_file = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data', 'codelists-test.xlsx')
def get_codelist_from_library(endpoint, api_key):
"""
retrieve a codelist from the CDISC Library using the provided endpoint
:param endpoint: endpoint string used to create the API call to the Library to retrieve a codelist
:param api_key: string CDISC Library API key
:return: json results from Library GET request for a specified codelist
"""
base_url = "https://library.cdisc.org/api"
headers = {"Accept": "application/json", "User-Agent": "crawler", "api-key": api_key}
r = requests.get(base_url + endpoint, headers=headers)
if r.status_code == 200:
return r.json()
else:
print(f"HTTPError {r.status_code} for url {base_url + endpoint}")
def process_library_codelist(cl_oid, cl):
"""
given a codelist retrieved from the Library, create the odmlib codelist worksheet rows
:param cl_oid: the OID for the codelist that is included on each row
:param cl: the codelist retrieved from the Library
:return: rows - a list of codelist term rows ready to add to the odmlib codelist worksheet
"""
rows = []
for order_num, term in enumerate(cl["terms"]):
row = {key: "" for key in header}
row["OID"] = cl_oid
row["Name"] = cl["name"]
row["NCI Codelist Code"] = cl["conceptId"]
row["Data Type"] = "text"
row["Order"] = order_num + 1
row["Term"] = term["submissionValue"]
row["NCI Term Code"] = term["conceptId"]
row["Decoded Value"] = term["preferredTerm"]
row["Comment"] = ""
row["IsNonStandard"] = ""
row["StandardOID"] = "STD.3"
rows.append(row)
return rows
def create_domain_codelist_subsets(worksheet, row_nbr, api_key):
""" create a codelist subset for each domain used in the study
:param worksheet: odmlib worksheet object to write to
:param row_nbr: row number to start appending codelists to the worksheet
:param api_key: string CDISC Library API key
:return: row_nbr: integer that indicates where to start appending new codelists
"""
cl_count = 0
endpoint = "/mdr/ct/packages/" + package_standard + "-" + package_date + "/codelists/C66734"
cl = get_codelist_from_library(endpoint, api_key)
rows = []
for order_nbr, cl_oid in enumerate(domain_codelists):
row = {key: "" for key in header}
row["OID"] = cl_oid
domain_oid = cl_oid.split(".")
row["Name"] = "Domain Abbreviation (" + domain_oid[2] + ")"
row["NCI Codelist Code"] = "C66734"
row["Data Type"] = "text"
row["Order"] = 1 # every domain codelist has 1 term
term = get_domain_term(cl, domain_oid[2])
row["Term"] = term["Term"]
row["NCI Term Code"] = term["NCI Term Code"]
row["Decoded Value"] = term["Decoded Value"]
row["Comment"] = term["Comment"]
row["IsNonStandard"] = term["IsNonStandard"]
row["StandardOID"] = term["StandardOID"]
rows.append(row)
cl_count += 1
row_nbr = write_codelist_to_xls(worksheet, rows, row_nbr)
print(f"added {cl_count} domain codelist subsets...")
return row_nbr
def get_domain_term(cl, domain):
"""
create the codelist term values for a given domain code
:param cl: the codelist for domains
:param domain: the 2 letter domain abbreviation to use to find the codelist term details to return
:return: term dictionary with the details of the domain codelist term
"""
term = {}
for domain_term in cl["terms"]:
if domain_term["submissionValue"] == domain:
term["Term"] = domain_term["submissionValue"]
term["NCI Term Code"] = domain_term["conceptId"]
term["Decoded Value"] = domain_term["preferredTerm"]
term["Comment"] = ""
term["IsNonStandard"] = ""
term["StandardOID"] = "STD.3"
return term
term["Term"] = domain
term["NCI Term Code"] = ""
term["Decoded Value"] = ""
term["Comment"] = ""
term["IsNonStandard"] = "Yes"
term["StandardOID"] = ""
return term
def create_defined_subsets(worksheet, row_nbr, api_key):
"""
generate codelist subsets based on the codelist_subset dictionary created from the mapping spreadsheet
:param worksheet: the Excel worksheet to write the codelist subset values to
:param row_nbr: the current row number in the worksheet (start writing from this row an retrun incremented value)
:param api_key: the Library API key needed to authenticate access to retrieve the codelist
:return: integer: the incremented row_nbr to indicate the current row in the worksheet
"""
cl_count = 0
rows = []
for subset in codelist_subsets:
c_code = subset["oid"].split(".")[1]
# get the complete codelist from the Library to use to populate the subset terms
endpoint = "/mdr/ct/packages/" + package_standard + "-" + package_date + "/codelists/" + c_code
cl = get_codelist_from_library(endpoint, api_key)
order_nbr = 1
cl_count += 1
# find the non-standard terms that extend a codelist and have a c-code == "NA"
submission_values = [term_dict["sub_val"] for term_dict in subset["terms"] if term_dict["c_code"] == "NA"]
term_c_codes = [term_dict["c_code"] for term_dict in subset["terms"]]
is_term_found = False
# populate fields of the subset term from the Library content
for term in cl["terms"]:
if term["conceptId"] in term_c_codes:
row = {key: "" for key in header}
row["OID"] = "CL." + c_code
row["Name"] = cl["submissionValue"]
row["NCI Codelist Code"] = c_code
row["Data Type"] = "text"
row["Order"] = order_nbr
row["Term"] = term["submissionValue"]
row["NCI Term Code"] = term["conceptId"]
row["Decoded Value"] = term["preferredTerm"]
row["Comment"] = ""
row["IsNonStandard"] = ""
row["StandardOID"] = "STD.3"
is_term_found = True
order_nbr += 1
rows.append(row)
# process non-standard terms that extend a codelist
for sv in submission_values:
row = {key: "" for key in header}
row["OID"] = "CL." + c_code
row["Name"] = cl["submissionValue"]
row["NCI Codelist Code"] = c_code
row["Data Type"] = "text"
row["Order"] = order_nbr
row["Term"] = sv
row["Comment"] = ""
row["IsNonStandard"] = "Yes"
row["StandardOID"] = ""
is_term_found = True
order_nbr += 1
rows.append(row)
if not is_term_found:
print(f"No terms found in {c_code}")
row_nbr = write_codelist_to_xls(worksheet, rows, row_nbr)
print(f"added {cl_count} defined subsets...")
return row_nbr
def write_codelist_to_xls(worksheet, rows, row_nbr=0):
"""
write codelist term rows to the odmlib worksheet
:param worksheet: odmlib worksheet object to write to
:param rows: a list of codelist row dictionaries to write to the worksheet
:param row_nbr: integer row number that indicates which worksheet row to begin appending terms
:return: integer row number that indicates the next row to begin appending terms
"""
for row in rows:
row_nbr += 1
for c, col_name in enumerate(header):
worksheet.write(row_nbr, c, row[col_name])
return row_nbr
def write_header_row(worksheet, header_format):
"""
write the worksheet header column labels to the odmlib worksheet
:param worksheet: the odmlib worksheet to write the column headers to
:param header_format: the header format to indicate the style of the header columns
"""
for c, header_label in enumerate(header):
worksheet.write(0, c, header_label, header_format)
def set_cmd_line_args():
"""
get the command-line arguments needed to generate a metadata worksheet for codelists
:return: return the argparse object with the command-line parameters
"""
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--api_key", help="CDISC Library API key",
required=False, dest="api_key", default=library_api_key)
parser.add_argument("-o", "--output_file", help="path and file name of Define-XML v2.1 metadata spreadsheet file",
required=False, dest="define_xls", default=excel_define_file)
args = parser.parse_args()
return args
def main():
"""
main driver for creating the odmlib worksheet and writing codelist and associated terms to it
"""
args = set_cmd_line_args()
workbook = XLS.Workbook(args.define_xls, {"strings_to_numbers": False})
header_format = workbook.add_format({"bold": True, "bg_color": "#CCFFFF", "border": True, "border_color": "black"})
worksheet = workbook.add_worksheet("codelists")
write_header_row(worksheet, header_format)
cl_count = 0
row_nbr = 0
# add codelists
for cl_oid in codelists:
prefix, c_code = cl_oid.split(".")
endpoint = "/mdr/ct/packages/" + package_standard + "-" + package_date + "/codelists/" + c_code
cl = get_codelist_from_library(endpoint, args.api_key)
rows = process_library_codelist(cl_oid, cl)
row_nbr = write_codelist_to_xls(worksheet, rows, row_nbr)
cl_count += 1
print(f"added {cl_count} codelists...")
# add domain codelists
row_nbr = create_domain_codelist_subsets(worksheet, row_nbr, args.api_key)
# add process codelist subsets
row_nbr = create_defined_subsets(worksheet, row_nbr, args.api_key)
workbook.close()
if __name__ == '__main__':
main()