-
Notifications
You must be signed in to change notification settings - Fork 2
/
validate_ndjson_json_schema.py
52 lines (42 loc) · 2 KB
/
validate_ndjson_json_schema.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
from linkml.validator import validate
import ndjson
import json
import os
from jsonschema import validate
"""
Validates all Dataset-JSON v1.1 NDJSON examples. Because it validates 1 line at a time this runs slowly.
"""
def validate_dataset(dataset_filename, standard, json_schema) -> None:
    """Validate one NDJSON dataset file against the JSON schema and print the result.

    The first record is checked against ``json_schema["$defs"]["DatasetMetadata"]``.
    Every later record is wrapped as ``{"rows": record}`` and checked against
    ``json_schema["$defs"]["RowData"]``. Problems are printed and counted rather
    than raised, so one bad record does not stop the remaining records from
    being checked. A success message is printed only when no problem was found.

    ``validate`` is the jsonschema function: it is the last binding of that
    name in the module's import header.

    Args:
        dataset_filename: Path of the NDJSON file; opened as UTF-8 text.
        standard: Not used by this function; kept for interface compatibility.
        json_schema: Parsed schema mapping that provides the ``$defs`` entries.
    """
    # Imported locally because the module header only imports ``validate``.
    from jsonschema.exceptions import ValidationError

    error_count = 0
    with open(dataset_filename, mode='r', encoding='utf-8') as f:
        records = iter(ndjson.reader(f))
        line_num = 0
        while True:
            line_num += 1
            # Decoding happens while the reader advances, so the decode error
            # has to be caught around next(), not around the schema check.
            try:
                json_line = next(records)
            except StopIteration:
                break
            except json.JSONDecodeError as decode_err:
                error_count += 1
                print(f"Invalid json on line {line_num} for dataset {dataset_filename}.\n{decode_err}")
                continue

            # Schema violations are reported by jsonschema as ValidationError.
            try:
                if line_num > 1:
                    validate(instance={"rows": json_line}, schema=json_schema["$defs"]["RowData"])
                else:
                    validate(instance=json_line, schema=json_schema["$defs"]["DatasetMetadata"])
            except ValidationError as schema_err:
                error_count += 1
                print(
                    f"Schema validation failed on line {line_num} for dataset "
                    f"{dataset_filename}.\n{schema_err.message}"
                )
    if not error_count:
        print(f"NDJSON dataset {dataset_filename} is valid based on the JSON schema.")
def convert_example_datasets(datasets, standard, json_schema) -> None:
    """Run ``validate_dataset`` on each named example dataset of one standard.

    Each file is looked up at ``<cwd>/data/<standard>-ndjson/<dataset>.ndjson``.

    Args:
        datasets: Iterable of dataset names, without the ``.ndjson`` extension.
        standard: Name of the standard; selects the ``<standard>-ndjson`` folder.
        json_schema: Parsed schema mapping passed through to ``validate_dataset``.
    """
    for name in datasets:
        folder = standard + "-ndjson"
        ndjson_path = os.path.join(os.getcwd(), "data", folder, name + ".ndjson")
        validate_dataset(ndjson_path, standard, json_schema)
def get_ndjson_json_schema(schema_file):
    """Load the JSON schema stored in ``schema_file`` and return the parsed object.

    ``schema_file`` is joined onto the current working directory, so a relative
    name is resolved there.
    """
    location = os.path.join(os.getcwd(), schema_file)
    with open(location, mode='rb') as handle:
        return json.load(handle)
if __name__ == '__main__':
    # Earlier approach, kept as a note: read a {standard: [dataset, ...]} mapping
    # from data/dataset-list.json and validate every dataset it lists.
    # To check one dataset only, pass a single-item list such as ["dd"].
    json_schema = get_ndjson_json_schema("dataset-ndjson-schema.json")
    sdtm_examples = ["ae", "cm", "relrec", "suppdm", "fa", "tv", "vs", "dd"]
    convert_example_datasets(sdtm_examples, "sdtm", json_schema)