Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#2300: Create a JSON validator that runs on all JSON produced by CI #2315

Merged
merged 12 commits into from
Aug 2, 2024
139 changes: 104 additions & 35 deletions scripts/JSON_data_files_validator.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import re
import sys

try:
Expand All @@ -12,6 +13,7 @@
from collections import Counter
from collections.abc import Iterable
import json
import logging

import brotli
from schema import And, Optional, Schema
Expand Down Expand Up @@ -289,13 +291,26 @@ def validate(self, schema_to_validate: dict):
return self.valid_schema.validate(schema_to_validate)


def get_json(file_path):
    """
    Load a JSON document from disk, transparently handling Brotli compression.

    Decompression is attempted on every file, whatever its extension, because
    a compressed file is not guaranteed to carry the '.br' suffix. When Brotli
    rejects the payload, the bytes read from disk are parsed unchanged.

    :param file_path: Path to the (possibly Brotli-compressed) JSON file.
    :return: The object produced by parsing the UTF-8 decoded content.
    """
    with open(file_path, "rb") as json_file:
        raw_bytes = json_file.read()

    try:
        payload = brotli.decompress(raw_bytes)
    except brotli.error as e:
        # Not a Brotli stream: fall back to the bytes exactly as read.
        logging.debug(f"No decompression applied for {file_path}: {e}")
        payload = raw_bytes

    return json.loads(payload.decode("utf-8"))

class JSONDataFilesValidator:
""" Class validating VT data files according do defined schema. """
def __init__(self, file_path: str = None, dir_path: str = None, file_prefix: str = None, file_suffix: str = None):
def __init__(self, file_path: str = None, dir_path: str = None,
file_prefix: str = None, file_suffix: str = None,
validate_comm_links: bool = False):
self.__file_path = file_path
self.__dir_path = dir_path
self.__file_prefix = file_prefix
self.__file_suffix = file_suffix
self.__validate_comm_links = validate_comm_links
self.__cli()

def __cli(self):
Expand All @@ -306,6 +321,8 @@ def __cli(self):
group.add_argument("--file_path", help="Path to a validated file. Pass only when validating a single file.")
parser.add_argument("--file_prefix", help="File prefix. Optional. Pass only when --dir_path is provided.")
parser.add_argument("--file_suffix", help="File suffix. Optional. Pass only when --dir_path is provided.")
parser.add_argument("--validate_comm_links", help='Verify that comm links reference tasks.', action='store_true')
parser.add_argument("--debug", help="Enable debug logging", action="store_true")
args = parser.parse_args()
if args.file_path:
self.__file_path = os.path.abspath(args.file_path)
Expand All @@ -315,32 +332,24 @@ def __cli(self):
self.__file_prefix = args.file_prefix
if args.file_suffix:
self.__file_suffix = args.file_suffix

@staticmethod
def __check_if_file_exists(file_path: str) -> bool:
""" Check for existence of a given file. Returns True when file exists. """
return os.path.isfile(file_path)

@staticmethod
def __check_if_dir_exists(dir_path: str) -> bool:
""" Check for existence of a given directory. Returns True when file exists. """
return os.path.isdir(dir_path)
self.__validate_comm_links = args.validate_comm_links
logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO, format='%(levelname)s - %(filename)s:%(lineno)d - %(message)s')

@staticmethod
def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str) -> list:
""" Check for existence of a given directory. Returns True when file exists. """
""" Get a sorted list of files from directory. """
list_of_files = os.listdir(dir_path)

if not list_of_files:
sys.excepthook = exc_handler
raise FileNotFoundError(f"Directory: {dir_path} is EMPTY")

if file_prefix is None and file_suffix is None:
print("File prefix and file suffix not given")
logging.info("File prefix and file suffix not given")
file_prefix = Counter([file.split('.')[0] for file in list_of_files]).most_common()[0][0]
print(f"Found most common prefix: {file_prefix}")
logging.info(f"Found most common prefix: {file_prefix}")
file_suffix = Counter([file.split('.')[-1] for file in list_of_files]).most_common()[0][0]
print(f"Found most common suffix: {file_suffix}")
logging.info(f"Found most common suffix: {file_suffix}")

if file_prefix is not None:
list_of_files = [file for file in list_of_files if file.split('.')[0] == file_prefix]
Expand All @@ -352,44 +361,104 @@ def __get_files_for_validation(dir_path: str, file_prefix: str, file_suffix: str
key=lambda x: int(x.split(os.sep)[-1].split('.')[-2]))

@staticmethod
def __validate_file(file_path):
def get_complete_dataset(file_path):
    """
    Collect every JSON file that belongs to the same dataset as `file_path`.

    The dataset prefix is the part of the file name that precedes its last
    '0' character (for 'data.0.json' the prefix is 'data.'). Every entry of
    the containing directory that starts with this prefix and ends with
    '.json' or '.json.br' is part of the result.

    :param file_path: Path to one file of the dataset.
    :return: Sorted list of matching paths, each built by joining the
             directory part of `file_path` with the entry name.
    """
    dirname, basename = os.path.split(file_path)

    # NOTE: when the name contains no '0', rfind() returns -1 and the prefix
    # becomes the name without its last character.
    prefix = basename[:basename.rfind('0')]

    # A bare file name has an empty directory part and os.listdir('') raises
    # FileNotFoundError, so list the current directory in that case.
    entries = os.listdir(dirname or os.curdir)

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # dataset (and therefore its first element) deterministic.
    files = sorted(
        os.path.join(dirname, entry)
        for entry in entries
        if entry.startswith(prefix) and entry.endswith((".json", ".json.br"))
    )
    logging.debug(f"Dataset: {files}")

    return files

@staticmethod
def get_nodes_info(file_path):
    """
    Extract node information from the numbers embedded in a file name.

    Only the base name of `file_path` is inspected. The result is a pair of
    strings; '-1' marks a value that could not be read from the name:

    * no number in the name   -> ('-1', '-1')
    * exactly one number      -> ('-1', <that number>)
    * two or more numbers     -> (<first number>, <second number>)
    """
    numbers = re.findall(r'\d+', os.path.basename(file_path))

    if len(numbers) >= 2:
        return numbers[0], numbers[1]
    if numbers:
        return '-1', numbers[0]
    return '-1', '-1'

def __validate_file(self, file_path):
""" Validates the file against the schema. """
print(f"Validating file: {file_path}")
with open(file_path, "rb") as compr_json_file:
compr_bytes = compr_json_file.read()
try:
decompr_bytes = brotli.decompress(compr_bytes)
decompressed_dict = json.loads(decompr_bytes.decode("utf-8"))
except brotli.error:
decompressed_dict = json.loads(compr_bytes.decode("utf-8"))
logging.info(f"Validating file: {file_path}")
json_data = get_json(file_path)

# Extracting type from JSON data
schema_type = None
if decompressed_dict.get("metadata") is not None:
schema_type = decompressed_dict.get("metadata").get("type")
if json_data.get("metadata") is not None:
schema_type = json_data.get("metadata").get("type")
else:
if decompressed_dict.get("type") is not None:
schema_type = decompressed_dict.get("type")
if json_data.get("type") is not None:
schema_type = json_data.get("type")

if schema_type is not None:
# Validate schema
if SchemaValidator(schema_type=schema_type).is_valid(schema_to_validate=decompressed_dict):
print(f"Valid JSON schema in {file_path}")
if SchemaValidator(schema_type=schema_type).is_valid(schema_to_validate=json_data):
logging.info(f"Valid JSON schema in {file_path}")
else:
print(f"Invalid JSON schema in {file_path}")
SchemaValidator(schema_type=schema_type).validate(schema_to_validate=decompressed_dict)
logging.error(f"Invalid JSON schema in {file_path}")
SchemaValidator(schema_type=schema_type).validate(schema_to_validate=json_data)
else:
print(f"Schema type not found in file: {file_path}. \nPassing by default when schema type not found.")
logging.warning(f"Schema type not found in file: {file_path}. \n"
"Passing by default when schema type not found.")

if self.__validate_comm_links and schema_type == "LBDatafile":
num_nodes, current_node = self.get_nodes_info(file_path)
if num_nodes == '-1' and current_node == '-1':
# validate single file
all_jsons = [json_data]
elif current_node == '0':
# validate complete dataset
dataset_files = self.get_complete_dataset(file_path)
all_jsons = [get_json(file) for file in dataset_files]
else:
# this dataset is already validated
return

if not self.validate_comm_links(all_jsons):
logging.error(f" Invalid dataset for file: {file_path}!")


@staticmethod
def validate_comm_links(all_jsons):
    """
    Check that every task referenced by a communication exists in the dataset.

    Phases are matched by position: for each phase index, the ids found in
    the "from" and "to" ends of all communications are gathered across all
    files and compared with the ids of all task entities gathered across the
    same files. The first phase with an unknown id is logged as an error.

    :param all_jsons: List of parsed JSON documents, each holding a "phases"
                      list whose items contain "tasks" and optionally
                      "communications".
    :return: True when all referenced ids are present (also for an empty
             dataset), False as soon as one phase references a missing task.
    """
    # An empty dataset cannot hold a dangling reference.
    if not all_jsons:
        return True

    # Files may not hold the same number of phases; cover the longest one so
    # that no phase is skipped, and ignore files that lack a given phase.
    num_phases = max(len(data["phases"]) for data in all_jsons)

    for n in range(num_phases):
        comm_ids = set()
        task_ids = set()

        for data in all_jsons:
            phases = data["phases"]
            if n >= len(phases):
                continue
            phase = phases[n]

            comms = phase.get("communications")
            if comms is not None:
                for comm in comms:
                    comm_ids.add(int(comm["from"]["id"]))
                    comm_ids.add(int(comm["to"]["id"]))

            task_ids.update(int(task["entity"]["id"]) for task in phase["tasks"])

        missing_ids = comm_ids - task_ids
        if missing_ids:
            logging.error(
                f" Phase {n}: Task ids: {missing_ids}. Tasks are "
                "referenced in communication, but are not present in the "
                "dataset."
            )
            return False
    return True

def main(self):
if self.__file_path is not None:
if self.__check_if_file_exists(file_path=self.__file_path):
if os.path.isfile(self.__file_path):
self.__validate_file(file_path=self.__file_path)
else:
sys.excepthook = exc_handler
raise FileNotFoundError(f"File: {self.__file_path} NOT found")
elif self.__dir_path is not None:
if self.__check_if_dir_exists(dir_path=self.__dir_path):
if os.path.isdir(self.__dir_path):
list_of_files_for_validation = self.__get_files_for_validation(dir_path=self.__dir_path,
file_prefix=self.__file_prefix,
file_suffix=self.__file_suffix)
Expand Down
6 changes: 1 addition & 5 deletions scripts/check_lb_data_files.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,8 @@ cd "$path_to_vt_build_dir" || exit 1

function run_schema_validator() {
file=$1
echo "Running schema validator on: $file"
if python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file"
if ! python3 "${path_to_vt_src_dir}/scripts/JSON_data_files_validator.py" --file_path="$file" --validate_comm_links
then
echo "Valid file"
else
>&2 echo "Invalid schema in $file.. exiting"
exit 1;
fi
}
Expand Down