diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..4d6b15e --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,64 @@ +fail_fast: false + +repos: +- repo: https://github.com/python-poetry/poetry + rev: 1.7.0 + hooks: + - id: poetry-check + +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.1.9 + hooks: + - id: ruff + args: [--fix] + - id: ruff-format + +- repo: https://github.com/pre-commit/mirrors-mypy + rev: v1.8.0 + hooks: + - id: mypy + args: [--ignore-missing-imports] + additional_dependencies: + - types-requests + - types-toml + - types-PyYAML +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.12.0 + hooks: + - id: pretty-format-yaml + args: + - --autofix + - --indent=2 + - id: pretty-format-toml + exclude: ^poetry.lock$ + args: + - --autofix + - --indent=2 + - --no-sort + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-case-conflict + - id: end-of-file-fixer + - id: mixed-line-ending + args: + - --fix=lf + - id: trailing-whitespace + - id: pretty-format-json + args: + - --autofix + - --indent=4 + - --no-sort-keys + - id: check-merge-conflict + - id: check-yaml + - id: check-json + - id: check-toml + +- repo: local + hooks: + - id: yaml-file-extension + name: Prefer .yaml over .yml. + entry: YAML files must have .yaml extension. + language: fail + files: \.yml$ diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..cebba76 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ + +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [1.0.0] + +Initial release. See [README](https://github.com/FCP-INDI/C-PAC_regression_dashboard/blob/v1.0.0/README.md). 
+ +[1.0.0]: https://github.com/FCP-INDI/C-PAC_regression_dashboard/releases/tag/v1.0.0 diff --git a/calculate_correlations.py b/calculate_correlations.py deleted file mode 100644 index 062d0e0..0000000 --- a/calculate_correlations.py +++ /dev/null @@ -1,845 +0,0 @@ -#!/usr/bin/env python - -from typing import Optional, NamedTuple, Tuple, Union -from utils.html_script import body - -import os -import numpy as np -import pandas as pd - -from multiprocessing import Pool - -Axis = Union[int, Tuple[int, ...]] - -class CorrValue(NamedTuple): - concor: np.ndarray - pearson: np.ndarray - -def read_yml_file(yml_filepath): - import yaml - with open(yml_filepath,"r") as f: - yml_dict = yaml.safe_load(f) - - return yml_dict - -def write_yml_file(yml_dict, out_filepath): - import yaml - with open(out_filepath, "wt") as f: - yaml.safe_dump(yml_dict, f) - -def read_pickle(pickle_file): - import pickle - with open(pickle_file, "rb") as f: - dct = pickle.load(f) - return dct - -def write_pickle(dct, out_filepath): - import pickle - with open(out_filepath, "wb") as f: - pickle.dump(dct, f, protocol=pickle.HIGHEST_PROTOCOL) - -def read_txt_file(txt_file): - with open(txt_file,"r") as f: - strings = f.read().splitlines() - return strings - -def write_txt_file(text_lines, out_filepath): - with open(out_filepath, "wt") as f: - for line in text_lines: - f.write("{0}\n".format(line)) - -def write_dct(dct=None, text_lines=None, outname=None): - if not dct: - dct = {outname: text_lines} - else: - dct.update({outname: text_lines}) - return dct - -def gather_local_filepaths(output_folder_path): - import os - filepaths = [] - - print("Gathering file paths from {0}\n".format(output_folder_path)) - for root, dirs, files in os.walk(output_folder_path): - # loops through every file in the directory - for filename in files: - # checks if the file is a nifti (.nii.gz) - if '.nii' in filename or '.csv' in filename or '.txt' in filename \ - or '.1D' in filename or '.tsv' in filename: - filepaths.append(os.path.join(root, filename)) - - if len(filepaths) == 0: - err = "\n\n[!] No filepaths were found given the output folder!\n\n" - raise Exception(err) - - return filepaths - -def batch_correlate( - x: np.ndarray, y: np.ndarray, axis: Optional[Axis] = None -) -> CorrValue: - """ - Compute a batch of concordance and Pearson correlation coefficients between - x and y along an axis (or axes). - - References: - https://en.wikipedia.org/wiki/Concordance_correlation_coefficient - """ - # Summary stats for x - x_mean = np.mean(x, axis=axis, keepdims=True) - x_var = np.var(x, axis=axis, keepdims=True) - x_std = np.sqrt(x_var) - # NOTE: Not trying to fix NaNs - x_norm = (x - x_mean) / x_std - - # Summary stats for y - y_mean = np.mean(y, axis=axis, keepdims=True) - y_var = np.var(y, axis=axis, keepdims=True) - y_std = np.sqrt(y_var) - y_norm = (y - y_mean) / y_std - - # Correlation coefficients - pearson = np.mean(x_norm * y_norm, axis=axis, keepdims=True) - concor = 2 * pearson * x_std * y_std / (x_var + y_var + (x_mean - y_mean) ** 2) - - # Squeeze reduced singleton dimensions - if axis is not None: - concor = np.squeeze(concor, axis=axis) - pearson = np.squeeze(pearson, axis=axis) - return CorrValue(concor, pearson) - -def correlate_text_based(txt1, txt2): - # TODO: why do we drop columns containing na? 
- oned_one = pd.read_csv(txt1, delimiter=None, comment="#").dropna(axis=1).values - oned_two = pd.read_csv(txt2, delimiter=None, comment="#").dropna(axis=1).values - - concor, pearson = batch_correlate(oned_one, oned_two, axis=0) - concor = np.nanmean(concor) - pearson = np.nanmean(pearson) - return concor, pearson - -def create_unique_file_dict(filepaths, output_folder_path, replacements=None): - - # filepaths: - # list of output filepaths from a CPAC output directory - # output_folder_path: - # the CPAC output directory the filepaths are from - # replacements: - # (optional) a list of strings to be removed from the filepaths should - # they occur - - # output - # files_dict - # a dictionary of dictionaries, format: - # files_dict["centrality"] = - # {("centrality", midpath, nums): , ..} - - files_dict = {} - - for filepath in filepaths: - - if "_stack" in filepath: - continue - - if ("itk" in filepath) or ("xfm" in filepath) or ("montage" in filepath): - continue - path_changes = [] - real_filepath = filepath - if replacements: - for word_couple in replacements: - if "," not in word_couple: - err = "\n\n[!] In the replacements text file, the old " \ - "substring and its replacement must be separated " \ - "by a comma.\n\n" - raise Exception(err) - word = word_couple.split(",")[0] - new = word_couple.split(",")[1] - if word in filepath: - path_changes.append("old: {0}".format(filepath)) - filepath = filepath.replace(word, new) - path_changes.append("new: {0}".format(filepath)) - if path_changes: - import os - with open(os.path.join(os.getcwd(), "path_changes.txt"), "wt") as f: - for path in path_changes: - f.write(path) - f.write("\n") - - filename = filepath.split("/")[-1] - - # name of the directory the file is in - folder = filepath.split("/")[-2] - - midpath = filepath.replace(output_folder_path, "") - midpath = midpath.replace(filename, "") - - pre180 = False - if pre180: - # name of the output type/derivative - try: - category = midpath.split("/")[2] - except IndexError as e: - continue - - if "eigenvector" in filepath: - category = category + ": eigenvector" - if "degree" in filepath: - category = category + ": degree" - if "lfcd" in filepath: - category = category + ": lfcd" - else: - tags = [] - category = filename - category = category.rstrip('.gz').rstrip('.nii') - - excl_tags = ['sub-', 'ses-', 'task-', 'run-', 'acq-'] - - # len(filetag) == 1 is temporary for broken/missing ses-* tag - for filetag in filename.split("_"): - for exctag in excl_tags: - if exctag in filetag or len(filetag) == 1: - category = category.replace(f'{filetag}_', '') - - # this provides a way to safely identify the specific file - # without relying on a full string of the filename (because - # this can change between versions depending on what any given - # processing tool appends to output file names) - nums_in_folder = [int(s) for s in folder if s.isdigit()] - nums_in_filename = [int(s) for s in filename if s.isdigit()] - - file_nums = '' - - for num in nums_in_folder: - file_nums = file_nums + str(num) - - for num in nums_in_filename: - file_nums = file_nums + str(num) - - # load these settings into the tuple so that the file can be - # identified without relying on its full path (as it would be - # impossible to match files from two regression tests just - # based on their filepaths) - file_tuple = (category, midpath, file_nums) - - temp_dict = {} - temp_dict[file_tuple] = [real_filepath] - - if category not in files_dict.keys(): - files_dict[category] = {} - - files_dict[category].update(temp_dict) 
- - return files_dict - - -def gather_all_files(input_dct, pickle_dir, source='output_dir'): - - file_dct_list = [] - - for key, pipe_dct in input_dct['pipelines'].items(): - - pipe_outdir = pipe_dct[source] - - if input_dct['settings']['s3_creds']: - if not "s3://" in pipe_outdir: - err = "\n\n[!] If pulling output files from an S3 bucket, the "\ - "output folder path must have the s3:// prefix.\n\n" - raise Exception(err) - else: - pipe_outdir = os.path.abspath(pipe_outdir).rstrip('/') - - pipeline_name = pipe_outdir.split('/')[-1] - - #if source == "output_dir" and "pipeline_" not in pipeline_name: - # err = "\n\n[!] Your pipeline output directory has to be a specific " \ - # "one that has the 'pipeline_' prefix.\n\n(Not the main output " \ - # "directory that contains all of the 'pipeline_X' subdirectories," \ - # "and not a specific participant's output subdirectory either.)\n" - # raise Exception(err) - - output_pkl = os.path.join(pickle_dir, "{0}_{1}_paths.p".format(key, source)) - - if os.path.exists(output_pkl): - print("Found output list pickle for {0}, skipping output file" \ - "path parsing..".format(key)) - pipeline_files_dct = read_pickle(output_pkl) - else: - pipeline_files_list = gather_local_filepaths(pipe_outdir) - - pipeline_files_dct = create_unique_file_dict(pipeline_files_list, - pipe_outdir, - pipe_dct['replacements']) - - write_pickle(pipeline_files_dct, output_pkl) - - file_dct_list.append(pipeline_files_dct) - - return (file_dct_list[0], file_dct_list[1]) - -def match_filepaths(old_files_dict, new_files_dict): - """Returns a dictionary mapping each filepath from the first CPAC run to the - second one, matched to derivative, strategy, and scan. - - old_files_dict: each key is a derivative name, and each value is another - dictionary keying (derivative, mid-path, last digit in path) - tuples to a list containing the full filepath described by - the tuple that is the key - new_files_dict: same as above, but for the second CPAC run - - matched_path_dict: same as the input dictionaries, except the list in the - sub-dictionary value has both file paths that are matched - """ - - # file path matching - matched_path_dict = {} - missing_in_old = [] - missing_in_new = [] - - for key in new_files_dict: - # for types of derivative... - if key in old_files_dict.keys(): - for file_id in new_files_dict[key]: - if file_id in old_files_dict[key].keys(): - - if key not in matched_path_dict.keys(): - matched_path_dict[key] = {} - - matched_path_dict[key][file_id] = \ - old_files_dict[key][file_id] + new_files_dict[key][file_id] - - else: - missing_in_old.append(file_id)#new_files_dict[key][file_id]) - else: - missing_in_old.append(new_files_dict[key]) - - # find out what is in the last version's outputs that isn't in the new - # version's outputs - for key in old_files_dict: - if new_files_dict.get(key) != None: - missing_in_new.append(old_files_dict[key]) - - if len(matched_path_dict) == 0: - err = "\n\n[!] 
No output paths were successfully matched between " \ - "the two CPAC output directories!\n\n" - raise Exception(err) - - matched_files_dct = { - "matched": matched_path_dict, - "missing_old": missing_in_old, - "missing_new": missing_in_new - } - - return matched_files_dct - -def calculate_correlation(args_tuple): - - import os - import subprocess - import nibabel as nb - import numpy as np - import scipy.stats.mstats - import scipy.stats - import math - - category = args_tuple[0] - old_path = args_tuple[1] - new_path = args_tuple[2] - local_dir = args_tuple[3] - s3_creds = args_tuple[4] - verbose = args_tuple[5] - - if verbose: - print("Calculating correlation between {0} and {1}".format(old_path, new_path)) - - corr_tuple = None - - if s3_creds: - try: - # full filepath with filename - old_local_file = os.path.join(local_dir, "s3_input_files", \ - old_path.replace("s3://","")) - # directory without filename - old_local_path = old_local_file.replace(old_path.split("/")[-1],"") - - new_local_file = os.path.join(local_dir, "s3_input_files", \ - new_path.replace("s3://","")) - new_local_path = new_local_file.replace(new_path.split("/")[-1],"") - - if not os.path.exists(old_local_path): - os.makedirs(old_local_path) - if not os.path.exists(new_local_path): - os.makedirs(new_local_path) - - except Exception as e: - err = "\n\nLocals: {0}\n\n[!] Could not create the local S3 " \ - "download directory.\n\nError details: {1}\n\n".format((locals(), e)) - raise Exception(e) - - try: - old_path = old_local_file - except Exception as e: - err = "\n\nLocals: {0}\n\n[!] Could not download the files from " \ - "the S3 bucket. \nS3 filepath: {1}\nLocal destination: {2}" \ - "\nS3 creds: {3}\n\nError details: {4}\n\n".format(locals(), - old_path, - old_local_path, - s3_creds, e) - raise Exception(e) - - try: - new_path = new_local_file - except Exception as e: - err = "\n\nLocals: {0}\n\n[!] Could not download the files from " \ - "the S3 bucket. 
\nS3 filepath: {1}\nLocal destination: {2}" \ - "\nS3 creds: {3}\n\nError details: {4}\n\n".format(locals(), - new_path, - new_local_path, - s3_creds, e) - raise Exception(e) - - ## nibabel to pull the data from the re-assembled file paths - if os.path.exists(old_path) and os.path.exists(new_path): - - if ('.csv' in old_path and '.csv' in new_path) or \ - ('spatial_map_timeseries.txt' in old_path and 'spatial_map_timeseries.txt' in new_path) or \ - ('.1D' in old_path and '.1D' in new_path) or \ - ('.tsv' in old_path and '.tsv' in new_path): - try: - concor, pearson = correlate_text_based(old_path, new_path) - - if concor > 0.980: - corr_tuple = (category, [concor], [pearson]) - else: - corr_tuple = (category, [concor], [pearson], (old_path, new_path)) - if verbose: - print("Success - {0}".format(str(concor))) - - except Exception as e: - corr_tuple = ("file reading problem: {0}".format(e), - old_path, new_path) - if verbose: - print(str(corr_tuple)) - - return corr_tuple - - else: - try: - old_file_img = nb.load(old_path) - old_file_hdr = old_file_img.header - new_file_img = nb.load(new_path) - new_file_hdr = new_file_img.header - - old_file_dims = old_file_hdr.get_zooms() - new_file_dims = new_file_hdr.get_zooms() - - data_1 = nb.load(old_path).get_fdata() - data_2 = nb.load(new_path).get_fdata() - - except Exception as e: - corr_tuple = ("file reading problem: {0}".format(e), - old_path, new_path) - if verbose: - print(str(corr_tuple)) - return corr_tuple - - ## set up and run the Pearson correlation and concordance correlation - if data_1.flatten().shape == data_2.flatten().shape: - try: - if len(old_file_dims) > 3: - axis = tuple(range(3, len(old_file_dims))) - concor, pearson = batch_correlate(data_1, data_2, axis=axis) - concor = np.nanmean(concor) - pearson = np.nanmean(pearson) - else: - concor, pearson = batch_correlate(data_1, data_2) - except Exception as e: - corr_tuple = ("correlating problem: {0}".format(e), - old_path, new_path) - if verbose: - print(str(corr_tuple)) - return corr_tuple - if concor > 0.980: - corr_tuple = (category, [concor], [pearson]) - else: - corr_tuple = (category, [concor], [pearson], (old_path, new_path)) - if verbose: - print("Success - {0}".format(str(concor))) - else: - corr_tuple = ("different shape", old_path, new_path) - if verbose: - print(str(corr_tuple)) - - else: - if not os.path.exists(old_path): - corr_tuple = ("file doesn't exist", [old_path], None) - if verbose: - print(str(corr_tuple)) - if not os.path.exists(new_path): - if not corr_tuple: - corr_tuple = ("file doesn't exist", [new_path], None) - if verbose: - print(str(corr_tuple)) - else: - corr_tuple = ("file doesn't exist", old_path, new_path) - if verbose: - print(str(corr_tuple)) - - return corr_tuple - -def run_correlations(matched_dct, input_dct, source='output_dir', quick=False, verbose=False): - - all_corr_dct = { - 'pearson': {}, - 'concordance': {}, - 'sub_optimal': {} - } - - args_list = [] - - quick_list = [ - 'anatomical_brain', - 'anatomical_csf_mask', - 'anatomical_gm_mask', - 'anatomical_wm_mask', - 'anatomical_to_standard', - 'functional_preprocessed', - 'functional_brain_mask', - 'mean_functional_in_anat', - 'functional_nuisance_residuals', - 'functional_nuisance_regressors', - 'functional_to_standard', - 'roi_timeseries' - ] - - matched_path_dct = matched_dct['matched'] - output_dir = input_dct['settings']['correlations_dir'] - s3_creds = input_dct['settings']['s3_creds'] - - for category in matched_path_dct.keys(): - - if quick: - if category not in quick_list: - 
continue - - for file_id in matched_path_dct[category].keys(): - - old_path = matched_path_dct[category][file_id][0] - new_path = matched_path_dct[category][file_id][1] - - if source == 'work_dir': - args_list.append((file_id, old_path, new_path, output_dir, s3_creds, verbose)) - else: - args_list.append((category, old_path, new_path, output_dir, s3_creds, verbose)) - - print("\nNumber of correlations to calculate: {0}\n".format(len(args_list))) - - print("Running correlations...") - p = Pool(input_dct['settings']['n_cpus']) - corr_tuple_list = p.map(calculate_correlation, args_list) - p.close() - p.join() - - print("\nCorrelations of the {0} are done.\n".format(source)) - - for corr_tuple in corr_tuple_list: - if not corr_tuple: - continue - if corr_tuple[0] not in all_corr_dct['concordance'].keys(): - all_corr_dct['concordance'][corr_tuple[0]] = [] - if corr_tuple[0] not in all_corr_dct['pearson'].keys(): - all_corr_dct['pearson'][corr_tuple[0]] = [] - all_corr_dct['concordance'][corr_tuple[0]] += corr_tuple[1] - all_corr_dct['pearson'][corr_tuple[0]] += corr_tuple[2] - - if len(corr_tuple) > 3: - if corr_tuple[0] not in all_corr_dct['sub_optimal'].keys(): - all_corr_dct['sub_optimal'][corr_tuple[0]] = [] - try: - all_corr_dct['sub_optimal'][corr_tuple[0]].append("{0}:\n{1}\n{2}" - "\n\n".format(corr_tuple[1][0], - corr_tuple[3][0], - corr_tuple[3][1])) - except TypeError: - pass - - return all_corr_dct - -def post180_organize_correlations(concor_dct, corr_type="concordance", quick=False): - - corr_map_dct = {"correlations": {}} - for key in concor_dct: - if "problem" in key: - continue - # shouldn't need this - FIX - rawkey = key.replace('acq-', '').replace('run-', '') - datatype = rawkey.split("_")[-1] - - if datatype not in corr_map_dct["correlations"]: - corr_map_dct["correlations"][datatype] = {} - corr_map_dct["correlations"][datatype][rawkey] = concor_dct[key] - - return corr_map_dct - - -def organize_correlations(concor_dict, corr_type="concordance", quick=False): - # break up all of the correlations into groups - each group of derivatives - # will go into its own boxplot - - regCorrMap = {} - native_outputs = {} - template_outputs = {} - timeseries = {} - functionals = {} - - core = {} - - corr_map_dict = {} - corr_map_dict["correlations"] = {} - - derivs = [ - 'alff', - 'dr_tempreg', - 'reho', - 'sca_roi', - 'timeseries', - 'ndmg'] - anats = [ - 'anatomical', - 'seg' - ] - time_series = [ - 'functional_freq', - 'nuisance_residuals', - 'functional_preprocessed', - 'functional_to_standard', - 'ica_aroma_', - 'motion_correct', - 'slice_time', - ] - funcs = [ - 'functional', - 'displacement'] - - for key in concor_dict: - - if quick: - core[key] = concor_dict[key] - continue - - if 'xfm' in key or 'mixel' in key: - continue - - if 'centrality' in key or 'vmhc' in key or 'sca_tempreg' in key: - template_outputs[key] = concor_dict[key] - continue - - for word in anats: - if word in key: - regCorrMap[key] = concor_dict[key] - continue - - for word in derivs: - if word in key and 'standard' not in key: - native_outputs[key] = concor_dict[key] - continue - elif word in key: - template_outputs[key] = concor_dict[key] - continue - - for word in time_series: - if word in key and 'mean' not in key and 'mask' not in key: - timeseries[key] = concor_dict[key] - continue - - for word in funcs: - if word in key: - functionals[key] = concor_dict[key] - - if quick: - group = "{0}_core_outputs".format(corr_type) - if len(core.values()) > 0: - corr_map_dict["correlations"][group] = core - else: - 
print("No values in {0}".format(group)) - return corr_map_dict - - group = "{0}_registration_and_segmentation".format(corr_type) - if len(regCorrMap.values()) > 0: - corr_map_dict["correlations"][group] = regCorrMap - else: - print("No values in {0}".format(group)) - - group = "{0}_native_space_outputs".format(corr_type) - if len(native_outputs.values()) > 0: - corr_map_dict["correlations"][group] = native_outputs - else: - print("No values in {0}".format(group)) - - group = "{0}_template_space_outputs".format(corr_type) - if len(template_outputs.values()) > 0: - corr_map_dict["correlations"][group] = template_outputs - else: - print("No values in {0}".format(group)) - - group = "{0}_timeseries_outputs".format(corr_type) - if len(timeseries.values()) > 0: - corr_map_dict["correlations"][group] = timeseries - else: - print("No values in {0}".format(group)) - - group = "{0}_functional_outputs".format(corr_type) - if len(functionals.values()) > 0: - corr_map_dict["correlations"][group] = functionals - else: - print("No values in {0}".format(group)) - - return corr_map_dict - -def quick_summary(dct, corr_map_dct, output_dir): - for corr_group in corr_map_dct["correlations"].keys(): - cat_dct = {} - lines = [] - for output_type, corr_vec in dict(corr_map_dct["correlations"][corr_group]).items(): - try: - corrmean = np.mean(np.asarray(corr_vec)) - except TypeError: - continue - lines.append("{0}: {1}".format(output_type, corrmean)) - - dct = write_dct(dct, lines, output_type) - return(dct) - -def compare_pipelines(input_dct, dir_type='output_dir'): - - output_dir = input_dct['settings']['output_dir'] - pickle_dir = input_dct['settings']['pickle_dir'] - - corrs_pkl = os.path.join(pickle_dir, "{0}_correlations.p".format(dir_type)) - matched_pkl = os.path.join(pickle_dir, "{0}_matched_files.p".format(dir_type)) - - all_corr_dct = None - if os.path.exists(corrs_pkl): - print("\n\nFound the correlations pickle: {0}\n\n" - "Starting from there..\n".format(corrs_pkl)) - all_corr_dct = read_pickle(corrs_pkl) - elif os.path.exists(matched_pkl): - print("\n\nFound the matched filepaths pickle: {0}\n\n" - "Starting from there..\n".format(matched_pkl)) - matched_dct = read_pickle(matched_pkl) - - else: - # gather all relevant output and working files - outfiles1_dct, outfiles2_dct = gather_all_files(input_dct, pickle_dir, - source=dir_type) - - matched_dct = match_filepaths(outfiles1_dct, outfiles2_dct) - write_pickle(matched_dct, matched_pkl) - - if not all_corr_dct: - all_corr_dct = run_correlations(matched_dct, - input_dct, - source=dir_type, - quick=input_dct['settings']['quick'], - verbose=input_dct['settings']['verbose']) - write_pickle(all_corr_dct, corrs_pkl) - - if dir_type == 'work_dir': - sorted_vals = [] - #sorted_keys = sorted(all_corr_dct, key=all_corr_dct.get) - for key in all_corr_dct.keys(): #sorted_keys: - if 'file reading problem:' in key or 'different shape' in key or 'correlating problem' in key: - continue - else: - sorted_vals.append("{0}: {1}".format(all_corr_dct[key], key)) - working_corrs_file = os.path.join(output_dir, "work_dir_correlations.txt") - with open(working_corrs_file, 'wt') as f: - for line in sorted_vals: - f.write(line) - f.write("\n") - - else: - pre180 = False - if pre180: - organize = organize_correlations - else: - organize = post180_organize_correlations - - corr_map_dict = organize(all_corr_dct["concordance"], "concordance", - quick=input_dct['settings']['quick']) - corr_map_dict["pipeline_names"] = input_dct["pipelines"].keys() - - pearson_map_dict = 
organize(all_corr_dct["pearson"], "pearson", - quick=input_dct['settings']['quick']) - pearson_map_dict["pipeline_names"] = input_dct["pipelines"].keys() - dct = {} - corr_map = quick_summary(dct, corr_map_dict, output_dir) - pearson_map = quick_summary(dct, pearson_map_dict, output_dir) - - if all_corr_dct['sub_optimal']: - write_yml_file(all_corr_dct['sub_optimal'], os.path.join(output_dir, "sub_optimal.yml")) - - #for corr_group_name in corr_map_dict["correlations"].keys(): - # corr_group = corr_map_dict["correlations"][corr_group_name] - # create_boxplot(corr_group, corr_group_name, - # corr_map_dict["pipeline_names"], output_dir) - - #for corr_group_name in pearson_map_dict["correlations"].keys(): - # corr_group = pearson_map_dict["correlations"][corr_group_name] - # create_boxplot(corr_group, corr_group_name, - # pearson_map_dict["pipeline_names"], output_dir) - return(corr_map, pearson_map) - -def main(): - - import os - import argparse - - from multiprocessing import Pool - import itertools - - parser = argparse.ArgumentParser() - parser.add_argument("input_yaml", type=str, - help="file path of the script's input YAML") - parser.add_argument("--data_source", type=str, - help="Which site data comes from") - parser.add_argument("--branch", type=str, - help="Branch name") - args = parser.parse_args() - data_source = args.data_source - branch = args.branch - - # get the input info - input_dct = read_yml_file(args.input_yaml) - - # check for already completed stuff (pickles) - output_dir = os.path.join(os.getcwd(), - "correlations_{0}".format(input_dct['settings']['run_name'])) - pickle_dir = os.path.join(output_dir, "pickles") - - if not os.path.exists(pickle_dir): - try: - os.makedirs(pickle_dir) - except: - err = "\n\n[!] Could not create the output directory for the " \ - "correlations. Do you have write permissions?\nAttempted " \ - "output directory: {0}\n\n".format(output_dir) - raise Exception(err) - - input_dct['settings'].update({'output_dir': output_dir}) - input_dct['settings'].update({'pickle_dir': pickle_dir}) - - corr_map, pearson_map = compare_pipelines(input_dct, dir_type='output_dir') - corr_map_keys = list(corr_map.keys()) - all_keys = [] - for key in corr_map_keys: - keys = list(corr_map[key]) - for i in keys: - all_keys.append(i) - return all_keys, data_source, branch - - -if __name__ == "__main__": - all_keys, data_source, branch = main() - html_body = body(all_keys, data_source) - file = open(f"{data_source}_{branch}.json","w") - file.write(html_body) - file.close() \ No newline at end of file diff --git a/create_yml.py b/create_yml.py deleted file mode 100644 index a7238ea..0000000 --- a/create_yml.py +++ /dev/null @@ -1,29 +0,0 @@ -from utils.parse_yaml import cpac_yaml - -import os -import click - -@click.command() -@click.option('--pipeline1', required=True, type=str, help='Path to output directory from CPAC run ' - 'to correlate against pipeline2') -@click.option('--pipeline2', required=True, type=str, help='Path to output directory from CPAC run ' - 'to correlate against pipeline1') -@click.option('--workspace', type=str, help = 'directory to save correlations') -@click.option('--branch', type=str, help = 'branch name') -@click.option('--data_source', type=str, help = 'Data site') - - -def main(pipeline1, pipeline2, workspace, branch, data_source): - """ - Correlate outputs from regression run again another C-PAC version. 
- """ - - git_home = os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + os.sep + os.pardir) - run_name = f'{branch}_{data_source}' - - cpac_yaml(pipeline1, pipeline2, f'{workspace}/correlations', run_name, 1, branch, data_source) - - return - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..ac04376 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,89 @@ +[tool.poetry] +name = "cpac_regression_dashboard" +version = "1.0.0" +description = "Generate a dashboard for C-PAC regression tests" +authors = [ + "Amy Gutierrez <58920810+amygutierrez@users.noreply.github.com>", + "Jon Clucas =1.2.0"] +build-backend = "poetry.core.masonry.api" diff --git a/src/cpac_regression_dashboard/__init__.py b/src/cpac_regression_dashboard/__init__.py new file mode 100644 index 0000000..75013dc --- /dev/null +++ b/src/cpac_regression_dashboard/__init__.py @@ -0,0 +1,3 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Create a dashboard of regression test results.""" diff --git a/src/cpac_regression_dashboard/_version.py b/src/cpac_regression_dashboard/_version.py new file mode 100644 index 0000000..f220d89 --- /dev/null +++ b/src/cpac_regression_dashboard/_version.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Get version from packaging metadata.""" +from importlib.metadata import PackageNotFoundError, version + +try: + __version__ = version("cpac_regression_dashboard") +except PackageNotFoundError: + __version__ = "unknown" diff --git a/src/cpac_regression_dashboard/build_d3_dashboard.py b/src/cpac_regression_dashboard/build_d3_dashboard.py new file mode 100644 index 0000000..0cc47c9 --- /dev/null +++ b/src/cpac_regression_dashboard/build_d3_dashboard.py @@ -0,0 +1,39 @@ +import os +from shutil import copy + +import click +from lxml import etree + + +@click.command() +@click.option("--json_file", required=True, help="JSON file from correlations") +@click.option("--branch", required=True, help="branch name") +def main(json_file=None, branch=None): + outdir = f"output/{branch}" + os.makedirs(outdir, exist_ok=True) + json_filename = os.path.basename(json_file) + copy(json_file, "/".join([outdir, json_filename])) + name = json_filename.replace(f"_{branch}.json", "") + with open("templates/heatmap.html", "r", encoding="utf-8") as _f: + body = etree.HTML(_f.read()) + script_element = etree.SubElement(body[0], "script") + script_element.set("defer", "defer") + script_element.set("src", "./heatmap.js") + with open("templates/heatmap.js", "r", encoding="utf-8") as _f: + with open(f"{outdir}/heatmap.js", "w", encoding="utf=8") as _s: + _s.write( + _f.read() + .replace("DATAFILE", json_filename) + .replace("GRAPHTITLE", branch) + .replace("GRAPHSUBTITLE", name) + ) + body = etree.tostring(body, encoding="unicode", method="html") + + with open(f"{outdir}/{name}.html", "w", encoding="utf-8") as _f: + _f.write(body) + + return body, name, branch + + +if __name__ == "__main__": + main() diff --git a/build_dashboard.py b/src/cpac_regression_dashboard/build_dashboard.py similarity index 61% rename from build_dashboard.py rename to src/cpac_regression_dashboard/build_dashboard.py index 82ee895..f00da15 100644 --- a/build_dashboard.py +++ b/src/cpac_regression_dashboard/build_dashboard.py @@ -1,24 +1,31 @@ -from utils.html_script import write_html, setup_browser - import os + import click +from .utils.html_script import setup_browser, write_html + + def process_option(ctx, param, value): if value is 
not None: - values = value.split(',') + values = value.split(",") return [val.strip() for val in values] + return [] -@click.command() -@click.option('--json_files', required=True, - callback=process_option, help='JSON files from correlations') -@click.option('--branch', required=True, help='branch name') +@click.command() +@click.option( + "--json_files", + required=True, + callback=process_option, + help="JSON files from correlations", +) +@click.option("--branch", required=True, help="branch name") def main(json_files=None, branch=None): - body = '' + body = "" data_source = [] for json in json_files: name = os.path.basename(json) - data = name.replace(f"_{branch}.json", '') + data = name.replace(f"_{branch}.json", "") data_source.append(data) with open(json) as user_file: file_contents = user_file.read() @@ -26,12 +33,13 @@ def main(json_files=None, branch=None): body = (body.rstrip()).rstrip(",") html_body = write_html(body) - file = open('html.html', 'w') + file = open("html.html", "w") file.write(html_body) file.close() setup_browser(html_body) return body, data_source, branch + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/cpac_regression_dashboard/calculate_correlations.py b/src/cpac_regression_dashboard/calculate_correlations.py new file mode 100644 index 0000000..ec21582 --- /dev/null +++ b/src/cpac_regression_dashboard/calculate_correlations.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +"""Calculate correlations and write them to D3-friendly file.""" +import json + +from cpac_correlations import cpac_correlations + +from .utils.html_script import body + + +def main() -> None: # noqa: D103 + """Gather correlation coefficients and write them to D3-readable JSON.""" + all_keys, data_source, branch = cpac_correlations() + html_body = body(all_keys, data_source) + with open(f"{data_source}_{branch}.json", "w", encoding="utf-8") as file: + file.write(json.dumps(json.loads(f"[{html_body.strip().strip(',')}]"))) + + +main.__doc__ = __doc__ + +if __name__ == "__main__": + main() diff --git a/src/cpac_regression_dashboard/create_yml.py b/src/cpac_regression_dashboard/create_yml.py new file mode 100644 index 0000000..b7eed2c --- /dev/null +++ b/src/cpac_regression_dashboard/create_yml.py @@ -0,0 +1,41 @@ +import os + +import click + +from .utils.parse_yaml import cpac_yaml + + +@click.command() +@click.option( + "--pipeline1", + required=True, + type=str, + help="Path to output directory from CPAC run " "to correlate against pipeline2", +) +@click.option( + "--pipeline2", + required=True, + type=str, + help="Path to output directory from CPAC run " "to correlate against pipeline1", +) +@click.option("--workspace", type=str, help="directory to save correlations") +@click.option("--branch", type=str, help="branch name") +@click.option("--data_source", type=str, help="Data site") +def main(pipeline1, pipeline2, workspace, branch, data_source) -> None: + """Correlate outputs from regression run again another C-PAC version.""" + os.path.normpath(os.path.dirname(os.path.abspath(__file__)) + os.sep + os.pardir) + run_name = f"{branch}_{data_source}" + + cpac_yaml( + pipeline1, + pipeline2, + f"{workspace}/correlations", + run_name, + 1, + branch, + data_source, + ) + + +if __name__ == "__main__": + main() diff --git a/src/cpac_regression_dashboard/generate_comment.py b/src/cpac_regression_dashboard/generate_comment.py new file mode 100644 index 0000000..86f4e60 --- /dev/null +++ b/src/cpac_regression_dashboard/generate_comment.py @@ -0,0 +1,298 @@ 
+#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""Gather generated PNGs and link to heatmap in a GitHub-flavored Markdown string.""" +import asyncio +from dataclasses import dataclass +from importlib.metadata import metadata +import os +from pathlib import Path +import subprocess +import sys +import tempfile +from typing import Generator, Optional + +from cairosvg import svg2png +from git import Repo +from git.exc import GitCommandError +from github import Github +from github.Repository import Repository +from playwright.async_api import async_playwright +import requests + +from ._version import __version__ + + +@dataclass +class EnvVars: + """Dataclass for environment variables.""" + + github_token: str + owner: str + repo: str + sha: str + testing_owner: str + + def __init__(self) -> None: + """Initialize the dataclass from the environment.""" + attrs = ["github_token", "owner", "repo", "sha", "testing_owner"] + for attr in attrs: + setattr(self, attr, os.environ.get(attr.upper(), "")) + + +_ENV = EnvVars() + + +@dataclass +class Heatmap: + """Heatmap dataclass.""" + + filename: str + content: str + + +def add_heatmap_to_branch(file: Heatmap) -> None: + """Add a heatmap to a branch. + + Parameters + ---------- + file : Heatmap + The heatmap file to add. + + Returns + ------- + None + """ + g = Github(_ENV.github_token) + repo = g.get_repo(f"{_ENV.testing_owner}/regtest-runlogs") + branch_name = f"{_ENV.repo}_{_ENV.sha}" + with tempfile.TemporaryDirectory() as _temp_dir: + temp_dir = Path(_temp_dir) + local_repo = Repo.clone_from( + repo.clone_url.replace( + "https://", f"https://${_ENV.github_token}:x-oauth-basic@" + ), + temp_dir, + branch=branch_name, + depth=1, + ) + # make sure branch is up to date + local_repo.remotes.origin.fetch("+refs/heads/*:refs/remotes/origin/*") + local_repo.remotes.origin.pull(branch_name) + svg_path = temp_dir / f"{file.filename}.svg" + png_path = temp_dir / f"{file.filename}.png" + with open(svg_path, "w") as _f: + _f.write(file.content) + svg2png(background_color="white", url=str(svg_path), write_to=str(png_path)) + local_repo.index.add([png_path]) + local_repo.index.commit(":loud_sound: Add heatmap image") + try: + local_repo.remotes.origin.push(branch_name) + except GitCommandError: + local_repo.remotes.origin.push(branch_name, force=True) + + +def gather_images(path: Path) -> Generator[Path, None, None]: + """Gather the images. + + Parameters + ---------- + path : Path + The path to the correlations directory.. + + Yields + ------ + image : Path + The path to an image. + """ + return path.glob("*.png") + + +def gather_text(path: Path) -> str: + """Gathers and concatenates all text files in the given directory. + + Parameters + ---------- + path : Path + The path to the correlations directory. + + Returns + ------- + str + The concatenated text. + """ + text = "|feature|coefficient|\n|---|---|\n" + for file in path.glob("*.txt"): + with open(file, "r", encoding="utf=8") as _f: + for line in _f.readlines(): + text += f"|{'|'.join(_.strip() for _ in line.split(':', 1))}|\n" + return text.strip() + + +async def generate_comment(path: Path) -> str: + """Generate the comment. + + Parameters + ---------- + path : Path + The path to the correlations directory. + + Returns + ------- + str : The comment. 
+ """ + project_urls = metadata(__package__).get_all("Project-URL", []) + source_url = None + for _url in project_urls: + if _url.startswith("Repository, "): + source_url = _url.split(",")[1].strip() + break + if source_url is None: + comment = f"Generated by {__name__} {__version__}\n\n" + else: + _packageless_name = __name__.replace(__package__, "").lstrip(".") + comment = ( + f"Generated by [{__package__}]({source_url})." + f"{_packageless_name} {__version__}\n\n" + ) + comment += await get_heatmap() + for image in gather_images(path): + raw_image_path = _raw_image_path(_ENV.testing_owner, _ENV.repo, _ENV.sha, image) + comment += f"![{image.stem}]({raw_image_path})\n" + return comment + gather_text(path) + + +async def get_heatmap() -> str: + """Get a heatmap image.""" + subprocess.run( + "playwright install chromium".split(" "), check=False + ) # update chromium + url = f"https://{_ENV.testing_owner}.github.io/dashboard/?data_sha={_ENV.sha}" + svg_string: Optional[str] = None + async with async_playwright() as p: + try: + browser = await p.chromium.launch(headless=True) + page = await browser.new_page() + await page.goto(url, wait_until="networkidle") + svg_string = await page.evaluate( + """() => { + let svg = document.querySelector('svg'); + return svg ? svg.outerHTML : null; +}""" + ) + except Exception as exception: + from warnings import warn + + warn( + f"{exception}\n\nAre playwright and chromium installed?", RuntimeWarning + ) + if svg_string: + _heatmap = Heatmap("heatmap", svg_string) + add_heatmap_to_branch(_heatmap) + heatmap = _raw_image_path( + _ENV.testing_owner, + _ENV.repo, + _ENV.sha, + Path(f"{_heatmap.filename}.png"), + ) + heatmap = f"[![heatmap]({heatmap})]({url})" + else: + heatmap = "" + + await browser.close() + return heatmap + + +def main() -> None: + """Generate and post a comment on a GitHub commit. + + Also post the comment to any open PR in which the commit is the most recent. + """ + if len(sys.argv) > 1: + if sys.argv[1] in ["-h", "--help"]: + print("Usage: cpac_regsuite_generate_comment [path]") + print("If no path is given, the current working directory is used.") + print("Required environment variables:") + print( + "GITHUB_TOKEN: A personal access token with scope to write to " + "comments and pull requests." 
+ ) + print("OWNER: The owner of the repository.") + print("PLAYWRIGHT_BROWSERS_PATH: The path for Playwright browsers.") + print("REPO: The name of the repository.") + print("SHA: The SHA of the commit.") + print("TESTING_OWNER: The owner of the testing repository.") + sys.exit(0) + elif sys.argv[1] in ["-v", "--version"]: + print(f"{__name__} version {__version__}") + sys.exit(0) + path = Path(sys.argv[1]) + else: + path = Path(os.getcwd()) + asyncio.run(post_comment(path)) + + +def repost_comment_on_pull_request( + repo: Repository, comment: str, pr: dict[str, str] +) -> None: + """Repost a commit comment on a PR containing that commit.""" + pr_number = pr["number"] + issue = repo.get_issue(number=pr_number) + issue.create_comment(comment) + + +def repost_comment_on_pull_requests(repo: Repository, comment: str) -> None: + """Repost a commit comment on all PR containing that commit.""" + pr_url: str = f"https://api.github.com/repos/{_ENV.owner}/{_ENV.repo}/commits/{_ENV.sha}/pulls" + headers: dict[str, str] = { + "Authorization": f"Bearer {_ENV.github_token}", + "Accept": "application/vnd.github.v3+json", + } + + response: requests.Response = requests.get(pr_url, headers=headers) + success_response = 200 + if response.status_code == success_response: + pull_requests: Optional[list[dict]] = response.json() + if pull_requests: + for pr in pull_requests: + repost_comment_on_pull_request(repo, comment, pr) + + +async def post_comment(path: Path) -> None: + """Post a comment on a GitHub commit and relevant PR.""" + personal_access_token = _ENV.github_token + g = Github(personal_access_token) + repo = g.get_repo(f"{_ENV.owner}/{_ENV.repo}") + commit = repo.get_commit(_ENV.sha) + comment = await generate_comment(path) + commit.create_comment(comment) + for pr in repo.get_pulls(state="open", sort="created"): + if pr.head.sha == _ENV.sha: + pr.create_issue_comment(comment) + + +def _raw_image_path(owner: str, repo: str, sha: str, image: Path) -> str: + """Generate the raw image path. + + Parameters + ---------- + owner : str + The owner of the repository. + + repo : str + The name of the repository. + + sha : str + The SHA of the commit. + + image : Path + The path to the image. + + Returns + ------- + str : The raw image path. + """ + return f"https://raw.githubusercontent.com/{owner}/regtest-runlogs/{repo}_{sha}/{image.name}" + + +if __name__ == "__main__": + main() diff --git a/src/cpac_regression_dashboard/templates/heatmap.html b/src/cpac_regression_dashboard/templates/heatmap.html new file mode 100644 index 0000000..2f5fc29 --- /dev/null +++ b/src/cpac_regression_dashboard/templates/heatmap.html @@ -0,0 +1,9 @@ + + + Correlations heatmap + + + +
Correlations heatmap will load here!
+ + diff --git a/src/cpac_regression_dashboard/templates/heatmap.js b/src/cpac_regression_dashboard/templates/heatmap.js new file mode 100644 index 0000000..c36bcd6 --- /dev/null +++ b/src/cpac_regression_dashboard/templates/heatmap.js @@ -0,0 +1,126 @@ +// set the dimensions and margins of the graph +var margin = {top: 80, right: 25, bottom: 30, left: 40}, + width = 800 - margin.left - margin.right, + height = 5000 - margin.top - margin.bottom; + +// append the svg object to the body of the page +var svg = d3.select("#heatmap-container") + .html(null) + .append("svg") + .attr("width", width + margin.left + margin.right) + .attr("height", height + margin.top + margin.bottom) + .append("g") + .attr("transform", + "translate(" + margin.left + "," + margin.top + ")"); + +//Read the data +datasource = d3.json("DATAFILE"); +datasource.then(function(data) { + + data.sort(function(a, b) { return d3.descending(a.rowid, b.rowid) }); + // Labels of row and columns -> unique identifier of the column called 'group' and 'variable' + var groupedData = d3.group(data, d => d.columnid); + var myGroups = Array.from(groupedData.keys()); + var myVars = Array.from(d3.group(data, d => d.rowid).keys()); + + // Build X scales and axis: + var x = d3.scaleBand() + .domain(myGroups) + .range([0, width]) + .padding(0.05); + + svg.append("g") + .style("font-size", 15) + .attr("transform", "translate(0,0)") + .call(d3.axisTop(x).tickSize(0)) + .select(".domain").remove(); + + // Build Y scales and axis: + var y = d3.scaleBand() + .domain(myVars) + .range([height, 0]) + .padding(0.05); + + svg.append("g") + .style("font-size", 15) + .attr("transform", "translate(" + width + ",0)") + .call(d3.axisLeft(y).tickSize(0)) + .select(".domain").remove(); + + // Build color scale + var myColor = d3.scaleSequential() + .interpolator(d3.interpolateRdYlGn) + .domain([0.8, 1]); + + // Create a tooltip + var tooltip = d3.select("#my_dataviz") + .append("div") + .style("opacity", 0) + .attr("class", "tooltip") + .style("background-color", "white") + .style("border", "solid") + .style("border-width", "2px") + .style("border-radius", "5px") + .style("padding", "5px"); + + // Three functions that change the tooltip when user hovers / moves / leaves a cell + var mouseover = function(d) { + tooltip + .style("opacity", 1); + d3.select(this) + .style("stroke", "black") + .style("opacity", 1); + }; + + var mousemove = function(d) { + tooltip + .html(d.rowid + ": " + d.value) + .style("left", (d3.pointer(this)[0] + 70) + "px") + .style("top", (d3.pointer(this)[1]) + "px"); + }; + + var mouseleave = function(d) { + tooltip + .style("opacity", 0); + d3.select(this) + .style("stroke", "none") + .style("opacity", 0.8); + }; + + // Add the squares + svg.selectAll() + .data(data, function(d) {return d.columnid + ':' + d.variable;}) + .enter() + .append("rect") + .attr("x", function(d) { return x(d.columnid) + (x.bandwidth() / 2); }) + .attr("y", function(d) { return y(d.rowid); }) + .attr("rx", 4) + .attr("ry", 4) + .attr("width", y.bandwidth()) + .attr("height", y.bandwidth()) + .style("fill", function(d) { return myColor(d.value); }) + .style("stroke-width", 0) + .style("stroke", "none") + .style("opacity", 0.8) + .on("mouseover", mouseover) + .on("mousemove", mousemove) + .on("mouseleave", mouseleave); +}); + +// Add title to graph +svg.append("text") + .attr("x", 0) + .attr("y", -50) + .attr("text-anchor", "left") + .style("font-size", "22px") + .text("GRAPHTITLE"); + +// Add subtitle to graph +svg.append("text") + .attr("x", 0) + .attr("y", 
-20) + .attr("text-anchor", "left") + .style("font-size", "14px") + .style("fill", "grey") + .style("max-width", 400) + .text("GRAPHSUBTITLE"); diff --git a/utils/html_script.py b/src/cpac_regression_dashboard/utils/html_script.py similarity index 85% rename from utils/html_script.py rename to src/cpac_regression_dashboard/utils/html_script.py index 7a23761..020c090 100644 --- a/utils/html_script.py +++ b/src/cpac_regression_dashboard/utils/html_script.py @@ -1,31 +1,30 @@ +import json -def dataset(name, data_source, value): - dataset = f""" + +def dataset(name: str, data_source: str, value: float | int | str) -> str: + return f""" {{ "rowid": "{name}", "columnid": "{data_source}", "value": "{value}" }}, """ - return dataset -def body(all_keys, data_source): - data_body = '' + +def body(all_keys: list[str], data_source: str) -> str: + data_body: str = "[" for key in all_keys: - name_value = key.split(': ') + name_value = key.split(": ") name = name_value[0] value = name_value[1] data_body += dataset(name, data_source, value) - out = f""" - {{"data": [ - {data_body} - ]}} - """ - return data_body + data_body = data_body.strip() + data_body += "]" + return json.dumps(json.loads(data_body)) + -def write_html(data_body): - script = \ - f""" +def write_html(data_body) -> str: + return f""" Correlations @@ -90,17 +89,14 @@ def write_html(data_body):
Correlations heatmap will load here!
- """ + """ # noqa: E501 - return(script) -def setup_browser(html_template): +def setup_browser(html_template) -> None: import tempfile import webbrowser - with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as temp_file: - temp_file.write(html_template.encode('utf-8')) - filename = 'file:///'+ temp_file.name + with tempfile.NamedTemporaryFile(suffix=".html", delete=False) as temp_file: + temp_file.write(html_template.encode("utf-8")) + filename = "file:///" + temp_file.name webbrowser.open_new_tab(filename) - - return \ No newline at end of file diff --git a/src/cpac_regression_dashboard/utils/parse_yaml.py b/src/cpac_regression_dashboard/utils/parse_yaml.py new file mode 100644 index 0000000..8d50688 --- /dev/null +++ b/src/cpac_regression_dashboard/utils/parse_yaml.py @@ -0,0 +1,117 @@ +"""From a pair of CPAC output directories, write a YAML file for regression.""" +import os +from pathlib import Path +from typing import cast, Optional + +import yaml + +_PIPELINE_DICT = dict[Optional[str], dict[str, Optional[int | str]]] +_FULL_YAML_DICT = dict[str, dict[str, bool | int | Optional[str]] | _PIPELINE_DICT] + + +def get_dir(paths: str) -> Optional[str]: + """Get the full path to a ``pipeline_*`` directory.""" + directory = paths + if directory: + for root, dirs, files in os.walk(paths): + for _dir in dirs: + if "pipeline_" in _dir: + directory = os.path.join(root, _dir) + return directory + + +def write_pipeline_yaml( + output_dir: Optional[str] = None, + working_dir: Optional[str] = None, + log_dir: Optional[str] = None, + pipeline_config: Optional[str] = None, + pipeline_name: Optional[str] = None, +) -> _PIPELINE_DICT: + """Collect paths and strings to write.""" + return { + pipeline_name: { + "output_dir": output_dir, + "work_dir": working_dir, + "log_dir": log_dir, + "pipe_config": pipeline_config, + "replacements": None, + } + } + + +def parse_yaml(directory: str, pipeline_name: str) -> _PIPELINE_DICT: + """Parse a CPAC output directory for pipeline information.""" + subdirs = ["log", "working", "output"] + paths: dict[str, Optional[str]] = {} + + for subdir in subdirs: + if os.path.isdir(os.path.join(directory, subdir)): + paths[f"{subdir}_dir"] = os.path.join(directory, subdir) + else: + paths[f"{subdir}_dir"] = None + assert isinstance(paths["log_dir"], str), f"log_dir: {paths['log_dir']}" + log_dir: Optional[str] = get_dir(paths["log_dir"]) + + if log_dir is not None: + for root, _dirs, files in os.walk(paths["log_dir"]): + for file in files: + if file.endswith("Z.yml"): + pipeline_config = os.path.join(root, file) + assert isinstance(paths["working_dir"], str), f"working_dir: {paths['working_dir']}" + working_dir = get_dir(paths["working_dir"]) + assert isinstance(paths["output_dir"], str), f"output_dir: {paths['output_dir']}" + output_dir = get_dir(paths["output_dir"]) + + return write_pipeline_yaml( + output_dir, working_dir, log_dir, pipeline_config, pipeline_name + ) + + +def write_yaml( + pipeline_1: _PIPELINE_DICT, + pipeline_2: _PIPELINE_DICT, + correlations_dir: str, + run_name: str, + n_cpus: int = 1, +) -> _FULL_YAML_DICT: + """Combine settings and both pipelines into a single dictionary.""" + yaml_dict: _FULL_YAML_DICT = {} + yaml_dict["settings"] = cast( + dict[str, bool | int | Optional[str]] | _PIPELINE_DICT, + { + "n_cpus": int(n_cpus), + "correlations_dir": correlations_dir, + "run_name": run_name, + "s3_creds": None, + "quick": False, + "verbose": False, + }, + ) + + yaml_dict["pipelines"] = {**pipeline_1, **pipeline_2} + + return yaml_dict + + +def 
cpac_yaml( + pipeline1: str, + pipeline2: str, + correlations_dir: str, + run_name: str, + n_cpus: int, + branch: str, + data_source: str, +) -> Path: + """Write a YAML file for the regression run.""" + pipeline_1: _PIPELINE_DICT = parse_yaml(pipeline1, "pipeline_1") + pipeline_2: _PIPELINE_DICT = parse_yaml(pipeline2, "pipeline_2") + + yaml_contents: _FULL_YAML_DICT = write_yaml( + pipeline_1, pipeline_2, correlations_dir, run_name, n_cpus + ) + + yaml_path: Path = Path(f"{branch}_{data_source}.yml") + """Path to YAML file for regression correlation.""" + with yaml_path.open("w") as file: + yaml.dump(yaml_contents, file, default_flow_style=False, sort_keys=False) + return yaml_path diff --git a/utils/__pycache__/parse_yaml.cpython-311.pyc b/utils/__pycache__/parse_yaml.cpython-311.pyc deleted file mode 100644 index 206699b..0000000 Binary files a/utils/__pycache__/parse_yaml.cpython-311.pyc and /dev/null differ diff --git a/utils/parse_yaml.py b/utils/parse_yaml.py deleted file mode 100644 index f288dee..0000000 --- a/utils/parse_yaml.py +++ /dev/null @@ -1,85 +0,0 @@ -import os -import yaml - -def get_dir(paths): - if not paths: - directory = None - else: - for root, dirs, files in os.walk(paths): - for dir in dirs: - if 'pipeline_' in dir: - directory = os.path.join(root, dir) - return directory - -def write_pipeline_yaml(output_dir=None, working_dir=None, log_dir=None, \ - pipeline_config=None, pipeline_name=None): - - pipeline = { - pipeline_name: { - "output_dir": output_dir, - "work_dir": working_dir, - "log_dir": log_dir, - "pipe_config": pipeline_config, - "replacements": None - } - } - - return pipeline - -def parse_yaml(directory=None, pipeline_name=None): - subdirs = ['log', 'working', 'output'] - paths = {} - - for subdir in subdirs: - if os.path.isdir(os.path.join(directory, subdir)): - paths[f"{subdir}_dir"] = (os.path.join(directory, subdir)) - else: - paths[f"{subdir}_dir"] = None - - log_dir = get_dir(paths['log_dir']) - - for root, dirs, files in os.walk(paths['log_dir']): - for file in files: - if file.endswith("Z.yml"): - pipeline_config = os.path.join(root, file) - - working_dir = get_dir(paths['working_dir']) - output_dir = get_dir(paths['output_dir']) - - pipeline_dict = write_pipeline_yaml(output_dir, working_dir, log_dir, \ - pipeline_config, pipeline_name) - - return pipeline_dict - -def write_yaml(pipeline_1=None, pipeline_2=None, correlations_dir=None, \ - run_name=None, n_cpus=None): - - yaml_dict = {} - yaml_dict["settings"] = { - "n_cpus": n_cpus, - "correlations_dir": correlations_dir, - "run_name": run_name, - "s3_creds": None, - "quick": False, - "verbose": False - } - - yaml_dict["pipelines"] = { - **pipeline_1, - **pipeline_2 - } - - return yaml_dict - -def cpac_yaml(pipeline1, pipeline2, correlations_dir, run_name, n_cpus, branch, data_source): - - pipeline_1 = parse_yaml(pipeline1, 'pipeline_1') - pipeline_2 = parse_yaml(pipeline2, 'pipeline_2') - - yaml_contents = write_yaml(pipeline_1, pipeline_2, correlations_dir, - run_name, n_cpus) - - with open(f'{branch}_{data_source}.yml', 'w') as file: - yaml.dump(yaml_contents, file, default_flow_style=False, sort_keys=False) - - return \ No newline at end of file
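
As a usage sketch (not part of the diff above), the following shows the shape of the regression-run YAML that utils/parse_yaml.py assembles, assuming the package is installed as cpac_regression_dashboard. The directory and config paths are hypothetical placeholders; in a real run, parse_yaml() discovers the pipeline_* subdirectories and the *Z.yml pipeline config inside each CPAC output tree, and cpac_yaml() dumps the result to f"{branch}_{data_source}.yml".

    from cpac_regression_dashboard.utils.parse_yaml import write_pipeline_yaml, write_yaml

    # Hypothetical paths for illustration only; parse_yaml() normally discovers these.
    pipeline_1 = write_pipeline_yaml(
        output_dir="/runs/old/output/pipeline_default",
        working_dir="/runs/old/working/pipeline_default",
        log_dir="/runs/old/log/pipeline_default",
        pipeline_config="/runs/old/log/pipeline_default/pipeline_config_2024-01-01T00-00-00Z.yml",
        pipeline_name="pipeline_1",
    )
    pipeline_2 = write_pipeline_yaml(
        output_dir="/runs/new/output/pipeline_default",
        working_dir="/runs/new/working/pipeline_default",
        log_dir="/runs/new/log/pipeline_default",
        pipeline_config="/runs/new/log/pipeline_default/pipeline_config_2024-02-01T00-00-00Z.yml",
        pipeline_name="pipeline_2",
    )

    # write_yaml() nests both pipelines under "pipelines" and the run settings
    # (n_cpus, correlations_dir, run_name, s3_creds, quick, verbose) under "settings".
    config = write_yaml(
        pipeline_1, pipeline_2, "/workspace/correlations", "my-branch_Site-A", n_cpus=1
    )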
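
A second sketch, also not part of the diff: the data file that templates/heatmap.js loads via d3.json("DATAFILE") is a flat JSON array of records with the three fields emitted by utils/html_script.dataset(). The field names come from the code above; the feature names, site label, and coefficient values below are made up for illustration. calculate_correlations.main() writes this file as f"{data_source}_{branch}.json", and heatmap.js colors each cell on a sequential RdYlGn scale over the domain [0.8, 1].

    import json

    # Field names match dataset() in utils/html_script.py and the d.rowid /
    # d.columnid / d.value accessors in templates/heatmap.js; the feature names,
    # site label, and coefficient values are illustrative only.
    records = [
        {"rowid": "alff", "columnid": "Site-A", "value": "0.998"},
        {"rowid": "reho", "columnid": "Site-A", "value": "0.981"},
    ]

    # A file named like f"{data_source}_{branch}.json" is what build_d3_dashboard.py
    # copies next to heatmap.js and substitutes for the DATAFILE placeholder.
    with open("Site-A_my-branch.json", "w", encoding="utf-8") as f:
        json.dump(records, f)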