From 21ca8ed48123ed5d59b3841de61122f8eab47ee5 Mon Sep 17 00:00:00 2001 From: sfmig <33267254+sfmig@users.noreply.github.com> Date: Thu, 5 Oct 2023 18:25:26 +0100 Subject: [PATCH] change script to receive json as required CLI argument --- .../cellfinder/cellfinder_main.py | 146 ++++------ .../test_cellfinder_workflow.py | 249 +++++++++++------- 2 files changed, 214 insertions(+), 181 deletions(-) diff --git a/brainglobe_workflows/cellfinder/cellfinder_main.py b/brainglobe_workflows/cellfinder/cellfinder_main.py index c411ac38..ef90e4e3 100644 --- a/brainglobe_workflows/cellfinder/cellfinder_main.py +++ b/brainglobe_workflows/cellfinder/cellfinder_main.py @@ -8,10 +8,12 @@ """ +import argparse import datetime import json import logging import os +import sys from dataclasses import dataclass from pathlib import Path from typing import Optional, Tuple, Union @@ -26,67 +28,13 @@ # logger # if imported as a module, the logger is named after the module -logger = logging.getLogger(__name__) - -# Default config -CELLFINDER_CACHE_DIR = Path.home() / ".cellfinder_workflows" - - -def make_default_config_dict(cellfinder_cache_dir): - """Generate a config dictionary with the required parameters - for the workflow - The input data is fetched from GIN and downloaded to - the location provided by cellfinder_cache_dir. The results are - also saved in a timestamped output subdirectory under cellfinder_cache_dir - - Parameters - ---------- - cellfinder_cache_dir : _type_ - _description_ - - Returns - ------- - dict - dictionary with the required parameters for the workflow - """ - return { - "install_path": cellfinder_cache_dir, - "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip", - "data_hash": ( - "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" - ), - "local_path": cellfinder_cache_dir / "cellfinder_test_data", - "signal_parent_dir": str( - cellfinder_cache_dir / "cellfinder_test_data" / "signal" - ), - "background_parent_dir": str( - cellfinder_cache_dir / "cellfinder_test_data" / "background" - ), - "output_path_basename": cellfinder_cache_dir / "cellfinder_output_", - "detected_cells_filename": "detected_cells.xml", - "voxel_sizes": [5, 2, 2], # microns - "start_plane": 0, - "end_plane": -1, - "trained_model": None, # if None, it will use a default model - "model_weights": None, - "model": "resnet50_tv", - "batch_size": 32, - "n_free_cpus": 2, - "network_voxel_sizes": [5, 1, 1], - "soma_diameter": 16, - "ball_xy_size": 6, - "ball_z_size": 15, - "ball_overlap_fraction": 0.6, - "log_sigma_size": 0.2, - "n_sds_above_mean_thresh": 10, - "soma_spread_factor": 1.4, - "max_cluster_size": 100000, - "cube_width": 50, - "cube_height": 50, - "cube_depth": 20, - "network_depth": "50", - } +console_handler = logging.StreamHandler(sys.stdout) +console_format = logging.Formatter("%(name)s %(levelname)s: %(message)s") +console_handler.setFormatter(console_format) +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) +logger.addHandler(console_handler) @dataclass @@ -140,11 +88,6 @@ class CellfinderConfig: output_path: Optional[Pathlike] = None -def example_cellfinder_script(): - cfg = setup_workflow() - run_workflow_from_cellfinder_run(cfg) - - def run_workflow_from_cellfinder_run(cfg): """ Run workflow based on the cellfinder_core.main.main() @@ -180,7 +123,7 @@ def run_workflow_from_cellfinder_run(cfg): save_cells(detected_cells, cfg.output_path / cfg.detected_cells_filename) -def setup_workflow(cellfinder_cache_dir=CELLFINDER_CACHE_DIR): +def setup_workflow(input_config_path): """Prepare configuration to run workflow This includes @@ -190,9 +133,11 @@ def setup_workflow(cellfinder_cache_dir=CELLFINDER_CACHE_DIR): - creating a timestamped directory for the output of the workflow if it doesn't exist and adding it to the config - To instantiate the config dictionary, we first check if an environment - variable "CELLFINDER_CONFIG_PATH" pointing to a config json file exists. - If not, the default config (DEFAULT_CONFIG_DICT) is used. + + Parameters + ---------- + input_config_path : Path + _description_ Returns ------- @@ -202,28 +147,16 @@ def setup_workflow(cellfinder_cache_dir=CELLFINDER_CACHE_DIR): """ # Define config - # if environment variable defined, that prevails - if "CELLFINDER_CONFIG_PATH" in os.environ.keys(): - input_config_path = Path(os.environ["CELLFINDER_CONFIG_PATH"]) - assert input_config_path.exists() + assert input_config_path.exists() - # read config into dict - # (assumes config is json serializable) - with open(input_config_path) as cfg: - config_dict = json.load(cfg) + # read config into dict + # (assumes config is json serializable) + with open(input_config_path) as cfg: + config_dict = json.load(cfg) - config = CellfinderConfig(**config_dict) + config = CellfinderConfig(**config_dict) - logger.info( - "Configuration retrieved from " - f'{os.environ["CELLFINDER_CONFIG_PATH"]}' - ) - # else use the default config, with the cellfinder cache directory provided - else: - config = CellfinderConfig( - **make_default_config_dict(cellfinder_cache_dir) - ) - logger.info("Using default configuration") + logger.info(f"Input config read from {input_config_path}") # Retrieve and add lists of input data to config if neither are defined if not (config.list_signal_files and config.list_signal_files): @@ -339,5 +272,40 @@ def retrieve_input_data(config): return config +def parse_cli_arguments(): + # initialise argument parser + parser = argparse.ArgumentParser( + description=( + "To launch the workflow with " + "a desired set of input parameters, run:" + " `python cellfinder_main path/to/input/config.json` " + " where path/to/input/config.json is the json file " + "containing the workflow parameters." + ) + ) + # add required arguments + # add --config? + parser.add_argument( + "input_config_path", # required=True, + type=str, + metavar="INPUT_CONFIG_PATH", # a name for usage messages + help="", + ) + # build parser object + args = parser.parse_args() + + # error if required arguments not provided + if not args.input_config_path: + logger.error("Paths to input config not provided.") + parser.print_help() + + return args + + if __name__ == "__main__": - example_cellfinder_script() + args = parse_cli_arguments() + + cfg = setup_workflow( + Path(args.input_config_path) + ) # this won't be benchmarked + run_workflow_from_cellfinder_run(cfg) # this will be benchmarked diff --git a/tests/test_integration/test_cellfinder_workflow.py b/tests/test_integration/test_cellfinder_workflow.py index 48c8f0b6..d0096cf3 100644 --- a/tests/test_integration/test_cellfinder_workflow.py +++ b/tests/test_integration/test_cellfinder_workflow.py @@ -1,34 +1,126 @@ import json -import logging -import os +import subprocess +import sys from pathlib import Path import pytest from brainglobe_workflows.cellfinder.cellfinder_main import ( - make_default_config_dict, - run_workflow_from_cellfinder_run, - setup_workflow, + CellfinderConfig, ) -@pytest.fixture(autouse=True, scope="session") -def logger_str(): - logging.root.setLevel(logging.DEBUG) - logger_str = "brainglobe_workflows.cellfinder.cellfinder_main" - return logger_str +# config factory +def make_config_dict_fetch_from_GIN(cellfinder_cache_dir): + """Generate a config dictionary with the required parameters + for the workflow + + The input data is fetched from GIN and downloaded to + the location provided by cellfinder_cache_dir. The results are + also saved in a timestamped output subdirectory under cellfinder_cache_dir + + Parameters + ---------- + cellfinder_cache_dir : _type_ + _description_ + + Returns + ------- + dict + dictionary with the required parameters for the workflow + """ + return { + "install_path": cellfinder_cache_dir, + "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip", + "data_hash": ( + "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914" + ), + "local_path": cellfinder_cache_dir / "cellfinder_test_data", + "signal_parent_dir": str( + cellfinder_cache_dir / "cellfinder_test_data" / "signal" + ), + "background_parent_dir": str( + cellfinder_cache_dir / "cellfinder_test_data" / "background" + ), + "output_path_basename": cellfinder_cache_dir / "cellfinder_output_", + "detected_cells_filename": "detected_cells.xml", + "voxel_sizes": [5, 2, 2], # microns + "start_plane": 0, + "end_plane": -1, + "trained_model": None, # if None, it will use a default model + "model_weights": None, + "model": "resnet50_tv", + "batch_size": 32, + "n_free_cpus": 2, + "network_voxel_sizes": [5, 1, 1], + "soma_diameter": 16, + "ball_xy_size": 6, + "ball_z_size": 15, + "ball_overlap_fraction": 0.6, + "log_sigma_size": 0.2, + "n_sds_above_mean_thresh": 10, + "soma_spread_factor": 1.4, + "max_cluster_size": 100000, + "cube_width": 50, + "cube_height": 50, + "cube_depth": 20, + "network_depth": "50", + } + + +@pytest.fixture(autouse=True) +def cellfinder_cache_dir(tmp_path): + """Create a .cellfinder_workflows directory + under a temporary pytest directory. + It uses pytest's tmp_path fixture so that all is cleared after the test. + A new temporary directory is created every function call (scope="function" + by default) -@pytest.fixture(autouse=True, scope="function") -def cellfinder_cache_dir(tmp_path): - # use pytest's tmp_path fixture so that all is cleared after the test - # a new temporary directory is created every function call - return Path(tmp_path) / ".cellfinder_benchmarks" + Parameters + ---------- + tmp_path : _type_ + _description_ + + Returns + ------- + Path + path to the created cache directory + """ + + return Path(tmp_path) / ".cellfinder_workflows" + + +# @pytest.fixture() +# def config_fetch_from_local(): +# pass @pytest.fixture() -def config_from_env_var(tmp_path, cellfinder_cache_dir): - # dump config from fixture into a json file +def path_to_config_fetch_GIN(tmp_path, cellfinder_cache_dir): + """Create an input config that fetches data from GIN and + return its path + + Parameters + ---------- + tmp_path : _type_ + _description_ + cellfinder_cache_dir : _type_ + _description_ + + Returns + ------- + _type_ + _description_ + """ + # create config dict + config_dict = make_config_dict_fetch_from_GIN(cellfinder_cache_dir) + + # create a temp json file to dump config data + input_config_path = ( + tmp_path / "input_config.json" + ) # save it in a temp dir separate from cellfinder_cache_dir + # ensure all Paths are JSON serializable def prep_json(obj): if isinstance(obj, Path): @@ -36,89 +128,62 @@ def prep_json(obj): else: return json.JSONEncoder.default(obj) - # create a temp json file to dump config data - input_config_path = tmp_path / "input_config.json" - - # alter config if required by the test - # - missing signal directory - # - missing background directory - config_dict = make_default_config_dict(cellfinder_cache_dir) - - # dump config into json + # save config data to json file with open(input_config_path, "w") as js: json.dump(config_dict, js, default=prep_json) - # define environment variable pointing to this json file - os.environ["CELLFINDER_CONFIG_PATH"] = str(input_config_path) - - yield os.environ["CELLFINDER_CONFIG_PATH"] - # teardown for this fixture - del os.environ["CELLFINDER_CONFIG_PATH"] - - -def test_run_with_env_var_config(config_from_env_var, caplog, logger_str): - # check environment variable exists - assert "CELLFINDER_CONFIG_PATH" in os.environ.keys() + # check json file exists + assert Path(input_config_path).is_file() + + return input_config_path + + +def test_run_with_config_from_GIN( + path_to_config_fetch_GIN, +): + # run workflow with CLI and capture log + subprocess_output = subprocess.run( + [ + sys.executable, + Path(__file__).resolve().parents[2] + / "brainglobe_workflows" + / "cellfinder" + / "cellfinder_main.py", + str(path_to_config_fetch_GIN), + ], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + ) - # run setup and workflow - with caplog.at_level(logging.DEBUG, logger=logger_str): - cfg = setup_workflow() - run_workflow_from_cellfinder_run(cfg) + # check returncode + assert subprocess_output.returncode == 0 - # check log + # check logs assert ( - "Configuration retrieved from " - f'{os.environ["CELLFINDER_CONFIG_PATH"]}' in caplog.messages + f"Input config read from {str(path_to_config_fetch_GIN)}" + in subprocess_output.stdout ) assert ( - "Fetching input data from the " - "provided GIN repository" in caplog.messages + "Fetching input data from the provided GIN repository" + in subprocess_output.stdout ) - # check output directory and output file exist - assert Path(cfg.output_path).exists() - assert (Path(cfg.output_path) / cfg.detected_cells_filename).is_file() - - -def test_run_with_default_config(cellfinder_cache_dir, caplog, logger_str): - # check environment var is not defined - assert "CELLFINDER_CONFIG_PATH" not in os.environ.keys() - - # run setup and workflow - with caplog.at_level(logging.DEBUG, logger=logger_str): - cfg = setup_workflow(cellfinder_cache_dir) - run_workflow_from_cellfinder_run(cfg) - - # check log - assert "Using default configuration" in caplog.messages + # check output directory + # load config used + with open(path_to_config_fetch_GIN) as cfg: + config_dict = json.load(cfg) + config = CellfinderConfig(**config_dict) + + # check one output directory exists and has expected output file inside it + output_path_timestamped = [ + x + for x in Path(config.output_path_basename).parent.glob("*") + if x.is_dir() + and x.name.startswith(Path(config.output_path_basename).name) + ] + assert len(output_path_timestamped) == 1 + assert (output_path_timestamped[0]).exists() assert ( - "Fetching input data from the " - "provided GIN repository" in caplog.messages - ) - - # check output directory and output file exist - assert Path(cfg.output_path).exists() - assert (Path(cfg.output_path) / cfg.detected_cells_filename).is_file() - - -# test running on local data? -# def test_run_with_default_config_local( -# caplog, logger_str -# ): -# # check environment var is not defined -# assert "CELLFINDER_CONFIG_PATH" not in os.environ.keys() - -# # run setup and workflow -# # do not pass a cellfinder_cache_dir location to use default -# with caplog.at_level( -# logging.DEBUG, logger=logger_str -# ): -# cfg = setup_workflow() -# run_workflow_from_cellfinder_run(cfg) - -# # check log -# assert "Using default configuration" in caplog.messages -# assert ( -# "Fetching input data from the " -# "local directories" in caplog.messages -# ) + output_path_timestamped[0] / config.detected_cells_filename + ).is_file()