From 432f1eee58a34e6bc14ff2548c63922ca74aea7c Mon Sep 17 00:00:00 2001
From: sfmig <33267254+sfmig@users.noreply.github.com>
Date: Fri, 6 Oct 2023 17:00:09 +0100
Subject: [PATCH] refactor pytest fixtures to conftest.py. add adocstrings.

---
 tests/test_integration/conftest.py            | 194 ++++++++++++
 .../test_cellfinder_workflow.py               | 287 +++++++-----------
 2 files changed, 297 insertions(+), 184 deletions(-)
 create mode 100644 tests/test_integration/conftest.py

diff --git a/tests/test_integration/conftest.py b/tests/test_integration/conftest.py
new file mode 100644
index 00000000..8ef61168
--- /dev/null
+++ b/tests/test_integration/conftest.py
@@ -0,0 +1,194 @@
+import json
+from pathlib import Path
+
+import pooch
+import pytest
+
+from brainglobe_workflows.cellfinder.cellfinder_main import (
+    CellfinderConfig,
+)
+
+
+def make_config_dict_fetch_from_GIN(cellfinder_cache_dir):
+    """Generate a config dictionary with the required parameters
+    for the workflow
+
+    The input data is fetched from GIN and downloaded to cellfinder_cache_dir.
+    The results are also saved in a timestamped output subdirectory under
+    cellfinder_cache_dir
+
+    Parameters
+    ----------
+    cellfinder_cache_dir : Path
+        Path to the directory where the downloaded input data will be unzipped,
+        and the output will be saved
+
+    Returns
+    -------
+    dict
+        dictionary with the required parameters for the workflow
+    """
+    return {
+        "install_path": cellfinder_cache_dir,
+        "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip",
+        "data_hash": (
+            "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914"
+        ),
+        "extract_relative_dir": "cellfinder_test_data",  # relative path
+        "signal_parent_dir": str(
+            cellfinder_cache_dir / "cellfinder_test_data" / "signal"
+        ),
+        "background_parent_dir": str(
+            cellfinder_cache_dir / "cellfinder_test_data" / "background"
+        ),
+        "output_path_basename": cellfinder_cache_dir / "cellfinder_output_",
+        "detected_cells_filename": "detected_cells.xml",
+        "voxel_sizes": [5, 2, 2],  # microns
+        "start_plane": 0,
+        "end_plane": -1,
+        "trained_model": None,  # if None, it will use a default model
+        "model_weights": None,
+        "model": "resnet50_tv",
+        "batch_size": 32,
+        "n_free_cpus": 2,
+        "network_voxel_sizes": [5, 1, 1],
+        "soma_diameter": 16,
+        "ball_xy_size": 6,
+        "ball_z_size": 15,
+        "ball_overlap_fraction": 0.6,
+        "log_sigma_size": 0.2,
+        "n_sds_above_mean_thresh": 10,
+        "soma_spread_factor": 1.4,
+        "max_cluster_size": 100000,
+        "cube_width": 50,
+        "cube_height": 50,
+        "cube_depth": 20,
+        "network_depth": "50",
+    }
+
+
+def prep_json(obj):
+    """
+    Returns a JSON encodable version of the object.
+
+    It uses the JSON default encoder for all objects
+    except those of type Path.
+
+    Parameters
+    ----------
+    obj : _type_
+        _description_
+
+    Returns
+    -------
+    _type_
+        JSON serializable version of input object
+    """
+    if isinstance(obj, Path):
+        return str(obj)
+    else:
+        return json.JSONEncoder.default(obj)
+
+
+@pytest.fixture(autouse=True)
+def cellfinder_cache_dir(tmp_path):
+    """Create a .cellfinder_workflows directory
+    under a temporary pytest directory and return
+    its path.
+
+    The temporary directory is available via pytest's tmp_path
+    fixture. A new temporary directory is created every function call
+
+    Parameters
+    ----------
+    tmp_path : Path
+        path to pytest-generated temporary directory
+
+    Returns
+    -------
+    Path
+        path to the created cellfinder_workflows cache directory
+    """
+
+    return Path(tmp_path) / ".cellfinder_workflows"
+
+
+@pytest.fixture()
+def path_to_config_fetch_GIN(tmp_path, cellfinder_cache_dir):
+    """Create an input config that fetches data from GIN and
+    return its path
+
+    Parameters
+    ----------
+    tmp_path : Path
+        path to a fresh pytest-generated temporary directory. The
+        generated config is saved here.
+
+    cellfinder_cache_dir : Path
+        path to the cellfinder cache directory, where the paths
+        in the config should point to.
+
+    Returns
+    -------
+    input_config_path : Path
+        path to config file that fetches data from GIN
+    """
+    # create config dict
+    config_dict = make_config_dict_fetch_from_GIN(cellfinder_cache_dir)
+
+    # create a temp json file to dump config data
+    input_config_path = (
+        tmp_path / "input_config.json"
+    )  # save it in a temp dir separate from cellfinder_cache_dir
+
+    # save config data to json file
+    with open(input_config_path, "w") as js:
+        json.dump(config_dict, js, default=prep_json)
+
+    # check json file exists
+    assert Path(input_config_path).is_file()
+
+    return input_config_path
+
+
+@pytest.fixture()
+def path_to_config_fetch_local(path_to_config_fetch_GIN):
+    """Create an input config that points to local data and
+    return its path.
+
+    The process is analogous to creating a config that
+    fetches data from GIN, except that here we download the data
+    from GIN prior to running the workflow.
+
+    Parameters
+    ----------
+    path_to_config_fetch_GIN : Path
+        path to a config file that fetches data from GIN
+
+    Returns
+    -------
+    path_to_config_fetch_GIN : Path
+        path to a config file that fetches data from GIN
+    """
+    # create config that fetches data from GIN
+    # path_to_config_fetch_GIN
+
+    # read into config class
+    with open(path_to_config_fetch_GIN) as cfg:
+        config_dict = json.load(cfg)
+    config = CellfinderConfig(**config_dict)
+
+    # Download GIN data
+    pooch.retrieve(
+        url=config.data_url,
+        known_hash=config.data_hash,
+        path=config.install_path,  # path to download zip to
+        progressbar=True,
+        processor=pooch.Unzip(
+            extract_dir=config.extract_relative_dir
+            # path to unzipped dir, *relative*  to 'path'
+        ),
+    )
+
+    # return path to config json
+    return path_to_config_fetch_GIN
diff --git a/tests/test_integration/test_cellfinder_workflow.py b/tests/test_integration/test_cellfinder_workflow.py
index c6168c4b..d82293a4 100644
--- a/tests/test_integration/test_cellfinder_workflow.py
+++ b/tests/test_integration/test_cellfinder_workflow.py
@@ -3,196 +3,33 @@
 import sys
 from pathlib import Path
 
-import pooch
-import pytest
-
 from brainglobe_workflows.cellfinder.cellfinder_main import (
     DEFAULT_JSON_CONFIG_PATH,
     CellfinderConfig,
 )
 
 
-## Utils
-def make_config_dict_fetch_from_GIN(cellfinder_cache_dir):
-    """Generate a config dictionary with the required parameters
-    for the workflow
-
-    The input data is fetched from GIN and downloaded to
-    the location provided by cellfinder_cache_dir. The results are
-    also saved in a timestamped output subdirectory under cellfinder_cache_dir
-
-    Parameters
-    ----------
-    cellfinder_cache_dir : _type_
-        _description_
-
-    Returns
-    -------
-    dict
-        dictionary with the required parameters for the workflow
-    """
-    return {
-        "install_path": cellfinder_cache_dir,
-        "data_url": "https://gin.g-node.org/BrainGlobe/test-data/raw/master/cellfinder/cellfinder-test-data.zip",
-        "data_hash": (
-            "b0ef53b1530e4fa3128fcc0a752d0751909eab129d701f384fc0ea5f138c5914"
-        ),
-        "extract_relative_dir": "cellfinder_test_data",  # relative path
-        "signal_parent_dir": str(
-            cellfinder_cache_dir / "cellfinder_test_data" / "signal"
-        ),
-        "background_parent_dir": str(
-            cellfinder_cache_dir / "cellfinder_test_data" / "background"
-        ),
-        "output_path_basename": cellfinder_cache_dir / "cellfinder_output_",
-        "detected_cells_filename": "detected_cells.xml",
-        "voxel_sizes": [5, 2, 2],  # microns
-        "start_plane": 0,
-        "end_plane": -1,
-        "trained_model": None,  # if None, it will use a default model
-        "model_weights": None,
-        "model": "resnet50_tv",
-        "batch_size": 32,
-        "n_free_cpus": 2,
-        "network_voxel_sizes": [5, 1, 1],
-        "soma_diameter": 16,
-        "ball_xy_size": 6,
-        "ball_z_size": 15,
-        "ball_overlap_fraction": 0.6,
-        "log_sigma_size": 0.2,
-        "n_sds_above_mean_thresh": 10,
-        "soma_spread_factor": 1.4,
-        "max_cluster_size": 100000,
-        "cube_width": 50,
-        "cube_height": 50,
-        "cube_depth": 20,
-        "network_depth": "50",
-    }
-
-
-# ensure all Paths are JSON serializable
-def prep_json(obj):
-    if isinstance(obj, Path):
-        return str(obj)
-    else:
-        return json.JSONEncoder.default(obj)
-
-
-def assert_outputs(path_to_config, parent_dir=""):
-    # load input config
-    # ATT! config.output_path is only defined after the workflow
-    # setup, because its name is timestamped
-    with open(path_to_config) as cfg:
-        config_dict = json.load(cfg)
-    config = CellfinderConfig(**config_dict)
-
-    # check one output directory exists and
-    # it has expected output file inside it
-    output_path_timestamped = [
-        x
-        for x in (Path(parent_dir) / config.output_path_basename).parent.glob(
-            "*"
-        )
-        if x.is_dir()
-        and x.name.startswith(Path(config.output_path_basename).name)
-    ]
-
-    assert len(output_path_timestamped) == 1
-    assert (output_path_timestamped[0]).exists()
-    assert (
-        output_path_timestamped[0] / config.detected_cells_filename
-    ).is_file()
-
-
-### Fixtures
-@pytest.fixture(autouse=True)
-def cellfinder_cache_dir(tmp_path):
-    """Create a .cellfinder_workflows directory
-    under a temporary pytest directory.
-
-    It uses pytest's tmp_path fixture so that all is cleared after the test.
-    A new temporary directory is created every function call (scope="function"
-    by default)
-
-    Parameters
-    ----------
-    tmp_path : _type_
-        _description_
-
-    Returns
-    -------
-    Path
-        path to the created cache directory
-    """
-
-    return Path(tmp_path) / ".cellfinder_workflows"
+def test_run_with_default_config(tmp_path):
+    """Test workflow run with no command line arguments
 
+    If no command line arguments are provided, the default
+    config at brainglobe_workflows/cellfinder/default_config.json
+    should be used.
 
-@pytest.fixture()
-def path_to_config_fetch_GIN(tmp_path, cellfinder_cache_dir):
-    """Create an input config that fetches data from GIN and
-    return its path
+    After the workflow is run we check that:
+    - there are no errors (via returncode),
+    - the logs reflect the default config file was used, and
+    - a single output directory exists with the expected
+      output file inside it
 
     Parameters
     ----------
-    tmp_path : _type_
-        _description_
-    cellfinder_cache_dir : _type_
-        _description_
-
-    Returns
-    -------
-    _type_
-        _description_
+    tmp_path : Path
+        path to a pytest-generated temporary directory.
     """
-    # create config dict
-    config_dict = make_config_dict_fetch_from_GIN(cellfinder_cache_dir)
-
-    # create a temp json file to dump config data
-    input_config_path = (
-        tmp_path / "input_config.json"
-    )  # save it in a temp dir separate from cellfinder_cache_dir
-
-    # save config data to json file
-    with open(input_config_path, "w") as js:
-        json.dump(config_dict, js, default=prep_json)
-
-    # check json file exists
-    assert Path(input_config_path).is_file()
-
-    return input_config_path
 
-
-@pytest.fixture()
-def path_to_config_fetch_local(path_to_config_fetch_GIN):
-    # create config that fetches data from GIN
-    # path_to_config_fetch_GIN
-
-    # read into config class
-    with open(path_to_config_fetch_GIN) as cfg:
-        config_dict = json.load(cfg)
-    config = CellfinderConfig(**config_dict)
-
-    # Download GIN data
-    pooch.retrieve(
-        url=config.data_url,
-        known_hash=config.data_hash,
-        path=config.install_path,  # path to download zip to
-        progressbar=True,
-        processor=pooch.Unzip(
-            extract_dir=config.extract_relative_dir
-            # path to unzipped dir, *relative*  to 'path'
-        ),
-    )
-
-    # return path to config json
-    return path_to_config_fetch_GIN
-
-
-### Tests
-def test_run_with_default_config(tmp_path):
-    # run workflow with CLI and capture log
-    # with cwd = pytest tmp_path <-------
+    # run workflow with no CLI arguments,
+    # with cwd=tmp_path
     subprocess_output = subprocess.run(
         [
             sys.executable,
@@ -218,9 +55,24 @@ def test_run_with_default_config(tmp_path):
     assert_outputs(DEFAULT_JSON_CONFIG_PATH, tmp_path)
 
 
-def test_run_with_config_GIN(
+def test_run_with_GIN_data(
     path_to_config_fetch_GIN,
 ):
+    """Test workflow runs when passing a config that fetches data
+    from the GIN repository
+
+    After the workflow is run we check that:
+    - there are no errors (via returncode),
+    - the logs reflect the input config file was used,
+    - the logs reflect the data was downloaded from GIN, and
+    - a single output directory exists with the expected
+      output file inside it
+
+    Parameters
+    ----------
+    tmp_path : Path
+        path to a pytest-generated temporary directory.
+    """
     # run workflow with CLI and capture log
     subprocess_output = subprocess.run(
         [
@@ -251,15 +103,31 @@ def test_run_with_config_GIN(
         in subprocess_output.stdout
     )
 
-    # check output directory
-    # check one output directory exists and has expected output file inside it
+    # check one output directory exists and
+    # has expected output file inside it
     assert_outputs(path_to_config_fetch_GIN)
 
 
-def test_run_with_config_local(
+def test_run_with_local_data(
     path_to_config_fetch_local,
 ):
-    # run workflow with CLI and capture log
+    """Test workflow runs when passing a config that uses
+    local data
+
+    After the workflow is run we check that:
+    - there are no errors (via returncode),
+    - the logs reflect the input config file was used,
+    - the logs reflect the data was found locally, and
+    - a single output directory exists with the expected
+      output file inside it
+
+    Parameters
+    ----------
+    tmp_path : Path
+        path to a pytest-generated temporary directory.
+    """
+
+    # run workflow with CLI
     subprocess_output = subprocess.run(
         [
             sys.executable,
@@ -289,6 +157,57 @@ def test_run_with_config_local(
         in subprocess_output.stdout
     )
 
-    # check output directory
-    # check one output directory exists and has expected output file inside it
+    # check one output directory exists and
+    # has expected output file inside it
     assert_outputs(path_to_config_fetch_local)
+
+
+def assert_outputs(path_to_config, parent_dir=""):
+    """Helper function to determine whether the output is
+    as expected.
+
+    It checks that:
+     - a single output directory exists, and
+     - the expected output file exists inside it
+
+    Note that config.output_path is only defined after the workflow
+    setup is run, because its name is timestamped. Therefore,
+    we search for an output directory based on config.output_path_basename.
+
+    Parameters
+    ----------
+    path_to_config : Path
+        path to the input config used to generate the
+        output.
+
+    parent_dir : str, optional
+        If the paths in the input config are expressed as
+        relative, the absolute path to their parent_dir
+        must be specified here. If the paths in the input
+        config are absolute, this input is not required.
+        By default "".
+    """
+
+    # load input config
+    # ATT! config.output_path is only defined after the workflow
+    # setup is run, because its name is timestamped!
+    with open(path_to_config) as cfg:
+        config_dict = json.load(cfg)
+    config = CellfinderConfig(**config_dict)
+
+    # check one output directory exists and
+    # it has expected output file inside it
+    output_path_timestamped = [
+        x
+        for x in (Path(parent_dir) / config.output_path_basename).parent.glob(
+            "*"
+        )
+        if x.is_dir()
+        and x.name.startswith(Path(config.output_path_basename).name)
+    ]
+
+    assert len(output_path_timestamped) == 1
+    assert (output_path_timestamped[0]).exists()
+    assert (
+        output_path_timestamped[0] / config.detected_cells_filename
+    ).is_file()