benmalef · benmalef · Jun 17, 2024 · Jun 17, 2024 · Jun 17, 2024 · Jun 18, 2024
diff --git a/.devcontainer/onCreateCommand.sh b/.devcontainer/onCreateCommand.sh
@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 
 python -m ensurepip # ensures pip is installed in the current environment
-pip install --upgrade pip
+python -m pip install --upgrade pip==24.0 # pinned; upgrade the pip of the interpreter ensurepip just ran under
 pip install wheel
 pip install openvino-dev==2023.0.1 # [OPTIONAL] to generate optimized models for inference
 pip install mlcube_docker          # [OPTIONAL] to deploy GaNDLF models as MLCube-compliant Docker containers

diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml
@@ -27,7 +27,7 @@ jobs:
 
       - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
+          python -m pip install --upgrade pip==24.0
           python -m pip install black==${{ env.BLACK_VERSION }}
 
       - name: Run tests

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -38,7 +38,7 @@ jobs:
           ${{ runner.os }}-pip-
     - name: Install dependencies
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         pip install scikit-build
         pip install -e .
         pip install build

diff --git a/.github/workflows/openfl-test.yml b/.github/workflows/openfl-test.yml
@@ -100,3 +100,4 @@ jobs:
           config_to_use=$(pwd)/testing/config_segmentation.yaml
           cd openfl
           python -m tests.github.test_gandlf --template gandlf_seg_test --fed_workspace aggregator --col1 one --col2 two --rounds-to-train 1 --gandlf_config $config_to_use
+
diff --git a/.github/workflows/publish-nightly.yml b/.github/workflows/publish-nightly.yml
@@ -52,7 +52,7 @@ jobs:
     - name: Install dependencies
       if: env.publish_nightly
       run: |
-        python -m pip install --upgrade pip
+        python -m pip install --upgrade pip==24.0
         pip install scikit-build
         pip install -e .
         pip install build

diff --git a/.gitignore b/.gitignore
@@ -36,3 +36,4 @@ tutorials/classification_medmnist_notebook/output_stats
 tutorials/classification_medmnist_notebook/model
 tutorials/classification_medmnist_notebook/dataset/*.csv
 testing/test_deploy
+tmp
diff --git a/GANDLF/cli/generate_metrics.py b/GANDLF/cli/generate_metrics.py
@@ -1,5 +1,5 @@
 import sys
-import yaml
+import json
 from typing import Optional
 from pprint import pprint
 import pandas as pd
@@ -31,8 +31,41 @@
 )
 
 
+def __update_header_location_case_insensitive(
+    input_df: pd.DataFrame, expected_column_name: str, required: bool = True
+) -> pd.DataFrame:
+    """
+    Find a column by case-insensitive name and rename it to the expected spelling.
+
+    Args:
+        input_df (pd.DataFrame): The input dataframe.
+        expected_column_name (str): The expected column name.
+        required (bool, optional): Whether the column is required. Defaults to True.
+
+    Returns:
+        pd.DataFrame: The renamed dataframe; unchanged if an optional column is absent.
+
+    Raises:
+        AssertionError: If `required` is True and no matching column is found.
+    """
+    wanted = expected_column_name.lower()
+    # str() keeps the comparison safe for non-string column labels
+    actual_column_name = next(
+        (col for col in input_df.columns if str(col).lower() == wanted), None
+    )
+    if actual_column_name is None:
+        assert not required, f"Column {expected_column_name} not found in the dataframe"
+        return input_df
+
+    # optional columns (e.g. "Mask") are renamed too, so later exact-name lookups work
+    return input_df.rename(columns={actual_column_name: expected_column_name})
+
+
 def generate_metrics_dict(
-    input_csv: str, config: str, outputfile: Optional[str] = None
+    input_csv: str,
+    config: str,
+    outputfile: Optional[str] = None,
+    missing_prediction: int = -1,
 ) -> dict:
     """
     This function generates metrics from the input csv and the config.
@@ -41,27 +74,83 @@ def generate_metrics_dict(
         input_csv (str): The input CSV.
         config (str): The input yaml config.
         outputfile (str, optional): The output file to save the metrics. Defaults to None.
+        missing_prediction (int, optional): The value to use for missing predictions as penalty. Default is -1.
 
     Returns:
         dict: The metrics dictionary.
     """
-    input_df = pd.read_csv(input_csv)
+    # the case where the input is a comma-separated 2 files with targets and predictions
+    if "," in input_csv:
+        target_csv, prediction_csv = input_csv.split(",")
+        target_df = pd.read_csv(target_csv)
+        prediction_df = pd.read_csv(prediction_csv)
+        ## start sanity checks
+        # if missing predictions are not to be penalized, check if the number of rows in the target and prediction files are the same
+        if missing_prediction == -1:
+            assert (
+                target_df.shape[0] == prediction_df.shape[0]
+            ), "The number of rows in the target and prediction files should be the same"
+
+        # check if the number of columns in the target and prediction files are the same
+        assert (
+            target_df.shape[1] == prediction_df.shape[1]
+        ), "The number of columns in the target and prediction files should be the same"
+        assert (
+            target_df.shape[1] == 2
+        ), "The target and prediction files should have *exactly* 2 columns"
+
+        # find the correct header for the subjectID column
+        target_df = __update_header_location_case_insensitive(target_df, "SubjectID")
+        prediction_df = __update_header_location_case_insensitive(
+            prediction_df, "SubjectID"
+        )
+        # check if prediction_df has extra subjectIDs
+        assert (
+            prediction_df["SubjectID"].isin(target_df["SubjectID"]).all()
+        ), "The `SubjectID` column in the prediction file should be a subset of the `SubjectID` column in the target file"
+
+        # individual checks for target and prediction dataframes
+        for df in [target_df, prediction_df]:
+            # check if the "subjectID" column has duplicates
+            assert (
+                df["SubjectID"].duplicated().sum() == 0
+            ), "The `SubjectID` column should not have duplicates"
+
+            # check if SubjectID is the first column
+            assert (
+                df.columns[0] == "SubjectID"
+            ), "The `SubjectID` column should be the first column in the target and prediction files"
+
+        # change the column name after subjectID to target and prediction
+        target_df = target_df.rename(columns={target_df.columns[1]: "Target"})
+        prediction_df = prediction_df.rename(
+            columns={prediction_df.columns[1]: "Prediction"}
+        )
+
+        # combine the two dataframes
+        input_df = target_df.merge(prediction_df, how="left", on="SubjectID").fillna(
+            missing_prediction
+        )
 
-    # check required headers in a case insensitive manner
-    headers = {}
-    required_columns = ["subjectid", "prediction", "target"]
-    for col, _ in input_df.items():
-        col_lower = col.lower()
+    else:
+        # the case where the input is a single file with targets and predictions
+        input_df = pd.read_csv(input_csv)
+
+        # check required headers in a case insensitive manner and rename them
+        required_columns = ["SubjectID", "Prediction", "Target"]
         for column_to_check in required_columns:
-            if column_to_check == col_lower:
-                headers[column_to_check] = col
-        if col_lower == "mask":
-            headers["mask"] = col
-    for column in required_columns:
-        assert column in headers, f"The input csv should have a column named {column}"
+            input_df = __update_header_location_case_insensitive(
+                input_df, column_to_check
+            )
+
+        # check if the "subjectID" column has duplicates
+        assert (
+            input_df["SubjectID"].duplicated().sum() == 0
+        ), "The `SubjectID` column should not have duplicates"
 
     overall_stats_dict = {}
     parameters = ConfigManager(config)
+    # ensure that the problem_type is set
     problem_type = parameters.get("problem_type", None)
     problem_type = (
         find_problem_type_from_parameters(parameters)
@@ -70,12 +159,14 @@ def generate_metrics_dict(
     )
     parameters["problem_type"] = problem_type
 
-    if problem_type == "regression" or problem_type == "classification":
-        parameters["model"]["num_classes"] = len(parameters["model"]["class_list"])
-        predictions_tensor = torch.from_numpy(
-            input_df[headers["prediction"]].to_numpy().ravel()
+    if problem_type == "classification":
+        parameters["model"]["num_classes"] = parameters["model"].get(
+            "num_classes", len(parameters["model"]["class_list"])
         )
-        labels_tensor = torch.from_numpy(input_df[headers["target"]].to_numpy().ravel())
+
+    if problem_type == "regression" or problem_type == "classification":
+        predictions_tensor = torch.from_numpy(input_df["Prediction"].to_numpy().ravel())
+        labels_tensor = torch.from_numpy(input_df["Target"].to_numpy().ravel())
         overall_stats_dict = overall_stats(
             predictions_tensor, labels_tensor, parameters
         )
@@ -84,10 +175,10 @@ def generate_metrics_dict(
         # read images and then calculate metrics
         class_list = parameters["model"]["class_list"]
         for _, row in tqdm(input_df.iterrows(), total=input_df.shape[0]):
-            current_subject_id = row[headers["subjectid"]]
+            current_subject_id = row["SubjectID"]
             overall_stats_dict[current_subject_id] = {}
-            label_image = torchio.LabelMap(row[headers["target"]])
-            pred_image = torchio.LabelMap(row[headers["prediction"]])
+            label_image = torchio.LabelMap(row["Target"])
+            pred_image = torchio.LabelMap(row["Prediction"])
             label_tensor = label_image.data
             pred_tensor = pred_image.data
             spacing = label_image.spacing
@@ -225,20 +316,17 @@ def __percentile_clip(
             )  # normalizes values to [0;1]
             return output_tensor
 
+        input_df = __update_header_location_case_insensitive(input_df, "Mask", False)
         for _, row in tqdm(input_df.iterrows(), total=input_df.shape[0]):
-            current_subject_id = row[headers["subjectid"]]
+            current_subject_id = row["SubjectID"]
             overall_stats_dict[current_subject_id] = {}
-            target_image = __fix_2d_tensor(
-                torchio.ScalarImage(row[headers["target"]]).data
-            )
-            pred_image = __fix_2d_tensor(
-                torchio.ScalarImage(row[headers["prediction"]]).data
-            )
-            # if "mask" is not in the row, we assume that the whole image is the mask
+            target_image = __fix_2d_tensor(torchio.ScalarImage(row["Target"]).data)
+            pred_image = __fix_2d_tensor(torchio.ScalarImage(row["Prediction"]).data)
+            # if "Mask" is not in the row, we assume that the whole image is the mask
             # always cast to byte tensor
             mask = (
-                __fix_2d_tensor(torchio.LabelMap(row[headers["mask"]]).data)
-                if "mask" in row
+                __fix_2d_tensor(torchio.LabelMap(row["Mask"]).data)
+                if "Mask" in row
                 else torch.from_numpy(
                     np.ones(target_image.numpy().shape, dtype=np.uint8)
                 )
@@ -339,5 +427,8 @@ def __percentile_clip(
 
     pprint(overall_stats_dict)
     if outputfile is not None:
-        with open(outputfile, "w") as outfile:
-            yaml.dump(overall_stats_dict, outfile)
+        ## todo: needs debugging since this writes the file handler in some cases, so replaced with json
+        # with open(outputfile, "w") as outfile:
+        #     yaml.dump(overall_stats_dict, outfile)
+        with open(outputfile, "w") as file:
+            file.write(json.dumps(overall_stats_dict))
diff --git a/GANDLF/cli/post_training_model_optimization.py b/GANDLF/cli/post_training_model_optimization.py
@@ -1,16 +1,21 @@
 import os
+from pathlib import Path
+from typing import Optional
 from GANDLF.compute import create_pytorch_objects
 from GANDLF.config_manager import ConfigManager
 from GANDLF.utils import version_check, load_model, optimize_and_save_model
 
 
-def post_training_model_optimization(model_path: str, config_path: str) -> bool:
+def post_training_model_optimization(
+    model_path: str, config_path: Optional[str] = None, output_dir: Optional[str] = None
+) -> bool:
     """
     CLI function to optimize a model for deployment.
 
     Args:
         model_path (str): Path to the model file.
-        config_path (str): Path to the config file.
+        config_path (str, optional): Path to the configuration file.
+        output_dir (str, optional): Output directory to save the optimized model.
 
     Returns:
         bool: True if successful, False otherwise.
@@ -26,6 +31,12 @@ def post_training_model_optimization(model_path: str, config_path: str) -> bool:
         else parameters
     )
 
+    output_dir = os.path.dirname(model_path) if output_dir is None else output_dir
+    Path(output_dir).mkdir(parents=True, exist_ok=True)
+    optimized_model_path = os.path.join(
+        output_dir, os.path.basename(model_path).replace("pth.tar", "onnx")
+    )
+
     # Create PyTorch objects and set onnx_export to True for optimization
     model, _, _, _, _, parameters = create_pytorch_objects(parameters, device="cpu")
     parameters["model"]["onnx_export"] = True
@@ -35,10 +46,9 @@ def post_training_model_optimization(model_path: str, config_path: str) -> bool:
     model.load_state_dict(main_dict["model_state_dict"])
 
     # Optimize the model and save it to an ONNX file
-    optimize_and_save_model(model, parameters, model_path, onnx_export=True)
+    optimize_and_save_model(model, parameters, optimized_model_path, onnx_export=True)
 
     # Check if the optimized model file exists
-    optimized_model_path = model_path.replace("pth.tar", "onnx")
     if not os.path.exists(optimized_model_path):
         print("Error while optimizing the model.")
         return False

diff --git a/GANDLF/compute/forward_pass.py b/GANDLF/compute/forward_pass.py
@@ -1,7 +1,7 @@
 import os
 import pathlib
 from typing import Optional, Tuple
-
+import logging
 import numpy as np
 import pandas as pd
 import SimpleITK as sitk
@@ -116,7 +116,7 @@ def validate_network(
         tqdm(valid_dataloader, desc="Looping over " + mode + " data")
     ):
         if params["verbose"]:
-            print("== Current subject:", subject["subject_id"], flush=True)
+            logging.debug(f'== Current subject: {subject["subject_id"]}')
 
         # ensure spacing is always present in params and is always subject-specific
         params["subject_spacing"] = None

diff --git a/GANDLF/config_manager.py b/GANDLF/config_manager.py
@@ -620,9 +620,10 @@ def _parseConfig(
             params["model"]["class_list"] = temp_classList.split(",")
         else:
             try:
+                # literal_eval, not eval: text from a user config must never run as code
                 params["model"]["class_list"] = ast.literal_eval(
                     params["model"]["class_list"]
                 )
             except AssertionError:
                 raise AssertionError("Could not evaluate the 'class_list' in 'model'")
 

diff --git a/GANDLF/entrypoints/anonymizer.py b/GANDLF/entrypoints/anonymizer.py
@@ -12,6 +12,7 @@
 from GANDLF.anonymize import run_anonymizer
 from GANDLF.cli import copyrightMessage
 from GANDLF.entrypoints import append_copyright_to_help
+from GANDLF.utils.gandlf_logger import gandlf_logger_setup
 
 
 def _anonymize_images(
@@ -77,6 +78,7 @@ def new_way(input_dir, config, modality, output_file):
     + "`gandlf_anonymizer` script would be deprecated soon."
 )
 def old_way():
+    gandlf_logger_setup()
     parser = argparse.ArgumentParser(
         prog="GANDLF_Anonymize",
         formatter_class=argparse.RawTextHelpFormatter,

diff --git a/GANDLF/entrypoints/cli_tool.py b/GANDLF/entrypoints/cli_tool.py
@@ -3,7 +3,7 @@
 import click
 from .subcommands import cli_subcommands
 from GANDLF.entrypoints import append_copyright_to_help
-
+from GANDLF.utils import gandlf_logger_setup
 from GANDLF import version
 
 
@@ -24,7 +24,8 @@ def gandlf(ctx, loglevel):
     """GANDLF command-line tool."""
     ctx.ensure_object(dict)
     ctx.obj["LOGLEVEL"] = loglevel
-    setup_logging(loglevel)
+    # setup_logging(loglevel)
+    gandlf_logger_setup()
 
 
 # registers subcommands: `gandlf anonymizer`, `gandlf run`, etc.

diff --git a/GANDLF/entrypoints/collect_stats.py b/GANDLF/entrypoints/collect_stats.py
@@ -14,6 +14,7 @@
 
 from GANDLF.cli import copyrightMessage
 from GANDLF.entrypoints import append_copyright_to_help
+from GANDLF.utils import gandlf_logger_setup
 
 
 def plot_all(df_training, df_validation, df_testing, output_plot_dir):
@@ -205,6 +206,7 @@ def new_way(model_dir: str, output_dir: str):
     + "`gandlf_collectStats` script would be deprecated soon."
 )
 def old_way():
+    gandlf_logger_setup()
     parser = argparse.ArgumentParser(
         prog="GANDLF_CollectStats",
         formatter_class=argparse.RawTextHelpFormatter,
Original file line number	Diff line number	Diff line change
Expand Up		@@ -100,3 +100,4 @@ jobs:
		config_to_use=$(pwd)/testing/config_segmentation.yaml
		cd openfl
		python -m tests.github.test_gandlf --template gandlf_seg_test --fed_workspace aggregator --col1 one --col2 two --rounds-to-train 1 --gandlf_config $config_to_use