Merge pull request FeTS-AI#1 from hasan7n/separate-stages-copied-branch

Separate stages copied branch

aristizabal95 authored Aug 28, 2023
2 parents 419733b + 84f8d98 commit 696ce37
Showing 8 changed files with 60 additions and 71 deletions.
4 changes: 2 additions & 2 deletions Dockerfile
@@ -76,9 +76,9 @@ ENV LANG C.UTF-8

RUN mkdir /project/stages

-RUN cp /Front-End/src/applications/*.py /project/stages/
+RUN cp /Front-End/bin/install/appdir/usr/bin/*.py /project/stages/

-RUN cp -R /Front-End/src/applications/data_prep_models /project/stages/data_prep_models
+RUN cp -R /Front-End/bin/install/appdir/usr/bin/data_prep_models /project/stages/data_prep_models

# Hotfix: install more recent version of GaNDLF for metrics generation
RUN pip install git+https://github.com/mlcommons/GaNDLF@616b37bafad8f89d5c816a88f44fa30470601311
4 changes: 2 additions & 2 deletions mlcubes/data_preparation/project/prepare.py
@@ -174,5 +174,5 @@ def setup_argparser():
)

if match_proc.should_run(report):
-    loop.set_description(stage.get_name())
-    report = stage.execute(0, report)
+    loop.set_description(match_proc.get_name())
+    report = match_proc.execute("AAAC_0|2008.03.30", report)
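Note: the new code drives the match stage directly with a hardcoded index, "AAAC_0|2008.03.30", instead of the placeholder 0. Judging by how get_id_tp is used throughout the stages, the index appears to encode a subject ID and a timepoint separated by a pipe. A minimal sketch of that assumption (the real get_id_tp lives in the stages' utilities and may differ):

```python
# Hypothetical reconstruction of get_id_tp, for illustration only
def get_id_tp(index: str) -> tuple[str, str]:
    subject_id, timepoint = index.split("|")
    return subject_id, timepoint

assert get_id_tp("AAAC_0|2008.03.30") == ("AAAC_0", "2008.03.30")
```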
4 changes: 2 additions & 2 deletions mlcubes/data_preparation/project/stages/get_csv.py
@@ -75,14 +75,14 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame:
    "data_path": tp_out_path,
    "labels_path": "",
}
-if self.csv_processor.subject_timepoint_missing_modalities:
+if f"{id}_{tp}" in self.csv_processor.subject_timepoint_missing_modalities:
    shutil.rmtree(tp_out_path, ignore_errors=True)
    comment = "There are missing modalities. Please check the data"
    report_data["status"] = -1.1
    report_data["status_name"] = "MISSING_MODALITIES"
    report_data["data_path"] = tp_path
    report_data["comment"] = comment
-elif self.csv_processor.subject_timepoint_extra_modalities:
+elif f"{id}_{tp}" in self.csv_processor.subject_timepoint_extra_modalities:
    shutil.rmtree(tp_out_path, ignore_errors=True)
    comment = "There are extra modalities. Please check the data"
    report_data["status"] = -1.2
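Note: this fix turns a truthiness check into a membership check. subject_timepoint_missing_modalities is a collection, so the old `if self.csv_processor.subject_timepoint_missing_modalities:` was true whenever any subject had a problem, flagging every timepoint once a single one was bad. The new code flags only the current subject-timepoint. An illustrative sketch (names and values hypothetical):

```python
# Collection of problematic subject-timepoints (illustrative contents)
missing = {"AAAC_1_2010.01.01": ["t2"]}

id, tp = "AAAC_0", "2008.03.30"

if missing:                  # old check: non-empty -> fires for EVERY subject
    print("flagged (wrongly)")
if f"{id}_{tp}" in missing:  # new check: fires only for the affected one
    print("flagged")
```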
4 changes: 3 additions & 1 deletion mlcubes/data_preparation/project/stages/manual.py
@@ -63,6 +63,8 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame:
out_path = self.__get_output_path(index)
under_review_path = self.__get_under_review_path(index)
bak_path = self.__get_backup_path(index)
+id, tp = get_id_tp(index)
+final_filename = f"{id}_{tp}_final_seg.nii.gz"
if not os.path.exists(bak_path):
    shutil.copytree(in_path, bak_path)
    set_files_read_only(bak_path)
@@ -73,7 +75,7 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame:
    f"You may find baseline segmentations inside {in_path}. "
    + f"Please inspect those segmentations and move the best one to {under_review_path}. "
    + "Make the necessary corrections to the generated segmentations with your desired tool, "
-   + f"and once you're done, move the finalized file to {out_path}"
+   + f"and once you're done, move the finalized file to {out_path} with the name {final_filename}."
)

report_data = {
42 changes: 19 additions & 23 deletions mlcubes/data_preparation/project/stages/match.py
@@ -1,7 +1,6 @@
from typing import Union
import os
import yaml
-import json

import pandas as pd
from pandas import DataFrame
@@ -25,12 +24,12 @@ def get_name(self):

def __get_input_path(self, index: Union[str, int]):
    id, tp = get_id_tp(index)
-    path = os.path.join(self.prev_stage_path, id, tp)
+    path = os.path.join(self.prev_stage_path, INTERIM_FOLDER, id, tp)
    return path

def __get_backup_path(self, index: Union[str, int]):
    id, tp = get_id_tp(index)
-    path = os.path.join(self.backup_path, id, tp)
+    path = os.path.join(self.backup_path, id, tp, TUMOR_MASK_FOLDER)
    return path

def __get_output_path(self, index: Union[str, int]):
@@ -71,37 +70,34 @@ def execute(self, index: Union[str, int], report: DataFrame) -> DataFrame:
# TODO: Add the percent of unchanged files, as well as voxel changes
# To the report, as separate columns

+match_output_path = self.__get_output_path(index)
+os.makedirs(match_output_path, exist_ok=True)
# Get the necessary files for match check
id, tp = get_id_tp(index)
-reviewed_filename = f"{id}_{tp}_final_seg.nii.gz"
+reviewed_filename = f"reviewed/{id}_{tp}_final_seg.nii.gz"
reviewed_file = os.path.join(self.__get_input_path(index), reviewed_filename)
-gt_filename = ""  # TODO: How do we know which segmentation to compare against?
+gt_filename = f"{id}_{tp}_tumorMask_fused-voting.nii.gz"
+# TODO: How do we know which segmentation to compare against?
# Should we compare against all segmentations?
# If there's no exact match, which segmentation should we compare metrics with?
ground_truth = os.path.join(self.__get_backup_path(index), gt_filename)

# Prepare the assets for metrics generation
-inputdata_file = os.path.join(self.__get_output_path(index), "inputdata.csv")
-config_file = os.path.join(self.__get_output_path(index), "parameters.yaml")
-data = {"subjectid": id, "prediction": reviewed_file, "target": ground_truth}
-pd.DataFrame(data).to_csv(inputdata_file)
-# TODO: Where do we get this config file?
-# From reading the code, it seems to expect an MLCube parameters.yaml
-# file which was used for training/generating inference
-# That concept breaks here, because we have multiple models running
-# without an accompanying MLCube, and we would need to know which config to use
-# for which model
-# config.yaml can be found inside project/data_prep_models/tumor_segmentation/{model_id}/config.yaml
-config = {"problem_type": "segmentation"}
-with open(config_file, "w") as f:
-    yaml.dump(config, f)
-
-out_file = os.path.join(self.__get_output_path(index), "out.json")
+inputdata_file = os.path.join(match_output_path, "inputdata.csv")
+data = {"subjectid": f"{id}_{tp}", "prediction": reviewed_file, "target": ground_truth}
+pd.DataFrame(data, index=[0]).to_csv(inputdata_file, index=False)
+
+# Read gandlf config file.
+# TODO: what are the requirements of config?
+# TODO: do NOT hardcode the filesystem names used below
+config_file = os.path.join(os.path.dirname(__file__), "data_prep_models/tumor_segmentation/model_0/config.yaml")
+
+out_file = os.path.join(match_output_path, "out.yaml")

# Run the metrics generation logic
generate_metrics.generate_metrics_dict(inputdata_file, config_file, out_file)

# Open the generated metrics
with open(out_file, "r") as f:
-    metrics = json.load(f)
+    metrics = yaml.safe_load(f)
print(metrics)
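Note: generate_metrics.generate_metrics_dict comes from the GaNDLF version pinned in the Dockerfile hotfix above. The output is now written to out.yaml and read with yaml.safe_load, which also tolerates JSON-formatted output, since JSON is a subset of YAML. A sketch of the wiring with hypothetical paths (the import path and exact CSV schema are assumptions to be checked against GaNDLF):

```python
import pandas as pd
import yaml
from GANDLF.cli import generate_metrics  # assumed import path

# One-row CSV matching the columns built in execute() above (paths illustrative)
pd.DataFrame(
    {
        "subjectid": "AAAC_0_2008.03.30",
        "prediction": "reviewed/AAAC_0_2008.03.30_final_seg.nii.gz",
        "target": "AAAC_0_2008.03.30_tumorMask_fused-voting.nii.gz",
    },
    index=[0],
).to_csv("inputdata.csv", index=False)

generate_metrics.generate_metrics_dict("inputdata.csv", "config.yaml", "out.yaml")

with open("out.yaml") as f:
    print(yaml.safe_load(f))
```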
2 changes: 1 addition & 1 deletion src/applications/CreateCSVForDICOMs.py
@@ -113,7 +113,7 @@ def process_timepoint(self, timepoint, subject, subject_dir):
        continue

    for modality_id in MODALITY_ID_DICT[modality_to_check]:
-        if modality_id not in modality_lower:
+        if modality_id != modality_lower:
            continue

        valid_dicom, first_dicom_file = verify_dicom_folder(modality_path)
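Note: the old test accepted a modality whenever modality_id was a substring of the folder name, so an ID like "t1" would also match a "t1ce" folder. Requiring exact equality removes that ambiguity. Illustrative values:

```python
modality_lower = "t1ce"        # folder name, lowercased (illustrative)
print("t1" in modality_lower)  # True  -> old check matched T1 against a T1CE folder
print("t1" != modality_lower)  # True  -> new check correctly skips it
```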
69 changes: 30 additions & 39 deletions src/applications/PrepareDataset.py
@@ -314,11 +314,11 @@ def _run_brain_extraction_using_gandlf(
    else models_to_infer.split(",")
)

-model_counter = 0
images_for_fusion = []
for model_dir in models_to_run:
+    model_id = os.path.basename(model_dir)
    model_output_dir = posixpath.join(
-        base_output_dir, "model_" + str(model_counter)
+        base_output_dir, "brain_extraction_" + str(model_id)
    )
    file_list = os.listdir(model_dir)
    for file in file_list:
@@ -342,18 +342,17 @@
    for modality in modality_outputs:
        modality_output_dir = posixpath.join(model_output_dir_testing, modality)
        files_in_modality = os.listdir(modality_output_dir)
-        for file in files_in_modality:
+        for file in files_in_modality:  # this loop may not be necessary
            if file.endswith(".nii.gz"):
                file_path = posixpath.join(modality_output_dir, file)
                shutil.copyfile(
                    file_path,
                    posixpath.join(
                        base_output_dir,
-                        f"brainMask_{model_counter}_{modality}.nii.gz",
+                        f"brainMask_{model_id}_{modality}.nii.gz",
                    ),
                )
                images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
-    model_counter += 1

return fuse_images(images_for_fusion, "staple", [0, 1])
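Note: with output names keyed by model_id instead of a running counter, masks from different models can no longer collide or depend on iteration order. The collected per-model masks are then fused with fuse_images(..., "staple", [0, 1]), i.e. a STAPLE consensus over binary labels. A minimal sketch of such a fusion with SimpleITK (fuse_images is this repo's own helper, so its details may differ):

```python
import SimpleITK as sitk

# Consensus over several binary brain masks (paths illustrative)
mask_paths = ["brainMask_model_0_t1.nii.gz", "brainMask_model_1_t1.nii.gz"]
masks = [sitk.ReadImage(p, sitk.sitkUInt8) for p in mask_paths]
fused = sitk.MultiLabelSTAPLE(masks)  # estimates a consensus labeling
sitk.WriteImage(fused, "brainMask_fused.nii.gz")
```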

@@ -379,11 +378,11 @@ def _run_tumor_segmentation_using_gandlf(
df_for_gandlf = pd.DataFrame(columns=GANDLF_DF_COLUMNS)
current_subject = {"SubjectID": subject_id}
channel_idx = 0
+# modality order (trained according to EC): t1,t2,flair,t1c
+modality_order = ["T1", "T2", "FLAIR", "T1GD"]
+# todo: confirm the order for modalities
-for key in MODALITIES_LIST:
-    current_subject = {
-        f"Channel_{channel_idx}": input_oriented_brain_images[key],
-    }
+for key in modality_order:
+    current_subject[f"Channel_{channel_idx}"] = input_oriented_brain_images[key]
    channel_idx += 1
df_for_gandlf = pd.DataFrame(current_subject, index=[0])
data_path = posixpath.join(base_output_dir, TUMOR_FILENAME)
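Note: the old loop rebuilt current_subject from a dict literal on every iteration, which silently dropped SubjectID and all previously added channels, leaving only the last modality. The new loop assigns into the existing dict and fixes the channel order to the one the model was trained with (t1, t2, flair, t1c). The resulting one-row GaNDLF inference CSV would look roughly like this (paths illustrative):

```python
import pandas as pd

row = {
    "SubjectID": "AAAC_0_2008.03.30",
    "Channel_0": "brain_t1.nii.gz",     # T1
    "Channel_1": "brain_t2.nii.gz",     # T2
    "Channel_2": "brain_flair.nii.gz",  # FLAIR
    "Channel_3": "brain_t1gd.nii.gz",   # T1GD
}
df_for_gandlf = pd.DataFrame(row, index=[0])
```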
@@ -398,12 +397,14 @@
    else models_to_infer.split(",")
)

-model_counter = 0
+tumor_masks_to_return = []
images_for_fusion = []
mask_output_dir = posixpath.join(base_output_dir, TUMOR_MASK_FOLDER)
os.makedirs(mask_output_dir, exist_ok=True)
for model_dir in models_to_run:
+    model_id = os.path.basename(model_dir)
    model_output_dir = posixpath.join(
-        base_output_dir, "model_" + str(model_counter)
+        base_output_dir, "tumor_segmentation_" + str(model_id)
    )
    file_list = os.listdir(model_dir)
    for file in file_list:
@@ -428,27 +429,24 @@
    )

    model_output_dir_testing = posixpath.join(model_output_dir, TESTING_FOLDER)
-    subject_model_output_dir = os.listdir(model_output_dir_testing)
-    for subject in subject_model_output_dir:
-        subject_output_dir = posixpath.join(model_output_dir_testing, subject)
-        files_in_modality = os.listdir(subject_output_dir)
-        for file in files_in_modality:
-            if file.endswith(".nii.gz"):
-                file_path = posixpath.join(subject_output_dir, file)
-                shutil.copyfile(
-                    file_path,
-                    posixpath.join(
-                        mask_output_dir,
-                        f"{subject_id}_tumorMask_model-{model_counter}.nii.gz",
-                    ),
-                )
-                images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
-    model_counter += 1
+    # We expect one subject (one output modality, one file).
+    subject = os.listdir(model_output_dir_testing)[0]
+    subject_output_dir = posixpath.join(model_output_dir_testing, subject)
+    files_in_modality = os.listdir(subject_output_dir)
+    for file in files_in_modality:  # this loop may not be necessary
+        if file.endswith(".nii.gz"):
+            file_path = posixpath.join(subject_output_dir, file)
+            renamed_path = posixpath.join(
+                mask_output_dir,
+                f"{subject_id}_tumorMask_model-{model_id}.nii.gz",
+            )
+            shutil.copyfile(file_path, renamed_path)
+            # Append the renamed path to keep track of model IDs
+            tumor_masks_to_return.append(renamed_path)
+            images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))

tumor_class_list = [0, 1, 2, 3, 4]

-tumor_masks_to_return = images_for_fusion
-
if len(images_for_fusion) > 1:
    for fusion_type in ["staple", "simple", "voting"]:
        fused_mask = fuse_images(images_for_fusion, fusion_type, tumor_class_list)
@@ -739,10 +737,7 @@ def extract_brain(self, row: pd.Series, pbar: tqdm):
for modality in MODALITIES_LIST:
    image = sitk.ReadImage(outputs_reoriented[modality])
    masked_image = sitk.Mask(image, brain_mask)
-    file_to_save = posixpath.join(
-        finalSubjectOutputDir_actual,
-        f"{subject_id_timepoint}_brain_{MODALITY_ID_MAPPING[modality]}.nii.gz",
-    )
+    file_to_save = input_for_tumor_models[modality]
    sitk.WriteImage(masked_image, file_to_save)

# save the screenshot
@@ -783,18 +778,14 @@ def extract_tumor(self, row: pd.Series, pbar: tqdm):
    interimOutputDir_actual,
)

-tumor_mask_idx = 0
for tumor_mask in tumor_masks_for_qc:
+    tumor_mask_id = os.path.basename(tumor_mask).replace(".nii.gz", "")
    # save the screenshot
    _save_screenshot(
        input_for_tumor_models,
-        posixpath.join(
-            interimOutputDir_actual,
-            f"{subject_id_timepoint}_summary_tumor-segmentation_model-{tumor_mask_idx}.png",
-        ),
+        posixpath.join(interimOutputDir_actual, f"{tumor_mask_id}_summary.png"),
        tumor_mask,
    )
-    tumor_mask_idx += 1

with open(self.stdout_log, "a+") as f:
    f.write(f"***\nTumor Masks For QC:\n{tumor_masks_for_qc}\n***")
2 changes: 1 addition & 1 deletion src/applications/setup.py
@@ -72,7 +72,7 @@

urls_for_download = {
    "brain_extraction": "https://upenn.box.com/shared/static/cp5xz726mtb6gwwym8ydcxmw52zfngun",
-    "tumor_segmentation": "https://upenn.box.com/shared/static/hdcb0xqj4z528v3uc9xmfu60p0xtsv62",  # should be changed
+    "tumor_segmentation": "https://storage.googleapis.com/medperf-storage/rano_test_assets/tumor_segmentation.zip",  # should be changed
}

for model in urls_for_download.keys():
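Note: the tumor-segmentation weights now come from a public GCS bucket instead of the Box link, and the line is still marked "should be changed". A minimal sketch of how each archive above might be fetched and unpacked; the repo's actual download logic is not shown in this diff, so treat this as an assumption:

```python
import os
import urllib.request
import zipfile

# urls_for_download is the dict defined in the hunk above
for model, url in urls_for_download.items():
    archive = f"{model}.zip"
    urllib.request.urlretrieve(url, archive)  # download the weights bundle
    with zipfile.ZipFile(archive) as z:
        z.extractall(model)                   # unpack into ./<model>/
    os.remove(archive)
```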
