From f5ad518622d42c94d0c4fdb0ae12ce3fddecb12f Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Tue, 15 Aug 2023 23:32:55 +0000
Subject: [PATCH 01/13] copy data_prep_models from the correct folder

---
 Dockerfile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 9c4b416b..128d2fad 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -76,9 +76,9 @@ ENV LANG C.UTF-8
 
 RUN mkdir /project/stages
 
-RUN cp /Front-End/src/applications/*.py /project/stages/
+RUN cp /Front-End/bin/install/appdir/usr/bin/*.py /project/stages/
 
-RUN cp -R /Front-End/src/applications/data_prep_models /project/stages/data_prep_models
+RUN cp -R /Front-End/bin/install/appdir/usr/bin/data_prep_models /project/stages/data_prep_models
 
 # Hotfix: install more recent version of GaNDLF for metrics generation
 RUN pip install git+https://github.com/mlcommons/GaNDLF@616b37bafad8f89d5c816a88f44fa30470601311

From ef574ab012f503a9562617fdacc02d427a3f785b Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Tue, 15 Aug 2023 23:34:25 +0000
Subject: [PATCH 02/13] fix bugs related to tumor segmentation

---
 src/applications/PrepareDataset.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index 403ff7e7..93413fb8 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -381,9 +381,7 @@ def _run_tumor_segmentation_using_gandlf(
     channel_idx = 0
     # todo: confirm the order for modalities
     for key in MODALITIES_LIST:
-        current_subject = {
-            f"Channel_{channel_idx}": input_oriented_brain_images[key],
-        }
+        current_subject[f"Channel_{channel_idx}"] = input_oriented_brain_images[key]
         channel_idx += 1
     df_for_gandlf = pd.DataFrame(current_subject, index=[0])
     data_path = posixpath.join(base_output_dir, TUMOR_FILENAME)
@@ -401,6 +399,7 @@ def _run_tumor_segmentation_using_gandlf(
     model_counter = 0
     images_for_fusion = []
     mask_output_dir = posixpath.join(base_output_dir, TUMOR_MASK_FOLDER)
+    os.makedirs(mask_output_dir, exist_ok=True)
     for model_dir in models_to_run:
         model_output_dir = posixpath.join(
             base_output_dir, "model_" + str(model_counter)
@@ -447,7 +446,7 @@ def _run_tumor_segmentation_using_gandlf(
 
     tumor_class_list = [0, 1, 2, 3, 4]
 
-    tumor_masks_to_return = images_for_fusion
+    tumor_masks_to_return = []
 
     if len(images_for_fusion) > 1:
         for fusion_type in ["staple", "simple", "voting"]:
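Note on the first hunk of PATCH 02 above: the old loop re-created `current_subject` on every iteration, so only the last modality's `Channel_*` entry survived into the GaNDLF subject table. A minimal, self-contained sketch of the corrected accumulation pattern (the modality names, paths and subject ID below are hypothetical placeholders, not the values used by PrepareDataset.py):

import pandas as pd

# Hypothetical stand-ins for MODALITIES_LIST / input_oriented_brain_images
modalities = ["T1", "T1GD", "T2", "FLAIR"]
images = {m: f"/data/subject_tp/brain_{m.lower()}.nii.gz" for m in modalities}

current_subject = {"SubjectID": "AAAC_0_2008.03.30"}
for channel_idx, key in enumerate(modalities):
    # Add one key per modality instead of rebuilding the dict each time
    current_subject[f"Channel_{channel_idx}"] = images[key]

# One-row DataFrame: scalar values need an explicit index
df_for_gandlf = pd.DataFrame(current_subject, index=[0])
print(list(df_for_gandlf.columns))
# ['SubjectID', 'Channel_0', 'Channel_1', 'Channel_2', 'Channel_3']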
From 2aefcdcf4d47aa94b88f75cdbe60136a7674f352 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 16 Aug 2023 21:34:41 +0000
Subject: [PATCH 03/13] modify tumor segmentation model download url

---
 src/applications/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/applications/setup.py b/src/applications/setup.py
index e69169b7..8787e8a6 100644
--- a/src/applications/setup.py
+++ b/src/applications/setup.py
@@ -72,7 +72,7 @@
 
 urls_for_download = {
     "brain_extraction": "https://upenn.box.com/shared/static/cp5xz726mtb6gwwym8ydcxmw52zfngun",
-    "tumor_segmentation": "https://upenn.box.com/shared/static/hdcb0xqj4z528v3uc9xmfu60p0xtsv62", # should be changed
+    "tumor_segmentation": "https://storage.googleapis.com/medperf-storage/rano_test_assets/tumor_segmentation.zip", # should be changed
 }
 
 for model in urls_for_download.keys():

From 3f04385cb1647b94c03f03cb11648d42dd124 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 16 Aug 2023 21:36:40 +0000
Subject: [PATCH 04/13] fix the bugfix in prepareDataset

---
 src/applications/PrepareDataset.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index 93413fb8..d2f973ee 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -428,6 +428,7 @@ def _run_tumor_segmentation_using_gandlf(
 
         model_output_dir_testing = posixpath.join(model_output_dir, TESTING_FOLDER)
         subject_model_output_dir = os.listdir(model_output_dir_testing)
+        tumor_masks_to_return = []
         for subject in subject_model_output_dir:
             subject_output_dir = posixpath.join(model_output_dir_testing, subject)
             files_in_modality = os.listdir(subject_output_dir)
@@ -441,13 +442,12 @@ def _run_tumor_segmentation_using_gandlf(
                             f"{subject_id}_tumorMask_model-{model_counter}.nii.gz",
                         ),
                     )
+                    tumor_masks_to_return.append(file_path)
                     images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
         model_counter += 1
 
     tumor_class_list = [0, 1, 2, 3, 4]
 
-    tumor_masks_to_return = []
-
     if len(images_for_fusion) > 1:
         for fusion_type in ["staple", "simple", "voting"]:
             fused_mask = fuse_images(images_for_fusion, fusion_type, tumor_class_list)
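Note on PATCH 02 and PATCH 04 above: the per-model masks collected in `images_for_fusion` are combined by the project's own `fuse_images` helper with "staple", "simple" and "voting" strategies. The sketch below is not that helper; it is only a hedged illustration of the simplest of the three ideas, per-voxel majority voting over integer label masks, using SimpleITK and NumPy:

import numpy as np
import SimpleITK as sitk

def majority_vote(masks, classes):
    """Fuse integer label masks by per-voxel majority vote (illustrative only)."""
    arrays = [sitk.GetArrayFromImage(m) for m in masks]
    stacked = np.stack(arrays)                       # (n_models, z, y, x)
    votes = np.stack([(stacked == c).sum(axis=0) for c in classes])
    fused = np.asarray(classes, dtype=np.uint8)[np.argmax(votes, axis=0)]
    out = sitk.GetImageFromArray(fused)
    out.CopyInformation(masks[0])                    # keep origin/spacing/direction
    return out

# e.g. fused = majority_vote(images_for_fusion, [0, 1, 2, 3, 4])

The real helper also covers the STAPLE and averaging variants; this sketch is only meant to make the `tumor_class_list` argument concrete.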
From d122ae6b889914feea9bd8a749f04e86f0eab1f4 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 16 Aug 2023 21:38:27 +0000
Subject: [PATCH 05/13] start fixing matching step (WIP)

---
 mlcubes/data_preparation/project/prepare.py    |  4 +-
 .../data_preparation/project/stages/manual.py  |  4 +-
 .../data_preparation/project/stages/match.py   | 42 +++++++++----------
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/mlcubes/data_preparation/project/prepare.py b/mlcubes/data_preparation/project/prepare.py
index 29bccaca..ca034700 100644
--- a/mlcubes/data_preparation/project/prepare.py
+++ b/mlcubes/data_preparation/project/prepare.py
@@ -174,5 +174,5 @@ def setup_argparser():
     )
 
     if match_proc.should_run(report):
-        loop.set_description(stage.get_name())
-        report = stage.execute(0, report)
+        loop.set_description(match_proc.get_name())
+        report = match_proc.execute("AAAC_0|2008.03.30", report)

diff --git a/mlcubes/data_preparation/project/stages/manual.py b/mlcubes/data_preparation/project/stages/manual.py
index 25d85b4c..2643e4f6 100644
--- a/mlcubes/data_preparation/project/stages/manual.py
+++ b/mlcubes/data_preparation/project/stages/manual.py
@@ -63,6 +63,8 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame:
         out_path = self.__get_output_path(index)
         under_review_path = self.__get_under_review_path(index)
         bak_path = self.__get_backup_path(index)
+        id, tp = get_id_tp(index)
+        final_filename = f"{id}_{tp}_final_seg.nii.gz"
         if not os.path.exists(bak_path):
             shutil.copytree(in_path, bak_path)
             set_files_read_only(bak_path)
@@ -73,7 +75,7 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame:
             f"You may find baseline segmentations inside {in_path}. "
             + f"Please inspect those segmentations and move the best one to {under_review_path}. "
             + "Make the necessary corrections to the generated segmentations with your desired tool, "
-            + f"and once you're done, move the finalized file to {out_path}"
+            + f"and once you're done, move the finalized file to {out_path} with the name {final_filename}."
         )
 
         report_data = {

diff --git a/mlcubes/data_preparation/project/stages/match.py b/mlcubes/data_preparation/project/stages/match.py
index 85c98d07..11a37139 100644
--- a/mlcubes/data_preparation/project/stages/match.py
+++ b/mlcubes/data_preparation/project/stages/match.py
@@ -1,7 +1,6 @@
 from typing import Union
 import os
 import yaml
-import json
 
 import pandas as pd
 from pandas import DataFrame
@@ -25,12 +24,12 @@ def get_name(self):
 
     def __get_input_path(self, index: Union[str, int]):
         id, tp = get_id_tp(index)
-        path = os.path.join(self.prev_stage_path, id, tp)
+        path = os.path.join(self.prev_stage_path, INTERIM_FOLDER, id, tp)
         return path
 
     def __get_backup_path(self, index: Union[str, int]):
         id, tp = get_id_tp(index)
-        path = os.path.join(self.backup_path, id, tp)
+        path = os.path.join(self.backup_path, id, tp, TUMOR_MASK_FOLDER)
         return path
 
     def __get_output_path(self, index: Union[str, int]):
@@ -71,37 +70,34 @@ def execute(self, index: Union[str, int], report: DataFrame) -> DataFrame:
 
         # TODO: Add the percent of unchanged files, as well as voxel changes
         # To the report, as separate columns
+        match_output_path = self.__get_output_path(index)
+        os.makedirs(match_output_path, exist_ok=True)
         # Get the necessary files for match check
         id, tp = get_id_tp(index)
-        reviewed_filename = f"{id}_{tp}_final_seg.nii.gz"
+        reviewed_filename = f"reviewed/{id}_{tp}_final_seg.nii.gz"
        reviewed_file = os.path.join(self.__get_input_path(index), reviewed_filename)
-        gt_filename = "" # TODO: How do we know which segmentation to compare against?
+        gt_filename = f"{id}_{tp}_tumorMask_fused-voting.nii.gz"
+        # TODO: How do we know which segmentation to compare against?
         # Should we compare against all segmentations?
         # If there's no exact match, which segmentation should we compare metrics with?
         ground_truth = os.path.join(self.__get_backup_path(index), gt_filename)
 
         # Prepare the assets for metrics generation
-        inputdata_file = os.path.join(self.__get_output_path(index), "inputdata.csv")
-        config_file = os.path.join(self.__get_output_path(index), "parameters.yaml")
-        data = {"subjectid": id, "prediction": reviewed_file, "target": ground_truth}
-        pd.DataFrame(data).to_csv(inputdata_file)
-        # TODO: Where do we get this config file?
-        # From reading the code, it seems to expect an MLCube parameters.yaml
-        # file which was used for training/generating inference
-        # That concept breaks here, because we have multiple models running
-        # without an accompanying MLCube, and we would need to know which config to use
-        # for which model
-
-        # config.yaml can be found inside project/data_prep_models/tumor_segmentation/{model_id}/config.yaml
-        config = {"problem_type": "segmentation"}
-        with open(config_file, "w") as f:
-            yaml.dump(config, f)
-
-        out_file = os.path.join(self.__get_output_path(index), "out.json")
+        inputdata_file = os.path.join(match_output_path, "inputdata.csv")
+        data = {"subjectid": f"{id}_{tp}", "prediction": reviewed_file, "target": ground_truth}
+        pd.DataFrame(data, index=[0]).to_csv(inputdata_file, index=False)
+
+        # Read gandlf config file.
+        # TODO: what are the requirements of config?
+        # TODO: do NOT hardcode the filesystem names used below
+        config_file = os.path.join(os.path.dirname(__file__), "data_prep_models/tumor_segmentation/model_0/config.yaml")
+
+        out_file = os.path.join(match_output_path, "out.yaml")
 
         # Run the metrics generation logic
         generate_metrics.generate_metrics_dict(inputdata_file, config_file, out_file)
 
         # Open the generated metrics
         with open(out_file, "r") as f:
-            metrics = json.load(f)
+            metrics = yaml.safe_load(f)
+        print(metrics)
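Note on the match.py changes in PATCH 05 above: the stage hands GaNDLF's metrics generator a one-row CSV with `subjectid`, `prediction` and `target` columns. A minimal sketch of that input file, with hypothetical paths (the real ones come from __get_input_path/__get_backup_path):

import pandas as pd

# Hypothetical paths; the column names mirror the inputdata.csv built in match.py
row = {
    "subjectid": "AAAC_0_2008.03.30",
    "prediction": "/path/to/reviewed/AAAC_0_2008.03.30_final_seg.nii.gz",
    "target": "/path/to/backup/AAAC_0_2008.03.30_tumorMask_fused-voting.nii.gz",
}
pd.DataFrame(row, index=[0]).to_csv("inputdata.csv", index=False)

# The stage then calls, as in the diff above:
# generate_metrics.generate_metrics_dict("inputdata.csv", config_file, "out.yaml")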
From e56bb5a5e57fff711ce6ae1c32908bde5e9ff119 Mon Sep 17 00:00:00 2001
From: sarthakpati
Date: Fri, 18 Aug 2023 19:51:59 -0400
Subject: [PATCH 06/13] updated link for tumor segmentation model

---
 src/applications/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/applications/setup.py b/src/applications/setup.py
index e69169b7..ccc1e9b0 100644
--- a/src/applications/setup.py
+++ b/src/applications/setup.py
@@ -72,7 +72,7 @@
 
 urls_for_download = {
     "brain_extraction": "https://upenn.box.com/shared/static/cp5xz726mtb6gwwym8ydcxmw52zfngun",
-    "tumor_segmentation": "https://upenn.box.com/shared/static/hdcb0xqj4z528v3uc9xmfu60p0xtsv62", # should be changed
+    "tumor_segmentation": "https://upenn.box.com/shared/static/woiqk6x9ygazst5ofrnfnezuy0aw0tn6", # should be changed
 }
 
 for model in urls_for_download.keys():

From ee06c5193494268cba56e5a8225680aeff687871 Mon Sep 17 00:00:00 2001
From: sarthakpati
Date: Fri, 18 Aug 2023 19:56:19 -0400
Subject: [PATCH 07/13] updated modality order and models

---
 src/applications/PrepareDataset.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index 60f8fe20..578b5d44 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -393,8 +393,10 @@ def _run_tumor_segmentation_using_gandlf(
     df_for_gandlf = pd.DataFrame(columns=["SubjectID", "Channel_0"])
     current_subject = {"SubjectID": subject_id}
     channel_idx = 0
+    # modality order (trained according to EC): t1,t2,flair,t1c
+    modality_order = ["T1", "T2", "FLAIR", "T1GD"]
     # todo: confirm the order for modalities
-    for key in modalities_list:
+    for key in modality_order:
         current_subject = {
             f"Channel_{channel_idx}": input_oriented_brain_images[key],
         }
         channel_idx += 1

From 99aefe40ff5895a3f9018ec6c29c47403248e720 Mon Sep 17 00:00:00 2001
From: "Edwards, Brandon"
Date: Tue, 22 Aug 2023 14:49:26 -0700
Subject: [PATCH 08/13] replacing string containment with string non-equality

---
 src/applications/CreateCSVForDICOMs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/applications/CreateCSVForDICOMs.py b/src/applications/CreateCSVForDICOMs.py
index e46b1e55..2484e546 100644
--- a/src/applications/CreateCSVForDICOMs.py
+++ b/src/applications/CreateCSVForDICOMs.py
@@ -113,7 +113,7 @@ def process_timepoint(self, timepoint, subject, subject_dir):
                 continue
 
             for modality_id in MODALITY_ID_DICT[modality_to_check]:
-                if modality_id not in modality_lower:
+                if modality_id != modality_lower:
                     continue
 
                 valid_dicom, first_dicom_file = verify_dicom_folder(modality_path)
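Note on PATCH 08 above: substring containment matches too much, because short modality IDs are substrings of longer ones; exact comparison only accepts the intended folder name. The IDs below are hypothetical examples, not the actual contents of MODALITY_ID_DICT:

# Hypothetical modality IDs
modality_ids = ["t1", "t1c", "t2", "flair"]
folder_name = "t1ce"

print(any(m in folder_name for m in modality_ids))   # True  -- "t1" is a substring of "t1ce"
print(any(m == folder_name for m in modality_ids))   # False -- no exact ID matches "t1ce"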
From 0939398edd3c64ecb6fdb633fec6dc49328b21d0 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 23 Aug 2023 13:00:38 +0000
Subject: [PATCH 09/13] use tumor file names from dict

This is not a bugfix; it is just to remove confusion.
---
 src/applications/PrepareDataset.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index 8c635bb6..a4b15be0 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -740,10 +740,7 @@ def extract_brain(self, row: pd.Series, pbar: tqdm):
         for modality in MODALITIES_LIST:
             image = sitk.ReadImage(outputs_reoriented[modality])
             masked_image = sitk.Mask(image, brain_mask)
-            file_to_save = posixpath.join(
-                finalSubjectOutputDir_actual,
-                f"{subject_id_timepoint}_brain_{MODALITY_ID_MAPPING[modality]}.nii.gz",
-            )
+            file_to_save = input_for_tumor_models[modality]
             sitk.WriteImage(masked_image, file_to_save)
 
         # save the screenshot
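Note on PATCH 09 above: the brain-extraction step now writes each masked modality to the exact path recorded in `input_for_tumor_models`, so the later tumor-segmentation step reads the same files without re-deriving names. A hedged sketch of that pattern (the paths and file names are made up; `sitk` is SimpleITK):

import SimpleITK as sitk

# Hypothetical mapping playing the role of input_for_tumor_models
input_for_tumor_models = {
    "T1": "/out/AAAC_0_2008.03.30_brain_t1.nii.gz",
    "T2": "/out/AAAC_0_2008.03.30_brain_t2.nii.gz",
}

brain_mask = sitk.ReadImage("/out/brainMask_fused.nii.gz", sitk.sitkUInt8)
for modality, out_path in input_for_tumor_models.items():
    image = sitk.ReadImage(f"/out/reoriented_{modality.lower()}.nii.gz")
    masked = sitk.Mask(image, brain_mask)  # zero out voxels outside the brain mask
    sitk.WriteImage(masked, out_path)      # same path the tumor models will consume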
From 436a4c224f754e1af95e724df155c872946d7cd9 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 23 Aug 2023 13:38:36 +0000
Subject: [PATCH 10/13] fix name conventions for extraction models outputs

---
 src/applications/PrepareDataset.py | 55 +++++++++++++-----------------
 1 file changed, 24 insertions(+), 31 deletions(-)

diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index a4b15be0..508f6677 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -314,11 +314,11 @@ def _run_brain_extraction_using_gandlf(
         else models_to_infer.split(",")
     )
 
-    model_counter = 0
     images_for_fusion = []
     for model_dir in models_to_run:
+        model_id = os.path.basename(model_dir)
         model_output_dir = posixpath.join(
-            base_output_dir, "model_" + str(model_counter)
+            base_output_dir, "brain_extraction_" + str(model_id)
         )
         file_list = os.listdir(model_dir)
         for file in file_list:
@@ -342,18 +342,17 @@ def _run_brain_extraction_using_gandlf(
         for modality in modality_outputs:
             modality_output_dir = posixpath.join(model_output_dir_testing, modality)
             files_in_modality = os.listdir(modality_output_dir)
-            for file in files_in_modality:
+            for file in files_in_modality:  # this loop may not be necessary
                 if file.endswith(".nii.gz"):
                     file_path = posixpath.join(modality_output_dir, file)
                     shutil.copyfile(
                         file_path,
                         posixpath.join(
                             base_output_dir,
-                            f"brainMask_{model_counter}_{modality}.nii.gz",
+                            f"brainMask_{model_id}_{modality}.nii.gz",
                         ),
                     )
                     images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
-        model_counter += 1
 
     return fuse_images(images_for_fusion, "staple", [0, 1])
 
@@ -398,13 +397,14 @@ def _run_tumor_segmentation_using_gandlf(
         else models_to_infer.split(",")
     )
 
-    model_counter = 0
+    tumor_masks_to_return = []
     images_for_fusion = []
     mask_output_dir = posixpath.join(base_output_dir, TUMOR_MASK_FOLDER)
     os.makedirs(mask_output_dir, exist_ok=True)
     for model_dir in models_to_run:
+        model_id = os.path.basename(model_dir)
         model_output_dir = posixpath.join(
-            base_output_dir, "model_" + str(model_counter)
+            base_output_dir, "tumor_segmentation_" + str(model_id)
         )
         file_list = os.listdir(model_dir)
         for file in file_list:
@@ -429,24 +429,21 @@ def _run_tumor_segmentation_using_gandlf(
         )
 
         model_output_dir_testing = posixpath.join(model_output_dir, TESTING_FOLDER)
-        subject_model_output_dir = os.listdir(model_output_dir_testing)
-        tumor_masks_to_return = []
-        for subject in subject_model_output_dir:
-            subject_output_dir = posixpath.join(model_output_dir_testing, subject)
-            files_in_modality = os.listdir(subject_output_dir)
-            for file in files_in_modality:
-                if file.endswith(".nii.gz"):
-                    file_path = posixpath.join(subject_output_dir, file)
-                    shutil.copyfile(
-                        file_path,
-                        posixpath.join(
-                            mask_output_dir,
-                            f"{subject_id}_tumorMask_model-{model_counter}.nii.gz",
-                        ),
-                    )
-                    tumor_masks_to_return.append(file_path)
-                    images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
-        model_counter += 1
+        # We expect one subject (one output modality, one file).
+        subject = os.listdir(model_output_dir_testing)[0]
+        subject_output_dir = posixpath.join(model_output_dir_testing, subject)
+        files_in_modality = os.listdir(subject_output_dir)
+        for file in files_in_modality:  # this loop may not be necessary
+            if file.endswith(".nii.gz"):
+                file_path = posixpath.join(subject_output_dir, file)
+                renamed_path = posixpath.join(
+                    mask_output_dir,
+                    f"{subject_id}_tumorMask_model-{model_id}.nii.gz",
+                )
+                shutil.copyfile(file_path, renamed_path)
+                # Append the renamed path to keep track of model IDs
+                tumor_masks_to_return.append(renamed_path)
+                images_for_fusion.append(sitk.ReadImage(file_path, sitk.sitkUInt8))
 
     tumor_class_list = [0, 1, 2, 3, 4]
 
@@ -781,18 +778,14 @@ def extract_tumor(self, row: pd.Series, pbar: tqdm):
             interimOutputDir_actual,
         )
 
-        tumor_mask_idx = 0
         for tumor_mask in tumor_masks_for_qc:
+            tumor_mask_id = os.path.basename(tumor_mask)
             # save the screenshot
            _save_screenshot(
                 input_for_tumor_models,
-                posixpath.join(
-                    interimOutputDir_actual,
-                    f"{subject_id_timepoint}_summary_tumor-segmentation_model-{tumor_mask_idx}.png",
-                ),
+                posixpath.join(interimOutputDir_actual, f"{tumor_mask_id}_summary.png"),
                 tumor_mask,
             )
-            tumor_mask_idx += 1
 
         with open(self.stdout_log, "a+") as f:
             f.write(f"***\nTumor Masks For QC:\n{tumor_masks_for_qc}\n***")
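Note on PATCH 10 above: output masks are now named after a stable `model_id` taken from the model directory, instead of a positional counter whose meaning changes when only a subset of models is run. A small sketch of the resulting naming convention (the directory layout and subject ID below are assumptions for illustration):

import os

# Hypothetical model directories, e.g. as unpacked under data_prep_models/
models_to_run = [
    "/project/stages/data_prep_models/tumor_segmentation/model_0",
    "/project/stages/data_prep_models/tumor_segmentation/model_1",
]

subject_id = "AAAC_0_2008.03.30"
for model_dir in models_to_run:
    model_id = os.path.basename(model_dir)  # stable, independent of iteration order
    print(f"{subject_id}_tumorMask_model-{model_id}.nii.gz")
    # -> AAAC_0_2008.03.30_tumorMask_model-model_0.nii.gz, ..._model-model_1.nii.gz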
From 6afa8ce54ac927d11c4a2f76fcfe5ba4e3e19f53 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 23 Aug 2023 13:40:07 +0000
Subject: [PATCH 11/13] revert back to non-openvino model

The model was fine. There was some confusing folder copying that made me
think the model was outputting five files.
---
 src/applications/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/applications/setup.py b/src/applications/setup.py
index ccc1e9b0..8787e8a6 100644
--- a/src/applications/setup.py
+++ b/src/applications/setup.py
@@ -72,7 +72,7 @@
 
 urls_for_download = {
     "brain_extraction": "https://upenn.box.com/shared/static/cp5xz726mtb6gwwym8ydcxmw52zfngun",
-    "tumor_segmentation": "https://upenn.box.com/shared/static/woiqk6x9ygazst5ofrnfnezuy0aw0tn6", # should be changed
+    "tumor_segmentation": "https://storage.googleapis.com/medperf-storage/rano_test_assets/tumor_segmentation.zip", # should be changed
 }
 
 for model in urls_for_download.keys():

From 07ea0f2bd2411c8425628b63513b8955cc04cf61 Mon Sep 17 00:00:00 2001
From: hasan7n
Date: Wed, 23 Aug 2023 14:58:43 +0000
Subject: [PATCH 12/13] remove file extension from tumor mask ID

---
 src/applications/PrepareDataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/applications/PrepareDataset.py b/src/applications/PrepareDataset.py
index 508f6677..791488f8 100644
--- a/src/applications/PrepareDataset.py
+++ b/src/applications/PrepareDataset.py
@@ -779,7 +779,7 @@ def extract_tumor(self, row: pd.Series, pbar: tqdm):
         )
 
         for tumor_mask in tumor_masks_for_qc:
-            tumor_mask_id = os.path.basename(tumor_mask)
+            tumor_mask_id = os.path.basename(tumor_mask).replace(".nii.gz", "")
             # save the screenshot
             _save_screenshot(
                 input_for_tumor_models,

From 2fca4ca391073ae4ca3b6a89b3daec0ccc34576b Mon Sep 17 00:00:00 2001
From: "Edwards, Brandon"
Date: Wed, 23 Aug 2023 17:07:18 -0700
Subject: [PATCH 13/13] missing and extra modalities detected by string for
 subject and timepoint

The CSVCreator's missing- and extra-modalities lists are long-lived
attributes that persist across iterations over subjects and timepoints, so
membership is now checked with the current subject/timepoint string.
---
 mlcubes/data_preparation/project/stages/get_csv.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/mlcubes/data_preparation/project/stages/get_csv.py b/mlcubes/data_preparation/project/stages/get_csv.py
index 5caa9717..587df0c6 100644
--- a/mlcubes/data_preparation/project/stages/get_csv.py
+++ b/mlcubes/data_preparation/project/stages/get_csv.py
@@ -75,14 +75,14 @@ def execute(self, index: Union[str, int], report: pd.DataFrame) -> pd.DataFrame:
             "data_path": tp_out_path,
             "labels_path": "",
         }
-        if self.csv_processor.subject_timepoint_missing_modalities:
+        if f"{id}_{tp}" in self.csv_processor.subject_timepoint_missing_modalities:
             shutil.rmtree(tp_out_path, ignore_errors=True)
             comment = "There are missing modalities. Please check the data"
             report_data["status"] = -1.1
             report_data["status_name"] = "MISSING_MODALITIES"
             report_data["data_path"] = tp_path
             report_data["comment"] = comment
-        elif self.csv_processor.subject_timepoint_extra_modalities:
+        elif f"{id}_{tp}" in self.csv_processor.subject_timepoint_extra_modalities:
             shutil.rmtree(tp_out_path, ignore_errors=True)
             comment = "There are extra modalities. Please check the data"
             report_data["status"] = -1.2
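Note on PATCH 13 above: the missing/extra-modalities collections live on the CSV processor for the whole run, so checking their truthiness would flag every timepoint processed after the first problematic one. Checking membership of the current f"{id}_{tp}" key restricts the flag to the affected timepoint. A minimal sketch of the failure mode (class and method names are illustrative, not the real CSVCreator API):

class Processor:
    def __init__(self):
        # Long-lived across all subjects/timepoints of a run
        self.subject_timepoint_missing_modalities = []

    def record_missing(self, subject, timepoint):
        self.subject_timepoint_missing_modalities.append(f"{subject}_{timepoint}")

proc = Processor()
proc.record_missing("AAAC_0", "2008.03.30")   # one bad timepoint early in the run

# Truthiness flags *every* later timepoint once anything was recorded:
print(bool(proc.subject_timepoint_missing_modalities))                   # True
# Per-timepoint membership only flags the one that actually had a problem:
print("AAAC_1_2009.01.01" in proc.subject_timepoint_missing_modalities)  # False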