Merge branch 'main' into empty_jwks_fix

mlcommons · Sep 9, 2024 · 9697868 · 9697868
2 parents 88434cb + f6927b8
commit 9697868
Show file tree

Hide file tree

Showing 30 changed files with 750 additions and 1 deletion.
diff --git a/examples/BraTS/data_prep/mlcube/mlcube.yaml b/examples/BraTS/data_prep/mlcube/mlcube.yaml
@@ -8,7 +8,7 @@ platform:
 
 docker:
   # Image name.
-  image: mlcommons/fets_data-prep
+  image: mlcommons/fets_data-prep-v2
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../project"
   # Docker file name within docker build context, default is `Dockerfile`.

diff --git a/examples/BraTS/data_prep/project/sanity_check.py b/examples/BraTS/data_prep/project/sanity_check.py
@@ -17,6 +17,7 @@ def check_subject_validity(subject_dir):
     subject_valid = True
     strings_to_check = [
         "_t1.nii.gz",
+        "_t1c.nii.gz",
         "_t1ce.nii.gz",
         "_t2.nii.gz",
         "_flair.nii.gz",

diff --git a/examples/BraTS2024/data_prep/README.md b/examples/BraTS2024/data_prep/README.md
@@ -0,0 +1,72 @@
+# BraTS 2024 Data Preparation
+
+Data preparation MLCube for the raw datasets of:
+
+* [Meningioma Radiotherapy](https://www.synapse.org/Synapse:syn53708249/wiki/627503)
+* [Pathology](https://www.synapse.org/Synapse:syn53708249/wiki/628091)
+
+## Example raw datasets
+
+### Radiotherapy
+
+```
+BraTS-MEN-RT/
+├── BraTS-MEN-RT-xxxx-x
+│   ├── BraTS-MEN-RT-xxxx-x_gtv.nii.gz
+│   └── BraTS-MEN-RT-xxxx-x_t1c.nii.gz
+├── BraTS-MEN-RT-yyyy-y
+│   ├── BraTS-MEN-RT-yyyy-y_gtv.nii.gz
+│   └── BraTS-MEN-RT-yyyy-y_t1c.nii.gz
+└── BraTS-MEN-RT-zzzz-z
+    ├── BraTS-MEN-RT-zzzz-z_gtv.nii.gz
+    └── BraTS-MEN-RT-zzzz-z_t1c.nii.gz
+```
+
+where:
+* `*_t1c.nii.gz` files are the data given to model MLCubes for inference
+* `*_gtv.nii.gz` files are the labels (ground truth)
+
+### Pathology
+
+```
+BraTS-Path/
+├── BraTSPath_cohort_xxxxxxx.png
+├── BraTSPath_cohort_yyyyyyy.png
+├── BraTSPath_cohort_zzzzzzz.png
+└── labels.csv
+```
+
+where:
+* `*.png` files are the data given to model MLCubes for inference
+* `labels.csv` contains the classification labels
+
+## Example prepared datasets
+
+### Radiotherapy
+
+```
+data
+├── BraTS-MEN-RT-xxxx-x
+│   └── BraTS-MEN-RT-xxxx-x_t1c.nii.gz
+├── BraTS-MEN-RT-yyyy-y
+│   └── BraTS-MEN-RT-yyyy-y_t1c.nii.gz
+└── BraTS-MEN-RT-zzzz-z
+    └── BraTS-MEN-RT-zzzz-z_t1c.nii.gz
+
+labels
+├── BraTS-MEN-RT-xxxx-x_gtv.nii.gz
+├── BraTS-MEN-RT-yyyy-y_gtv.nii.gz
+└── BraTS-MEN-RT-zzzz-z_gtv.nii.gz
+```
+
+### Pathology
+
+```
+data
+├── BraTSPath_cohort_xxxxxxx.png
+├── BraTSPath_cohort_yyyyyyy.png
+└── BraTSPath_cohort_zzzzzzz.png
+
+labels
+└── labels.csv
+```
diff --git a/examples/BraTS2024/data_prep/mlcube/mlcube.yaml b/examples/BraTS2024/data_prep/mlcube/mlcube.yaml
@@ -0,0 +1,46 @@
+name: BraTS2024 Data Preparator Cube
+description: BraTS2024 Data Preparator Cube for Tasks 3 and 10
+authors:
+  - { name: "MLCommons Medical Working Group" }
+  - { name: "Verena Chung" }
+
+platform:
+  accelerator_count: 0
+
+docker:
+  # Image name.
+  image: ghcr.io/vpchung/brats2024-prep:0.0.1
+  # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
+  build_context: "../project"
+  # Docker file name within docker build context, default is `Dockerfile`.
+  build_file: "Dockerfile"
+
+tasks:
+  prepare:
+    parameters:
+      inputs:
+        {
+          data_path: input_data/,
+          labels_path: input_labels/,
+          parameters_file: parameters.yaml,
+        }
+      outputs: { output_path: data/, output_labels_path: labels/ }
+  sanity_check:
+    parameters:
+      inputs:
+        {
+          data_path: data/,
+          labels_path: labels/,
+
+          parameters_file: parameters.yaml,
+        }
+  statistics:
+    parameters:
+      inputs:
+        {
+          data_path: data/,
+          labels_path: labels/,
+
+          parameters_file: parameters.yaml,
+        }
+      outputs: { output_path: { type: file, default: statistics.yaml } }
diff --git a/examples/BraTS2024/data_prep/mlcube/workspace/parameters-path.yaml b/examples/BraTS2024/data_prep/mlcube/workspace/parameters-path.yaml
@@ -0,0 +1 @@
+task: pathology
diff --git a/examples/BraTS2024/data_prep/mlcube/workspace/parameters-rt.yaml b/examples/BraTS2024/data_prep/mlcube/workspace/parameters-rt.yaml
@@ -0,0 +1,3 @@
+task: segmentation-radiotherapy
+segmentation_modalities: ["t1c"]
+label_modality: gtv
diff --git a/examples/BraTS2024/data_prep/project/Dockerfile b/examples/BraTS2024/data_prep/project/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.9.16-slim 
+
+COPY ./requirements.txt /mlcube_project/requirements.txt 
+
+RUN pip3 install --no-cache-dir -r /mlcube_project/requirements.txt
+
+ENV LANG C.UTF-8
+
+# Create a non-root user.
+RUN useradd nonrootuser
+USER nonrootuser
+
+COPY . /mlcube_project
+
+ENTRYPOINT ["python3", "/mlcube_project/mlcube.py"]
diff --git a/examples/BraTS2024/data_prep/project/mlcube.py b/examples/BraTS2024/data_prep/project/mlcube.py
@@ -0,0 +1,51 @@
+"""MLCube handler file"""
+import typer
+import yaml
+from prepare import prepare_dataset
+from sanity_check import perform_sanity_checks
+from stats import generate_statistics
+
+app = typer.Typer()
+
+
+@app.command("prepare")
+def prepare(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    output_path: str = typer.Option(..., "--output_path"),
+    output_labels_path: str = typer.Option(..., "--output_labels_path"),
+):
+    # `prepare` command: all five options are required (`...` default).
+    # The parameters file is parsed as YAML and passed, together with the
+    # input and output locations, to `prepare_dataset`.
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    prepare_dataset(data_path, labels_path, parameters, output_path, output_labels_path)
+
+
+@app.command("sanity_check")
+def sanity_check(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+):
+    # `sanity_check` command: all three options are required (`...` default).
+    # The parameters file is parsed as YAML and passed, together with the
+    # data and labels locations, to `perform_sanity_checks`.
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    perform_sanity_checks(data_path, labels_path, parameters)
+
+
+@app.command("statistics")
+def statistics(
+    data_path: str = typer.Option(..., "--data_path"),
+    labels_path: str = typer.Option(..., "--labels_path"),
+    parameters_file: str = typer.Option(..., "--parameters_file"),
+    out_path: str = typer.Option(..., "--output_path"),
+):
+    # `statistics` command: all four options are required (`...` default).
+    # Note the CLI flag is `--output_path` while the Python parameter is
+    # named `out_path`. The parameters file is parsed as YAML and passed,
+    # together with the other arguments, to `generate_statistics`.
+    with open(parameters_file) as f:
+        parameters = yaml.safe_load(f)
+
+    generate_statistics(data_path, labels_path, parameters, out_path)
+
+
+if __name__ == "__main__":
+    app()
diff --git a/examples/BraTS2024/data_prep/project/prepare.py b/examples/BraTS2024/data_prep/project/prepare.py
@@ -0,0 +1,64 @@
+import os
+import random
+import shutil
+from glob import iglob
+
+random.seed(7)
+
+
+def __copy_modalities(input_folder, modalities, output_folder):
+    for file in iglob(os.path.join(input_folder, "*.nii.gz")):
+        for modality in modalities:
+            if file.endswith(f"{modality}.nii.gz"):
+                new_file = os.path.join(output_folder, os.path.basename(file))
+                shutil.copyfile(file, new_file)
+                break
+
+
+def copy_radiotherapy_data(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    # copy data
+    modalities = parameters["segmentation_modalities"]
+    for folder in iglob(os.path.join(data_path, "*/")):
+        outfolder = os.path.join(
+            output_path, os.path.basename(os.path.normpath(folder))
+        )
+        os.makedirs(outfolder, exist_ok=True)
+        __copy_modalities(folder, modalities, outfolder)
+
+    # copy labels
+    modality = parameters["label_modality"]
+    for f in iglob(os.path.join(labels_path, "*")):
+        if os.path.isdir(f):
+            __copy_modalities(f, [modality], output_labels_path)
+        else:
+            if f.endswith(f"{modality}.nii.gz"):
+                new_file = os.path.join(output_labels_path, os.path.basename(f))
+                shutil.copyfile(f, new_file)
+
+
+def copy_pathology_data(data_path, labels_path, output_path, output_labels_path):
+    # copy data
+    for file in iglob(os.path.join(data_path, "*.png")):
+        new_file = os.path.join(output_path, os.path.basename(file))
+        shutil.copyfile(file, new_file)
+
+    # copy labels
+    for file in iglob(os.path.join(labels_path, "*.csv")):
+        new_file = os.path.join(output_labels_path, os.path.basename(file))
+        shutil.copyfile(file, new_file)
+
+
+def prepare_dataset(
+    data_path, labels_path, parameters, output_path, output_labels_path
+):
+    task = parameters["task"]
+    assert task in ["segmentation-radiotherapy", "pathology"], "Invalid task"
+    os.makedirs(output_path, exist_ok=True)
+    os.makedirs(output_labels_path, exist_ok=True)
+
+    if task == "segmentation-radiotherapy":
+        copy_radiotherapy_data(data_path, labels_path, parameters, output_path, output_labels_path)
+    else:
+        copy_pathology_data(data_path, labels_path, output_path, output_labels_path)
diff --git a/examples/BraTS2024/data_prep/project/requirements.txt b/examples/BraTS2024/data_prep/project/requirements.txt
@@ -0,0 +1,4 @@
+pyYAML
+typer
+numpy
+SimpleITK>=2.1.0
diff --git a/examples/BraTS2024/data_prep/project/sanity_check.py b/examples/BraTS2024/data_prep/project/sanity_check.py
@@ -0,0 +1,60 @@
+import os
+
+
+def check_subject_validity_for_segmentation(labels_path, subject_dir, parameters):
+    modalities = parameters["segmentation_modalities"]
+    label_modality = parameters["label_modality"]
+
+    # data
+    strings_to_check = [f"_{modality}.nii.gz" for modality in modalities]
+    for string in strings_to_check:
+        if not os.path.isfile(
+            os.path.join(subject_dir, os.path.basename(subject_dir) + string)
+        ):
+            raise ValueError(
+                f"{os.path.basename(subject_dir)} does not contain all modalities"
+            )
+
+    assert len(os.listdir(subject_dir)) == len(
+        modalities
+    ), "invalid number of modalities"
+
+    # labels
+    if not os.path.isfile(
+        os.path.join(
+            labels_path, os.path.basename(subject_dir) + f"_{label_modality}.nii.gz"
+        )
+    ):
+        raise ValueError(
+            f"{os.path.basename(subject_dir)} does not contain segmentation labels"
+        )
+
+
+def check_subject_validity_for_pathology(labels_path, data_path):
+    # data
+    if not all(file.endswith("png") for file in os.listdir(data_path)):
+        raise ValueError(
+            f"{os.path.basename(data_path)} should only contain PNG files"
+        )
+
+    # labels
+    assert len(os.listdir(labels_path)) == 1, "invalid number of labels file"
+    if not os.listdir(labels_path)[0].endswith("csv"):
+        raise ValueError(
+            f"{labels_path} does not contain classification labels in a CSV file"
+        )
+
+
+def perform_sanity_checks(data_path, labels_path, parameters):
+    task = parameters["task"]
+
+    if task == "segmentation-radiotherapy":
+        data_folders = os.listdir(data_path)
+        for folder in data_folders:
+            current_subject = os.path.join(data_path, folder)
+            assert os.path.isdir(current_subject), "Unexpected file found"
+            check_subject_validity_for_segmentation(
+                labels_path, current_subject, parameters
+            )
+    else:
+        check_subject_validity_for_pathology(labels_path, data_path)
diff --git a/examples/BraTS2024/data_prep/project/stats.py b/examples/BraTS2024/data_prep/project/stats.py
@@ -0,0 +1,11 @@
+import os
+import yaml
+
+
+def generate_statistics(data_path, labels_path, parameters, out_path):
+    stats = {
+        "Number of Subjects": len(os.listdir(data_path)),
+    }
+
+    with open(out_path, "w") as f:
+        yaml.dump(stats, f)
diff --git a/examples/BraTS2024/dummy_model/README.md b/examples/BraTS2024/dummy_model/README.md
@@ -0,0 +1,32 @@
+# BraTS 2024 Dummy Models
+
+Reference model MLCube for:
+
+* [Meningioma Radiotherapy](https://www.synapse.org/Synapse:syn53708249/wiki/627503)
+* [Pathology](https://www.synapse.org/Synapse:syn53708249/wiki/628091)
+
+## Example model outputs
+
+### Radiotherapy
+
+A single folder with segmentation files, e.g.
+
+```
+predictions
+├── BraTS-MEN-RT-xxxx-x.nii.gz
+├── BraTS-MEN-RT-yyyy-y.nii.gz
+└── BraTS-MEN-RT-zzzz-z.nii.gz
+```
+
+### Pathology
+
+A 2-column CSV with `SubjectID` and `Prediction` as the headers, e.g.
+
+```
+SubjectID,Prediction
+BraTSPath_cohort_xxxxxxx.png,A
+BraTSPath_cohort_yyyyyyy.png,B
+BraTSPath_cohort_zzzzzzz.png,C
+```
+
+where `A`, `B`, and `C` are integers from 0 to 5.