Project-MONAI · KumoLiu · Aug 26, 2024 · Aug 15, 2024 · Aug 15, 2024 · Aug 16, 2024
diff --git a/docs/source/apps.rst b/docs/source/apps.rst
@@ -248,6 +248,10 @@ FastMRIReader
 ~~~~~~~~~~~~~
 .. autofunction:: monai.apps.reconstruction.complex_utils.complex_conj
 
+`Vista3d`
+---------
+.. autofunction:: monai.apps.vista3d.inferer.point_based_window_inferer
+
 `Auto3DSeg`
 -----------
 .. automodule:: monai.apps.auto3dseg

diff --git a/...i/apps/generation/maisi/utils/__init__.py → monai/apps/vista3d/__init__.py b/...i/apps/generation/maisi/utils/__init__.py → monai/apps/vista3d/__init__.py
diff --git a/monai/apps/vista3d/inferer.py b/monai/apps/vista3d/inferer.py
@@ -0,0 +1,165 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import copy
+from collections.abc import Sequence
+from typing import Any
+
+import torch
+
+from monai.data.meta_tensor import MetaTensor
+from monai.utils import optional_import
+
+# `tqdm` is pulled in through MONAI's `optional_import` helper.
+# NOTE(review): `tqdm` is not referenced by the code visible in this diff - confirm it is still needed.
+tqdm, _ = optional_import("tqdm", name="tqdm")
+
+# Only the inferer entry point is exported; the underscore-prefixed helpers below stay private.
+__all__ = ["point_based_window_inferer"]
+
+
+def point_based_window_inferer(
+    inputs: torch.Tensor | MetaTensor,
+    roi_size: Sequence[int],
+    predictor: torch.nn.Module,
+    point_coords: torch.Tensor,
+    point_labels: torch.Tensor,
+    class_vector: torch.Tensor | None = None,
+    prompt_class: torch.Tensor | None = None,
+    prev_mask: torch.Tensor | MetaTensor | None = None,
+    point_start: int = 0,
+    **kwargs: Any,
+) -> torch.Tensor:
+    """
+    Point based window inferer: crop a patch centered at each point click and run the predictor on it.
+
+    For every point in ``point_coords[0][point_start:]`` a window of ``roi_size`` voxels is placed around the
+    point (shifted where necessary so that it stays inside the image), the predictor is called on that patch and
+    its output is accumulated into a CPU buffer of the full image size. Outputs of overlapping windows are added
+    together; no Gaussian or count-based weighting is applied. Voxels that are not covered by any window are
+    ``NaN`` in the result, unless ``prev_mask`` is given, in which case they are copied from ``prev_mask``.
+
+    Args:
+        inputs: input image to be processed, assumed to be ``[N, C, H, W, D]`` (three spatial dimensions
+            are required). If a spatial dimension is smaller than ``roi_size``, the image is zero-padded
+            for inference and the padding is removed from the returned tensor.
+        roi_size: the spatial window size ``(x, y, z)`` for inference. The three components are used as given,
+            so they are expected to be positive integers; ``None`` or non-positive values are not resolved
+            to the image size by this function.
+        predictor: callable invoked as ``predictor(patch, point_coords=..., point_labels=..., class_vector=...,
+            prompt_class=..., patch_coords=..., prev_mask=..., **kwargs)``, e.g. ``partial(infer_wrapper, model)``
+            where ``infer_wrapper`` transposes the model output from ``[B, 1, H, W, D]`` to ``[1, B, H, W, D]``.
+            The output must have the same spatial size as the patch it was computed from.
+        point_coords: ``[B, N, 3]`` point click coordinates, in voxel indices of the un-padded ``inputs``.
+        point_labels: ``[B, N]``, forwarded to the predictor unchanged.
+        class_vector: ``[B]``, forwarded to the predictor unchanged.
+        prompt_class: forwarded to the predictor unchanged.
+        prev_mask: ``[1, B, H, W, D]``, the value is before sigmoid. It is padded like ``inputs``, forwarded to
+            the predictor, and used to fill the voxels that no window covered.
+        point_start: index of the first point that gets its own window. Points before it are still passed
+            to the predictor as part of ``point_coords`` but no window is centered on them.
+        kwargs: additional keyword arguments forwarded to the predictor.
+
+    Returns:
+        stitched_output: ``[1, B, H, W, D]`` on CPU, wrapped as a ``MetaTensor`` carrying the meta data of
+        ``inputs``. The value is before sigmoid.
+
+    Raises:
+        ValueError: if ``point_coords.shape[0]`` is not 1, or if ``point_start`` leaves no point to process.
+
+    Notice: The function only supports SINGLE OBJECT INFERENCE with B=1.
+    """
+    if point_coords.shape[0] != 1:
+        raise ValueError("Only supports single object point click.")
+    image, pad = _pad_previous_mask(copy.deepcopy(inputs), roi_size)
+    # `pad` follows the torch.nn.functional.pad layout (last dimension first), so the leading pads of the
+    # three spatial axes are pad[-2], pad[-4] and pad[-6]; shift the clicks into the padded image.
+    point_coords = point_coords + torch.tensor([pad[-2], pad[-4], pad[-6]]).to(point_coords.device)
+    prev_mask = _pad_previous_mask(copy.deepcopy(prev_mask), roi_size)[0] if prev_mask is not None else None
+    window_centers = point_coords[0][point_start:]
+    if window_centers.shape[0] == 0:
+        # without this guard the stitching below would fail on an unallocated buffer
+        raise ValueError("No point click to process: `point_start` must be smaller than the number of points.")
+    stitched_output = None
+    stitched_mask = None
+    center_only = True
+    for p in window_centers:
+        lx_, rx_ = _get_window_idx(p[0], roi_size[0], image.shape[-3], center_only=center_only, margin=5)
+        ly_, ry_ = _get_window_idx(p[1], roi_size[1], image.shape[-2], center_only=center_only, margin=5)
+        lz_, rz_ = _get_window_idx(p[2], roi_size[2], image.shape[-1], center_only=center_only, margin=5)
+        for lx, rx in zip(lx_, rx_):
+            for ly, ry in zip(ly_, ry_):
+                for lz, rz in zip(lz_, rz_):
+                    unravel_slice = [
+                        slice(None),
+                        slice(None),
+                        slice(int(lx), int(rx)),
+                        slice(int(ly), int(ry)),
+                        slice(int(lz), int(rz)),
+                    ]
+                    # index tensors with a tuple; the predictor still receives the list, as before
+                    patch_index = tuple(unravel_slice)
+                    batch_image = image[patch_index]
+                    output = predictor(
+                        batch_image,
+                        point_coords=point_coords,
+                        point_labels=point_labels,
+                        class_vector=class_vector,
+                        prompt_class=prompt_class,
+                        patch_coords=unravel_slice,
+                        prev_mask=prev_mask,
+                        **kwargs,
+                    )
+                    if stitched_output is None:
+                        # allocated lazily because the channel count is only known from the first output
+                        stitched_shape = [1, output.shape[1], image.shape[-3], image.shape[-2], image.shape[-1]]
+                        stitched_output = torch.zeros(stitched_shape, device="cpu")
+                        stitched_mask = torch.zeros(stitched_shape, device="cpu")
+                    stitched_output[patch_index] += output.to("cpu")
+                    # NOTE(review): the mask is a 0/1 coverage flag, so overlapping windows are summed rather
+                    # than averaged - confirm this is intended.
+                    stitched_mask[patch_index] = 1
+    # if stitched_mask is 0, then NaN value
+    stitched_output = stitched_output / stitched_mask
+    # revert padding
+    unpad_index = (
+        slice(None),
+        slice(None),
+        slice(pad[4], image.shape[-3] - pad[5]),
+        slice(pad[2], image.shape[-2] - pad[3]),
+        slice(pad[0], image.shape[-1] - pad[1]),
+    )
+    stitched_output = stitched_output[unpad_index]
+    stitched_mask = stitched_mask[unpad_index]
+    if prev_mask is not None:
+        prev_mask = prev_mask[unpad_index]
+        prev_mask = prev_mask.to("cpu")  # type: ignore
+        # for un-calculated place, use previous mask
+        uncovered = stitched_mask < 1
+        stitched_output[uncovered] = prev_mask[uncovered]
+    if isinstance(inputs, torch.Tensor):
+        inputs = MetaTensor(inputs)
+    if not hasattr(stitched_output, "meta"):
+        stitched_output = MetaTensor(stitched_output, affine=inputs.meta["affine"], meta=inputs.meta)
+    return stitched_output
+
+
+def _get_window_idx_c(p: int, roi: int, s: int) -> tuple[int, int]:
+    """
+    Get the ``(left, right)`` bounds of a window of length ``roi`` centered at ``p`` along one axis.
+
+    The window starts at ``int(p) - roi // 2`` and is shifted back inside ``[0, s]`` when it would cross
+    either end of the axis. The returned window is always exactly ``roi`` long, including for odd ``roi``.
+
+    Args:
+        p: coordinate of the window center along the axis (truncated to an integer).
+        roi: window length.
+        s: axis length.
+
+    Returns:
+        ``(left, right)`` with ``right - left == roi``.
+    """
+    left = int(p) - roi // 2
+    # clamp to the upper end first and to zero last, so that `left` is never negative even if `s < roi`
+    left = max(min(left, s - roi), 0)
+    return left, left + roi
+
+
+def _get_window_idx(p: int, roi: int, s: int, center_only: bool = True, margin: int = 5) -> tuple[list[int], list[int]]:
+    """
+    Get the window bounds along one axis for a point click at ``p``.
+
+    Args:
+        p: coordinate of the point along the axis.
+        roi: window length.
+        s: axis length.
+        center_only: if True, only the window centered at ``p`` is returned.
+        margin: used when ``center_only`` is False. Two extra windows are returned, one ending ``margin``
+            voxels after ``p`` and one starting ``margin`` voxels before ``p``, each shifted where needed
+            so that it does not start below 0 or end past ``s``.
+
+    Returns:
+        ``(left_list, right_list)``: matching lists of window start and end indices. With ``center_only``
+        they hold one entry, otherwise three (left-shifted, right-shifted, centered).
+    """
+    left, right = _get_window_idx_c(p, roi, s)
+    if center_only:
+        return [left], [right]
+    center = int(p)
+    # keep both shifted windows inside the axis: a point closer than `margin` to an edge would otherwise
+    # produce a window that ends past `s` or starts below 0
+    left_most = max(0, min(center - roi + margin, s - roi))
+    right_most = max(roi, min(s, center + roi - margin))
+    left_list = [left_most, right_most - roi, left]
+    right_list = [left_most + roi, right_most, right]
+    return left_list, right_list
+
+
+def _pad_previous_mask(
+    inputs: torch.Tensor | MetaTensor, roi_size: Sequence[int], padvalue: int = 0
+) -> tuple[torch.Tensor | MetaTensor, list[int]]:
+    """
+    Pad the spatial dimensions of ``inputs`` so that each is at least as large as ``roi_size``.
+
+    Dimensions 0 and 1 are left untouched. For every following dimension the shortfall with respect
+    to ``roi_size`` is split between both sides, with the extra voxel of an odd shortfall going to the end.
+
+    Args:
+        inputs: tensor to pad; ``roi_size[i]`` applies to dimension ``i + 2``.
+        roi_size: minimal size of each spatial dimension.
+        padvalue: constant used to fill the padded voxels.
+
+    Returns:
+        The padded tensor (the input itself when no padding is needed) and the pad sizes in
+        ``torch.nn.functional.pad`` order, i.e. starting from the last dimension.
+    """
+    pad_size: list[int] = []
+    # walk the spatial axes from last to first, which is the order torch.nn.functional.pad expects
+    for axis in reversed(range(2, len(inputs.shape))):
+        shortfall = max(roi_size[axis - 2] - inputs.shape[axis], 0)
+        before = shortfall // 2
+        pad_size += [before, shortfall - before]
+    if not any(pad_size):
+        return inputs, pad_size
+    padded = torch.nn.functional.pad(inputs, pad=pad_size, mode="constant", value=padvalue)  # type: ignore
+    return padded, pad_size
diff --git a/monai/inferers/utils.py b/monai/inferers/utils.py
@@ -300,6 +300,7 @@ def sliding_window_inference(
 
     # remove padding if image_size smaller than roi_size
     if any(pad_size):
+        kwargs.update({"pad_size": pad_size})
         for ss, output_i in enumerate(output_image_list):
             zoom_scale = [_shape_d / _roi_size_d for _shape_d, _roi_size_d in zip(output_i.shape[2:], roi_size)]
             final_slicing: list[slice] = []

diff --git a/monai/networks/nets/vista3d.py b/monai/networks/nets/vista3d.py
@@ -78,6 +78,35 @@ def __init__(self, image_encoder: nn.Module, class_head: nn.Module, point_head:
         self.NINF_VALUE = -9999
         self.PINF_VALUE = 9999
 
+    def update_slidingwindow_padding(
+        self,
+        pad_size: list | None,
+        labels: torch.Tensor | None,
+        prev_mask: torch.Tensor | None,
+        point_coords: torch.Tensor | None,
+    ):
+        """
+        Apply the padding of the sliding window inferer to the image-aligned inputs.
+
+        The sliding window inferer pads the image only, so the tensors that are aligned with the image
+        have to be padded (or shifted, for the point coordinates) separately.
+
+        Args:
+            pad_size: padding size passed from the sliding window inferer, in ``torch.nn.functional.pad``
+                order. If None, all inputs are returned unchanged.
+            labels: image label ground truth, zero-padded when given.
+            prev_mask: previous segmentation mask, zero-padded when given.
+            point_coords: point click coordinates, shifted by the leading pad of each axis when given.
+
+        Returns:
+            The updated ``(labels, prev_mask, point_coords)``.
+        """
+        if pad_size is None:
+            return labels, prev_mask, point_coords
+
+        def _zero_pad(volume: torch.Tensor | None) -> torch.Tensor | None:
+            return None if volume is None else F.pad(volume, pad=pad_size, mode="constant", value=0)
+
+        labels, prev_mask = _zero_pad(labels), _zero_pad(prev_mask)
+        if point_coords is not None:
+            # pad_size lists the last dimension first, so the leading pads are read from the end
+            offset = torch.tensor([pad_size[-2], pad_size[-4], pad_size[-6]], device=point_coords.device)
+            point_coords = point_coords + offset
+        return labels, prev_mask, point_coords
+
     def get_foreground_class_count(self, class_vector: torch.Tensor | None, point_coords: torch.Tensor | None) -> int:
         """Get number of foreground classes based on class and point prompt."""
         if class_vector is None:
@@ -329,7 +358,7 @@ def forward(
             point_coords: [B, N, 3]
             point_labels: [B, N], -1 represents padding. 0/1 means negative/positive points for regular class.
                 2/3 means negative/positive points for special supported class like tumor.
-            class_vector: [B, 1], the global class index
+            class_vector: [B, 1], the global class index.
             prompt_class: [B, 1], the global class index. This value is associated with point_coords to identify if
                 the points are for zero-shot or supported class. When class_vector and point_coords are both
                 provided, prompt_class is the same as class_vector. For prompt_class[b] > 512, point_coords[b]
@@ -348,6 +377,9 @@ def forward(
             val_point_sampler: function used to sample points from labels. This is only used for point-only evaluation.
 
         """
+        labels, prev_mask, point_coords = self.update_slidingwindow_padding(
+            kwargs.get("pad_size", None), labels, prev_mask, point_coords
+        )
         image_size = input_images.shape[-3:]
         device = input_images.device
         if point_coords is None and class_vector is None:

diff --git a/tests/test_point_based_window_inferer.py b/tests/test_point_based_window_inferer.py
@@ -0,0 +1,56 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import unittest
+
+import torch
+from parameterized import parameterized
+
+from monai.apps.vista3d.inferer import point_based_window_inferer
+from monai.networks import eval_mode
+from monai.networks.nets.vista3d import vista3d132
+from monai.utils import optional_import
+from tests.utils import SkipIfBeforePyTorchVersion, skip_if_quick
+
+# Run on GPU when one is available, otherwise fall back to CPU.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# NOTE(review): `has_tqdm` is not used by the test visible in this diff - confirm it is needed.
+_, has_tqdm = optional_import("tqdm")
+
+# Each case is [vista3d132 constructor kwargs, input image shape, point_based_window_inferer kwargs].
+TEST_CASES = [
+    [
+        {"encoder_embed_dim": 48, "in_channels": 1},
+        (1, 1, 64, 64, 64),
+        {
+            "roi_size": [32, 32, 32],
+            # one object (leading dimension 1) with two clicks at the same location
+            "point_coords": torch.tensor([[[1, 2, 3], [1, 2, 3]]], device=device),
+            "point_labels": torch.tensor([[1, 0]], device=device),
+        },
+    ]
+]
+
+
+@SkipIfBeforePyTorchVersion((1, 11))
+@skip_if_quick
+class TestPointBasedWindowInferer(unittest.TestCase):
+    """Shape check of `point_based_window_inferer` with a VISTA3D network as the predictor."""
+
+    @parameterized.expand(TEST_CASES)
+    def test_vista3d(self, vista3d_params, inputs_shape, inferer_params):
+        """The stitched output must have the same shape as the random input image."""
+        network = vista3d132(**vista3d_params).to(device)
+        with eval_mode(network):
+            # the network and the image are added to the case's kwargs, which are then passed as a whole
+            inferer_params.update(predictor=network, inputs=torch.randn(*inputs_shape).to(device))
+            result = point_based_window_inferer(**inferer_params)
+            self.assertEqual(result.shape, inputs_shape)
+
+
+if __name__ == "__main__":
+    unittest.main()