chore: remove PyTorch 2.5.0 checks #1877

Merged
7 commits merged on Nov 18, 2024
5 changes: 3 additions & 2 deletions docs/source/tutorials/memory_optimizations.rst
@@ -108,8 +108,9 @@ tensors will be offloaded.

*Sounds great! How do I use it?*

To enable activation offloading, use ``enable_activation_offloading=True``. If you are on torch
version later than PyTorch 2.5.0, it will allow the usage of multiple CUDA streams automatically.
To enable activation offloading, use the ``enable_activation_offloading`` config entry or flag
in our lora finetuning single device recipe, e.g. ``enable_activation_offloading=True``. To allow
usage of streams, make sure you are on a torch version equal to or later than PyTorch.
Contributor:
Suggested change:
- usage of streams, make sure you are on a torch version equal to or later than PyTorch.
+ usage of streams, make sure you are on a torch version equal to or later than PyTorch 2.5.0.

Contributor:
bumping this

Collaborator:
I think we need a merge here? The docs in main read:

To enable activation offloading, use enable_activation_offloading=True. If you are on torch version later than PyTorch 2.5.0, it will allow the usage of multiple CUDA streams automatically.

Collaborator:
I believe we can just revert the changes here to leave the file as-is since it's been updated in another PR @JP-sDEV


.. _glossary_grad_accm:

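For reference, a minimal sketch of how an ``enable_activation_offloading`` flag could be wired to the offloading context manager inside a recipe. This is illustrative rather than the recipe's exact code, and it assumes ``OffloadActivations`` is exposed as ``torchtune.training.OffloadActivations``:

    import contextlib

    from torchtune import training

    enable_activation_offloading = True  # hypothetical value read from the recipe config

    # Offload activations when the flag is set; otherwise use a no-op context.
    activations_handling_ctx = (
        training.OffloadActivations(use_streams=True)
        if enable_activation_offloading
        else contextlib.nullcontext()
    )

    with activations_handling_ctx:
        pass  # the model forward pass would run here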
6 changes: 3 additions & 3 deletions recipes/lora_finetune_distributed.py
@@ -69,9 +69,9 @@ class LoRAFinetuneRecipeDistributed(FTRecipeInterface):
back during the backward pass. As always, there is a tradeoff--these savings in memory can
come at the cost of training performance and CPU resources. To recover some runtime cost,
we've added an option to enable offloading on a different stream to permit overlapping with
the computation. This option is currently only available on PyTorch 2.5 or later and will
be enabled by default if an acceptable torch version is found. Activation offloading can be
used in conjunction with activation checkpointing.
the computation. This option is currently only available on PyTorch 2.5.0 or later and will be
enabled by default if an acceptable torch version is found. Activation offloading can be used in
conjunction with activation checkpointing.

- Precision. Full fp32 and bf16 training are supported. Precision is controlled using the ``dtype``
flag. When ``dtype=bf16``, all activations, gradients and optimizer states are in bfloat16. In
4 changes: 2 additions & 2 deletions tests/torchtune/modules/test_attention_utils.py
@@ -84,7 +84,7 @@ def test_packed_block_causal_mask_sdpa(self, seq_lens):

    @pytest.mark.skipif(
Contributor:
OK, looks like we need to keep this check in case the hardware that runs the GPU tests on GitHub CI does not support flex attention.

        not _SUPPORTS_FLEX_ATTENTION,
        reason="Please install a nightly build of torch (>=2.5.0) to run this test.",
        reason="Hardware does not support Flex Attention.",
    )
    @gpu_test(gpu_count=1)
    def test_packed_block_causal_mask_flex(self):
@@ -100,7 +100,7 @@ def test_packed_block_causal_mask_flex(self):
class TestSDPAOrFlexAttention:
    @pytest.mark.skipif(
        not _SUPPORTS_FLEX_ATTENTION,
        reason="Please install a nightly build of torch (>=2.5.0) to run this test.",
        reason="Hardware does not support Flex Attention.",
    )
    @mock.patch("torchtune.modules.attention_utils.compile_friendly_flex_attention")
    @mock.patch(
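To show the new skip condition in isolation, a small sketch assuming the flag is importable from torchtune.utils._import_guard (as in the module changed below) and using a hypothetical test name:

    import pytest

    from torchtune.utils._import_guard import _SUPPORTS_FLEX_ATTENTION

    @pytest.mark.skipif(
        not _SUPPORTS_FLEX_ATTENTION,
        reason="Hardware does not support Flex Attention.",
    )
    def test_flex_attention_path():
        # Runs only when CUDA is available and the device is SM75 (Turing) or newer.
        ...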
6 changes: 3 additions & 3 deletions torchtune/modules/attention_utils.py
@@ -115,9 +115,9 @@ def packed_block_causal_mask(
    seq_lens: List[torch.Tensor],
SalmanMohammadi marked this conversation as resolved.
) -> _MaskType:
"""
Create a block causal document mask for a batch of packed sequences. If on
torch version >= 2.5.0, this is done by creating a mask_mod function with the
block causal logic and passing this into :func:`torch.nn.attention.flex_attention.create_block_mask`.
Create a block causal document mask for a batch of packed sequences. If
flex attention is supported by the current hardware, this is done by creating a mask_mod
function with the block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
The resultant BlockMask is a compressed representation of the full block causal
mask. If on an older version, a standard 2D block causal mask is created and returned.

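A hedged usage sketch of the function documented above; the seq_lens values are made up for illustration:

    import torch

    from torchtune.modules.attention_utils import packed_block_causal_mask

    # Two packed samples, each made of two documents of the given lengths.
    seq_lens = [torch.tensor([3, 5]), torch.tensor([4, 4])]
    mask = packed_block_causal_mask(seq_lens=seq_lens)
    # Yields a BlockMask when flex attention is supported, otherwise a standard block causal mask.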
6 changes: 1 addition & 5 deletions torchtune/modules/common_utils.py
@@ -149,11 +149,7 @@ def _register_reparametrize_state_dict_hooks(
        RuntimeError: If the low RAM reparametrize hook is used on Windows or an incompatible torch version.
    """
    if _use_low_cpu_ram:
        if torch.__version__ < "2.5.0.dev20240906":
            raise RuntimeError(
                "Low RAM reparametrize_as_dtype_state_dict_post_hook requires PyTorch 2.5.0.dev20240906 or later."
            )
        elif sys.platform == "win32":
        if sys.platform == "win32":
            # mmap.MAP_SHARED is not supported on Windows but this change targets colab.
            raise RuntimeError(
                "Low RAM reparametrize_as_dtype_state_dict_post_hook is not supported on Windows."
19 changes: 6 additions & 13 deletions torchtune/training/_activation_offloading.py
@@ -5,7 +5,7 @@
# LICENSE file in the root directory of this source tree.

import contextlib
from typing import Optional, Union
from typing import Union
from warnings import warn

import psutil
@@ -38,9 +38,9 @@ class OffloadActivations(saved_tensors_hooks):
memory on the CPU. Pinned memory allows the Tensor to be moved back onto GPU more quickly
but is a limited resource. Default: True.

use_streams (Optional[bool]): Whether or not to use streams for performance optimization where
use_streams (bool): Whether or not to use streams for performance optimization where
the communications get overlapped with the computation. Requires a torch build
after torch-2.5.0.dev20240907. Default: True if a later torch build is found, else False.
after torch-2.5.0.]. Default: True.
Contributor:
Suggested change:
- after torch-2.5.0.]. Default: True.
+ after torch-2.5.0. Default: True.

Contributor:
bumping this


max_fwd_stash_size (int): The maximum size of the forward stash, or the maximum number of
consecutive activations to keep alive during the forward pass. This number must be at
@@ -67,15 +67,12 @@ class OffloadActivations(saved_tensors_hooks):
    def __init__(
        self,
        use_pin_memory: bool = True,
        use_streams: Optional[bool] = None,
        use_streams: bool = True,
        max_fwd_stash_size: int = 5,
        min_offload_size: int = 1024,
    ) -> None:
Contributor:
let's remove the check below and make use_streams: bool = True

@JP-sDEV (Contributor, Author) on Nov 14, 2024:
are you referring to this use_streams check found in __init__?

        if use_streams is None:
            # Default to True if an acceptable torch is installed (later nightly/version or from source)
            self.use_streams = torch.__version__ >= "2.5.0.dev20240907"
        else:
            self.use_streams = use_streams

or should it be changed like this, where use_streams is set to False?

        if use_streams is False:
            # Default to True if an acceptable torch is installed (later nightly/version or from source)
            self.use_streams = torch.__version__ >= "2.5.0.dev20240907"
        else:
            self.use_streams = use_streams

@RdoubleA (Contributor) on Nov 14, 2024:
yep! it would just be:

self.use_streams = use_streams

Contributor (Author):
I have noticed that OffloadActivations in torchtune/training/_activation_offloading.py still checks for torch version 2.5.0.dev20240907. Should this check also be removed?

        # for streaming
        if self.use_streams:
            if torch.__version__ < "2.5.0.dev20240907":
                raise RuntimeError(
                    "OffloadActivations with use_streams=True requires PyTorch 2.5.0.dev20240907 or later."
                )

Contributor:
Yes, good catch. Let's remove that as well. I believe these may have been added after I put the issue up, or maybe I just missed it

Contributor (Author):
sounds good, will update all 2.5.0.dev20240907 checks in the file

        if use_streams is None:
            # Default to True if an acceptable torch is installed (later nightly/version or from source)
            self.use_streams = torch.__version__ >= "2.5.0.dev20240907"
        else:
            self.use_streams = use_streams

        self.use_streams: bool = use_streams

        self.min_tensor_size_bytes = (
            min_offload_size  # we don't want to bother with small tensors
@@ -98,10 +95,6 @@ def __init__(

        # for streaming
        if self.use_streams:
            if torch.__version__ < "2.5.0.dev20240907":
                raise RuntimeError(
                    "OffloadActivations with use_streams=True requires PyTorch 2.5.0.dev20240907 or later."
                )
            self.s1 = torch.cuda.Stream()  # comms stream
            self.fwd_stash = {}  # tensor_id => (activation, ev1)
            if max_fwd_stash_size < 1:
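A minimal sketch of using OffloadActivations directly with the simplified default (use_streams=True). It assumes the class is exported from torchtune.training and that a CUDA device is available:

    import torch

    from torchtune.training import OffloadActivations

    model = torch.nn.Linear(1024, 1024, device="cuda")
    x = torch.randn(8, 1024, device="cuda", requires_grad=True)

    # saved_tensors_hooks context: activations saved for backward are moved to CPU
    # during the forward pass and brought back on demand during backward.
    with OffloadActivations(use_streams=True):
        loss = model(x).sum()
    loss.backward()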
27 changes: 9 additions & 18 deletions torchtune/training/_compile.py
@@ -17,7 +17,7 @@
)
from torchtune.modules.loss import CEWithChunkedOutputLoss
from torchtune.modules.model_fusion import DeepFusionModel
from torchtune.utils import get_logger, torch_version_ge
from torchtune.utils import get_logger

log = get_logger("INFO")

@@ -42,23 +42,14 @@ def compile_model(
    backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
    if isinstance(model, DeepFusionModel):
        model = model.decoder
    if torch_version_ge("2.5.0"):
        if verbose:
            log.info("Compiling model layers with torch.compile...")
        for m in reversed(list(model.modules())):
            if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
                m, TransformerCrossAttentionLayer
            ):
                m.compile(backend=backend)
    else:
        if verbose:
            log.info(
                """
                Compiling full model with torch.compile...
                For faster compile times via per-layer compile, please run on PyTorch nightlies.
                """
            )
        model.compile(backend=backend)
    # Per-layer compilation by default
    if verbose:
        log.info("Compiling model layers with torch.compile...")
    for m in reversed(list(model.modules())):
        if isinstance(m, TransformerSelfAttentionLayer) or isinstance(
            m, TransformerCrossAttentionLayer
        ):
            m.compile(backend=backend)


def compile_loss(loss: nn.Module, verbose: bool = True) -> nn.Module:
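The new unconditional path boils down to per-layer compilation. A generic sketch of that pattern; compile_layers is a hypothetical helper name, and layer_types would be the torchtune transformer layer classes:

    import os

    import torch.nn as nn

    def compile_layers(model: nn.Module, layer_types: tuple) -> None:
        # Compile each matching layer rather than the whole model, mirroring the
        # per-layer path kept above, for faster compile times.
        backend = os.environ.get("TORCH_COMPILE_BACKEND", "inductor")
        for m in reversed(list(model.modules())):
            if isinstance(m, layer_types):
                m.compile(backend=backend)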
5 changes: 1 addition & 4 deletions torchtune/utils/_import_guard.py
@@ -5,11 +5,8 @@
# LICENSE file in the root directory of this source tree.

import torch
from torchtune.utils._version import torch_version_ge

# We can only use flex attention / BlockMask if torch version >= 2.5.0 and GPU is Turing / SM75 and above
_SUPPORTS_FLEX_ATTENTION = (
torch_version_ge("2.5.0")
and torch.cuda.is_available()
and torch.cuda.get_device_capability() >= (7, 5)
torch.cuda.is_available() and torch.cuda.get_device_capability() >= (7, 5)
)