forked from easybuilders/easybuild-easyconfigs
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3 from lexming/20240122121636_new_pr_PyTorch212_patch: Two more patches for PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb
- Loading branch information
Showing
3 changed files
with
81 additions
and
0 deletions.
There are no files selected for viewing
56 changes: 56 additions & 0 deletions
56
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_disable-cudnn-tf32-for-too-strict-tests.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
Disallow TF32 on tests with thresholds too strict for this data type. Nvidia | ||
GPUs with TF32 support default to this data type instead of regular FP32 to | ||
improve performance at the expense of precision. | ||
author: Alex Domingo (Vrije Universiteit Brussel) | ||
--- test/test_nn.py.orig 2024-01-15 14:07:35.421908795 +0100 | ||
+++ test/test_nn.py 2024-01-15 14:54:00.867537101 +0100 | ||
@@ -3762,6 +3761,7 @@ | ||
self.assertEqual(weight_data, all_vars[4].data) | ||
|
||
@unittest.skipIf(not TEST_CUDNN, 'CUDNN not available') | ||
+ @torch.backends.cudnn.flags(enabled=True, allow_tf32=False) | ||
def test_cudnn_weight_tying(self): | ||
rnns = [ | ||
nn.LSTM(10, 20, batch_first=True, bidirectional=True), | ||
@@ -4461,6 +4461,7 @@ | ||
self._test_RNN_cpu_vs_cudnn(1) | ||
|
||
@unittest.skipIf(not TEST_CUDNN, "needs cudnn") | ||
+ @torch.backends.cudnn.flags(enabled=True, allow_tf32=False) | ||
def test_RNN_cudnn_weight_norm(self): | ||
input_size = 10 | ||
hidden_size = 6 | ||
@@ -4492,6 +4493,7 @@ | ||
check_weight_norm(nn.LSTM(input_size, hidden_size, num_layers, proj_size=3), 'weight_hr_l0') | ||
|
||
@unittest.skipIf(not TEST_CUDA, 'CUDA not available') | ||
+ @torch.backends.cudnn.flags(enabled=True, allow_tf32=False) | ||
def test_partial_flat_weights(self): | ||
input_size = 10 | ||
hidden_size = 6 | ||
--- test/jit/test_freezing.py.orig 2024-01-15 14:38:11.054125484 +0100 | ||
+++ test/jit/test_freezing.py 2024-01-15 14:49:41.689011617 +0100 | ||
@@ -2733,7 +2733,11 @@ | ||
else: | ||
FileCheck().check("aten::cudnn_convolution_relu").run(frozen_mod.graph) | ||
|
||
- self.assertEqual(mod_eager(inp), frozen_mod(inp)) | ||
+ if not TEST_WITH_ROCM: | ||
+ with torch.backends.cudnn.flags(enabled=True, allow_tf32=False): | ||
+ self.assertEqual(mod_eager(inp), frozen_mod(inp)) | ||
+ else: | ||
+ self.assertEqual(mod_eager(inp), frozen_mod(inp)) | ||
|
||
@unittest.skipIf(not (TEST_CUDNN or TEST_WITH_ROCM), "requires CUDNN") | ||
def test_freeze_conv_relu_fusion_not_forward(self): | ||
--- ../PyTorch/2.1.2/foss-2023a-CUDA-12.1.1/pytorch-v2.1.2/test/nn/test_convolution.py 2023-12-15 03:03:27.000000000 +0100 | ||
+++ test/nn/test_convolution.py 2024-01-15 15:03:15.606208376 +0100 | ||
@@ -518,7 +518,7 @@ | ||
# Covering special case when group > 1, input-channel / group < 16 and output-channel is multiple of 16 | ||
# See also https://github.com/pytorch/pytorch/pull/18463#issuecomment-476563686 | ||
# and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024 | ||
- @torch.backends.cudnn.flags(enabled=True, benchmark=False) | ||
+ @torch.backends.cudnn.flags(enabled=True, benchmark=False, allow_tf32=False) | ||
def test_Conv2d_groups_nobias_v2(self): | ||
torch.manual_seed(123) | ||
dev_dtypes = [("cpu", torch.float)] |
19 changes: 19 additions & 0 deletions
19
easybuild/easyconfigs/p/PyTorch/PyTorch-2.1.0_skip-test-linalg-svd-complex.patch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
Skip test_python_ref_meta__refs_linalg_svd_cpu_complex | ||
Result varies depending on underlying device | ||
see https://github.com/pytorch/pytorch/issues/105068 | ||
author: Alex Domingo (Vrije Universiteit Brussel) | ||
--- test/test_ops.py.orig 2024-01-16 15:37:02.596411122 +0100 | ||
+++ test/test_ops.py 2024-01-16 15:39:02.824489395 +0100 | ||
@@ -311,6 +311,12 @@ | ||
return out | ||
return x | ||
|
||
+ # Skip test_python_ref_meta__refs_linalg_svd_cpu_complex | ||
+ # Result varies depending on underlying device | ||
+ # see https://github.com/pytorch/pytorch/issues/105068 | ||
+ if op.name == '_refs.linalg.svd' and dtype in (torch.complex64, torch.complex128): | ||
+ self.skipTest("Unreliable on certain devices, see issue #105068") | ||
+ | ||
# TODO: iterate over requires_grad true/false | ||
for sample in op.reference_inputs(device, dtype, requires_grad=False): | ||
result = op(sample.input, *sample.args, **sample.kwargs) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters