From 51b5bd396a39bd8688dcd3a36e6c02f1c5dd9f6a Mon Sep 17 00:00:00 2001
From: Simon Branford
Date: Sat, 6 Mar 2021 10:13:56 +0000
Subject: [PATCH 1/3] adding easyconfigs: PyTorch-1.8.0-foss-2020b.eb

---
 .../p/PyTorch/PyTorch-1.8.0-foss-2020b.eb | 73 +++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb

diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb
new file mode 100644
index 000000000000..e2cd66dbed6e
--- /dev/null
+++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb
@@ -0,0 +1,73 @@
+name = 'PyTorch'
+version = '1.8.0'
+
+homepage = 'https://pytorch.org/'
+description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
+PyTorch is a deep learning framework that puts Python first."""
+
+toolchain = {'name': 'foss', 'version': '2020b'}
+
+sources = [{
+    'filename': '%(name)s-%(version)s.tar.gz',
+    'git_config': {
+        'url': 'https://github.com/pytorch',
+        'repo_name': 'pytorch',
+        'tag': 'v%(version)s',
+        'recursive': True,
+    },
+}]
+patches = [
+    'PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch',
+    'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
+    'PyTorch-1.7.0_increase-distributed-test-timeout.patch',
+    'PyTorch-1.7.0_disable-dev-shm-test.patch',
+]
+checksums = [
+    None,  # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
+    # PyTorch-1.6.0_fix-test-dataloader-fixed-affinity.patch
+    'a4208a46cd2098744daaba96cebb96cd91166f8fc616924315e05974bad80c67',
+    'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18',  # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
+    # PyTorch-1.7.0_increase-distributed-test-timeout.patch
+    '95abb468a35451fbd0f864ca843f6ad15ff8bfb909c3fd580f65859b26c9691c',
+    '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a',  # PyTorch-1.7.0_disable-dev-shm-test.patch
+]
+
+osdependencies = [OS_PKG_IBVERBS_DEV]
+
+builddependencies = [
+    ('CMake', '3.18.4'),
+    ('hypothesis', '5.41.5'),
+]
+
+dependencies = [
+    ('Ninja', '1.10.1'),  # Required for JIT compilation of C++ extensions
+    ('Python', '3.8.6'),
+    ('protobuf', '3.14.0'),
+    ('protobuf-python', '3.14.0'),
+    ('pybind11', '2.6.0'),
+    ('SciPy-bundle', '2020.11'),
+    ('typing-extensions', '3.7.4.3'),
+    ('PyYAML', '5.3.1'),
+    ('MPFR', '4.1.0'),
+    ('GMP', '6.2.0'),
+    ('numactl', '2.0.13'),
+    ('FFmpeg', '4.3.1'),
+    ('Pillow', '8.0.1'),
+]
+
+excluded_tests = {
+    '': [
+        # Tests from this suite often time out. The process group backend is deprecated anyway
+        'distributed/rpc/test_process_group_agent',
+        # Potentially problematic save/load issue with test_lstm on some machines
+ # https://github.com/pytorch/pytorch/issues/43209 + 'test_quantization', + ] +} + +runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --verbose %(excluded_tests)s' + +sanity_check_commands = ["python -c 'import caffe2.python'"] +tests = ['PyTorch-check-cpp-extension.py'] + +moduleclass = 'devel' From 3a0c33baeee2959a9ed17d1ef76b17cb265af01a Mon Sep 17 00:00:00 2001 From: Simon Branford Date: Fri, 12 Mar 2021 20:22:47 +0000 Subject: [PATCH 2/3] Add patches from PyTorch --- .../p/PyTorch/PyTorch-1.8.0-foss-2020b.eb | 5 + ...-1.8.0_correct-skip-tests-decorators.patch | 109 ++++++++++++++++++ ...Torch-1.8.0_fix-noMKL-linear-algebra.patch | 77 +++++++++++++ 3 files changed, 191 insertions(+) create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_correct-skip-tests-decorators.patch create mode 100644 easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_fix-noMKL-linear-algebra.patch diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb index e2cd66dbed6e..dea42dbd7d84 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb @@ -21,6 +21,8 @@ patches = [ 'PyTorch-1.7.0_avoid-nan-in-test-torch.patch', 'PyTorch-1.7.0_increase-distributed-test-timeout.patch', 'PyTorch-1.7.0_disable-dev-shm-test.patch', + 'PyTorch-1.8.0_fix-noMKL-linear-algebra.patch', + 'PyTorch-1.8.0_correct-skip-tests-decorators.patch', ] checksums = [ None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone' @@ -30,6 +32,9 @@ checksums = [ # PyTorch-1.7.0_increase-distributed-test-timeout.patch '95abb468a35451fbd0f864ca843f6ad15ff8bfb909c3fd580f65859b26c9691c', '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch + 'a1ca9382b0eb333090536633092bab6fa281d26b491bf7b1849117f68ab0730c', # PyTorch-1.8.0_fix-noMKL-linear-algebra.patch + # PyTorch-1.8.0_correct-skip-tests-decorators.patch + '4b2fe7616217dd6fd12d667cb1439dde58f84bf234fbf3e6026c4665fc697a2e', ] osdependencies = [OS_PKG_IBVERBS_DEV] diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_correct-skip-tests-decorators.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_correct-skip-tests-decorators.patch new file mode 100644 index 000000000000..89cbeae07597 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_correct-skip-tests-decorators.patch @@ -0,0 +1,109 @@ +Based off https://github.com/pytorch/pytorch/pull/53736 +Fixed to apply to PyTorch 1.8.0 by Simon Branford (University of Birmingham) +--- test/test_spectral_ops.py 2021-03-12 19:36:32.910758000 +0000 ++++ test/test_spectral_ops.py 2021-03-12 19:41:42.314654931 +0000 +@@ -185,6 +185,7 @@ + with self.assertRaisesRegex(RuntimeError, match): + op(t) + ++ @onlyOnCPUAndCUDA + def test_fft_invalid_dtypes(self, device): + t = torch.randn(64, device=device, dtype=torch.complex128) + +@@ -599,7 +600,6 @@ + _test_complex((40, 60, 3, 80), 3, lambda x: x.transpose(2, 0).select(0, 2)[5:55, :, 10:]) + _test_complex((30, 55, 50, 22), 3, lambda x: x[:, 3:53, 15:40, 1:21]) + +- @skipCUDAIfRocm + @skipCPUIfNoMkl + @onlyOnCPUAndCUDA + @dtypes(torch.double) +@@ -680,8 +680,8 @@ + self.assertEqual(torch.backends.cuda.cufft_plan_cache.max_size, 11) # default is cuda:1 + + # passes on ROCm w/ python 2.7, fails w/ python 3.6 +- @skipCUDAIfRocm + @skipCPUIfNoMkl ++ @onlyOnCPUAndCUDA + @dtypes(torch.double) + def test_stft(self, device, 
dtype): + if not TEST_LIBROSA: +@@ -712,9 +712,8 @@ + else: + window = None + if expected_error is None: +- with self.maybeWarnsRegex(UserWarning, "stft with return_complex=False"): +- result = x.stft(n_fft, hop_length, win_length, window, +- center=center, return_complex=False) ++ result = x.stft(n_fft, hop_length, win_length, window, ++ center=center, return_complex=False) + # NB: librosa defaults to np.complex64 output, no matter what + # the input dtype + ref_result = librosa_stft(x, n_fft, hop_length, win_length, window, center) +@@ -748,7 +747,7 @@ + _test((10,), 5, 4, win_sizes=(1, 1), expected_error=RuntimeError) + + +- @skipCUDAIfRocm ++ @onlyOnCPUAndCUDA + @skipCPUIfNoMkl + @dtypes(torch.double, torch.cdouble) + def test_complex_stft_roundtrip(self, device, dtype): +@@ -790,7 +789,7 @@ + length=x.size(-1), **common_kwargs) + self.assertEqual(x_roundtrip, x) + +- @skipCUDAIfRocm ++ @onlyOnCPUAndCUDA + @skipCPUIfNoMkl + @dtypes(torch.double, torch.cdouble) + def test_stft_roundtrip_complex_window(self, device, dtype): +@@ -831,6 +830,7 @@ + self.assertEqual(x_roundtrip, x) + + ++ @onlyOnCPUAndCUDA + @skipCUDAIfRocm + @skipCPUIfNoMkl + @dtypes(torch.cdouble) +@@ -851,7 +851,7 @@ + actual = torch.stft(*args, window=window, center=False) + self.assertEqual(actual, expected) + +- @skipCUDAIfRocm ++ @onlyOnCPUAndCUDA + @skipCPUIfNoMkl + @dtypes(torch.cdouble) + def test_complex_stft_real_equiv(self, device, dtype): +@@ -885,6 +885,7 @@ + center=center, normalized=normalized) + self.assertEqual(expected, actual) + ++ @onlyOnCPUAndCUDA + @skipCUDAIfRocm + @skipCPUIfNoMkl + @dtypes(torch.cdouble) +@@ -912,6 +913,7 @@ + return_complex=True) + self.assertEqual(expected, actual) + ++ @onlyOnCPUAndCUDA + @skipCUDAIfRocm + @skipCPUIfNoMkl + def test_complex_stft_onesided(self, device): +@@ -934,12 +936,15 @@ + x.stft(10, pad_mode='constant', onesided=True) + + # stft is currently warning that it requires return-complex while an upgrader is written ++ @onlyOnCPUAndCUDA ++ @skipCPUIfNoMkl + def test_stft_requires_complex(self, device): + x = torch.rand(100) + y = x.stft(10, pad_mode='constant') + # with self.assertRaisesRegex(RuntimeError, 'stft requires the return_complex parameter'): + # y = x.stft(10, pad_mode='constant') + ++ @onlyOnCPUAndCUDA + @skipCUDAIfRocm + @skipCPUIfNoMkl + def test_fft_input_modification(self, device): diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_fix-noMKL-linear-algebra.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_fix-noMKL-linear-algebra.patch new file mode 100644 index 000000000000..d9eeefb653a5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0_fix-noMKL-linear-algebra.patch @@ -0,0 +1,77 @@ +From f4824c2eca26887a0f7aeebb4e966c278258142a Mon Sep 17 00:00:00 2001 +From: Ivan Yashchuk +Date: Fri, 12 Mar 2021 17:54:19 +0000 +Subject: [PATCH] Fixed worksize + +--- + aten/src/ATen/native/BatchLinearAlgebra.cpp | 8 ++++---- + aten/src/ATen/native/BatchLinearAlgebra.h | 2 +- + aten/src/ATen/native/BatchLinearAlgebraKernel.cpp | 2 +- + 3 files changed, 6 insertions(+), 6 deletions(-) + +diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp +index 39d4291b7a8d..2daf7a93cdea 100644 +--- a/aten/src/ATen/native/BatchLinearAlgebra.cpp ++++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp +@@ -695,7 +695,7 @@ static void apply_inverse(Tensor& self, Tensor& infos_lu, Tensor& infos_getri) { + int lwork = -1; + scalar_t wkopt; + lapackGetri(n, self_data, lda, ipiv_data, &wkopt, lwork, &info); +- 
lwork = static_cast(real_impl(wkopt)); ++ lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, self.options()); + auto work_data = work.data_ptr(); + +@@ -1211,7 +1211,7 @@ static void apply_geqrf(Tensor& self, Tensor& tau, int64_t m, int64_t n, + int lwork = -1; + scalar_t wkopt; + lapackGeqrf(m, n, self_data, m, tau_data, &wkopt, lwork, &info); +- lwork = static_cast(real_impl(wkopt)); ++ lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, self.options()); + + for (const auto i : c10::irange(batch_size)) { +@@ -1626,7 +1626,7 @@ static void apply_symeig(Tensor& self, Tensor& eigvals, bool eigenvectors, bool + } + + lapackSymeig(jobz, uplo, n, self_data, n, eigvals_data, &wkopt, lwork, rwork_data, &info); +- lwork = static_cast(real_impl(wkopt)); ++ lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, self.options()); + + for (const auto i : c10::irange(batch_size)) { +@@ -1782,7 +1782,7 @@ static void apply_svd(Tensor& self, Tensor& U, Tensor& S, Tensor& VT, + int lwork = -1; + scalar_t wkopt; + lapackSvd(jobz, m, n, self_data, lda, S_data, U_data, lda, VT_data, ldvt, &wkopt, lwork, rwork_data, iwork_data, &info); +- lwork = static_cast(real_impl(wkopt)); ++ lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, self.options()); + auto work_data = work.data_ptr(); + +diff --git a/aten/src/ATen/native/BatchLinearAlgebra.h b/aten/src/ATen/native/BatchLinearAlgebra.h +index 138819f6f4cd..59e71e2964e5 100644 +--- a/aten/src/ATen/native/BatchLinearAlgebra.h ++++ b/aten/src/ATen/native/BatchLinearAlgebra.h +@@ -82,7 +82,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau, Tensor& infos, int64_t + int lwork = -1; + scalar_t wkopt; + lapackOrgqr(m, n_columns, k, self_data, lda, tau_data, &wkopt, lwork, &infos_data[0]); +- lwork = static_cast(real_impl(wkopt)); ++ lwork = std::max(1, real_impl(wkopt)); + Tensor work = at::empty({lwork}, self.options()); + + for (int64_t i = 0; i < batch_size; i++) { +diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +index ef64cef6a771..334f4d60ce44 100644 +--- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp ++++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +@@ -115,7 +115,7 @@ void apply_eig(const Tensor& self, bool eigenvectors, Tensor& vals_, Tensor& vec + int info; + lapackEig('N', jobvr, n, self_data, n, wr, + nullptr, 1, vecs_data, ldvr, &wkopt, -1, rwork_data, &info); +- int lwork = static_cast(real_impl(wkopt)); ++ int lwork = std::max(1, real_impl(wkopt)); + + // call again to do the actual work + Tensor work = at::empty({lwork}, self.dtype()); From 25aa9962e0e08168de1f54c3e7bb4533dec30ba2 Mon Sep 17 00:00:00 2001 From: Simon Branford Date: Wed, 31 Mar 2021 19:01:20 +0100 Subject: [PATCH 3/3] switch to PyTorch 1.8.1 --- ...{PyTorch-1.8.0-foss-2020b.eb => PyTorch-1.8.1-foss-2020b.eb} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename easybuild/easyconfigs/p/PyTorch/{PyTorch-1.8.0-foss-2020b.eb => PyTorch-1.8.1-foss-2020b.eb} (99%) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.1-foss-2020b.eb similarity index 99% rename from easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb rename to easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.1-foss-2020b.eb index dea42dbd7d84..f8827697e06e 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.0-foss-2020b.eb +++ 
b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.8.1-foss-2020b.eb @@ -1,5 +1,5 @@ name = 'PyTorch' -version = '1.8.0' +version = '1.8.1' homepage = 'https://pytorch.org/' description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.