Fix MKL-related test failures

easybuilders · May 11, 2023 · 3317a0d · 3317a0d
1 parent af2b983
commit 3317a0d
Show file tree

Hide file tree

Showing 3 changed files with 203 additions and 0 deletions.
diff --git a/easybuild/easyconfigs/t/TensorFlow/TensorFlow-2.9.1-foss-2022a-CUDA-11.7.0.eb b/easybuild/easyconfigs/t/TensorFlow/TensorFlow-2.9.1-foss-2022a-CUDA-11.7.0.eb
@@ -176,6 +176,7 @@ exts_list = [
             'TensorFlow-2.8.4_exclude-xnnpack-on-ppc.patch',
             'TensorFlow-2.8.4_fix-PPC-JIT.patch',
             'TensorFlow-2.8.4_resolve-gcc-symlinks.patch',
+            'TensorFlow-2.9.1_fix-check-for-MKL.patch',
             'TensorFlow-2.9.1_fix-PPC-Eigen-build.patch',
             'TensorFlow-2.9.1_remove-duplicate-gpu-tests.patch',
             'TensorFlow-2.9.1_remove-libclang-and-io-gcs-deps.patch',
@@ -204,6 +205,8 @@ exts_list = [
              '27d28293105b4dd0a25f58346c68b672f57215756f14a97c442d0e3317e93a2b'},
             {'TensorFlow-2.8.4_resolve-gcc-symlinks.patch':
              '43ce9acc6bffff68a31d2263d0064d272999b2e0a2c6546690287cd1c9c90f04'},
+            {'TensorFlow-2.9.1_fix-check-for-MKL.patch':
+             '3b9d20b43391def093a30dbc45b7502a48916efedf7314700f78cc7b2cc1b645'},
             {'TensorFlow-2.9.1_fix-PPC-Eigen-build.patch':
              '5f559a6eade65df665c7c69bc2e5d5d4214b85ea836e966f5dba73211307b972'},
             {'TensorFlow-2.9.1_remove-duplicate-gpu-tests.patch':

diff --git a/easybuild/easyconfigs/t/TensorFlow/TensorFlow-2.9.1-foss-2022a.eb b/easybuild/easyconfigs/t/TensorFlow/TensorFlow-2.9.1-foss-2022a.eb
@@ -172,6 +172,7 @@ exts_list = [
             'TensorFlow-2.8.4_exclude-xnnpack-on-ppc.patch',
             'TensorFlow-2.8.4_fix-PPC-JIT.patch',
             'TensorFlow-2.8.4_resolve-gcc-symlinks.patch',
+            'TensorFlow-2.9.1_fix-check-for-MKL.patch',
             'TensorFlow-2.9.1_fix-PPC-Eigen-build.patch',
             'TensorFlow-2.9.1_remove-duplicate-gpu-tests.patch',
             'TensorFlow-2.9.1_remove-libclang-and-io-gcs-deps.patch',
@@ -200,6 +201,8 @@ exts_list = [
              '27d28293105b4dd0a25f58346c68b672f57215756f14a97c442d0e3317e93a2b'},
             {'TensorFlow-2.8.4_resolve-gcc-symlinks.patch':
              '43ce9acc6bffff68a31d2263d0064d272999b2e0a2c6546690287cd1c9c90f04'},
+            {'TensorFlow-2.9.1_fix-check-for-MKL.patch':
+             '3b9d20b43391def093a30dbc45b7502a48916efedf7314700f78cc7b2cc1b645'},
             {'TensorFlow-2.9.1_fix-PPC-Eigen-build.patch':
              '5f559a6eade65df665c7c69bc2e5d5d4214b85ea836e966f5dba73211307b972'},
             {'TensorFlow-2.9.1_remove-duplicate-gpu-tests.patch':

diff --git a/easybuild/easyconfigs/t/TensorFlow/TensorFlow-2.9.1_fix-check-for-MKL.patch b/easybuild/easyconfigs/t/TensorFlow/TensorFlow-2.9.1_fix-check-for-MKL.patch
@@ -0,0 +1,197 @@
+Tests are incorrectly not skipped causing failures related to MKL.
+See https://github.com/tensorflow/tensorflow/issues/59252
+
+Use a patch from TF 2.11: https://github.com/tensorflow/tensorflow/commit/5ec3d2e626589540bcfbeb7dac40255034e587df
+
+Author: Alexander Grund (TU Dresden)
+
+diff --git a/tensorflow/core/util/BUILD b/tensorflow/core/util/BUILD
+index 8881f6fd5e9..25882152abf 100644
+--- a/tensorflow/core/util/BUILD
++++ b/tensorflow/core/util/BUILD
+@@ -491,6 +491,11 @@ cc_library(
+         "//tensorflow/python:__pkg__",
+         "//tensorflow/python/util:__pkg__",
+     ],
++    deps = [
++        "//tensorflow/core/platform:platform_port",
++        "//tensorflow/core/util:env_var",
++        "@com_google_absl//absl/base",
++    ],
+     alwayslink = 1,
+ )
+
+diff --git a/tensorflow/core/util/port.cc b/tensorflow/core/util/port.cc
+index 358b39bfb00..0aa3cfa708e 100644
+--- a/tensorflow/core/util/port.cc
++++ b/tensorflow/core/util/port.cc
+@@ -15,6 +15,9 @@ limitations under the License.
+
+ #include "tensorflow/core/util/port.h"
+
++#include "absl/base/call_once.h"
++#include "tensorflow/core/platform/cpu_info.h"
++#include "tensorflow/core/util/env_var.h"
+
+ namespace tensorflow {
+
+@@ -60,10 +63,57 @@ bool GpuSupportsHalfMatMulAndConv() {
+ }
+
+ bool IsMklEnabled() {
+-#if defined(INTEL_MKL) && defined(ENABLE_MKL)
+-  return true;
+-#else
++#ifndef INTEL_MKL
+   return false;
+-#endif  // INTEL_MKL && ENABLE_MKL
++#endif
++  static absl::once_flag once;  // NOLINT(clang-diagnostic-unreachable-code)
++#ifdef ENABLE_MKL
++  // Keeping TF_DISABLE_MKL env variable for legacy reasons.
++  static bool oneDNN_disabled = false;
++  absl::call_once(once, [&] {
++    TF_CHECK_OK(ReadBoolFromEnvVar("TF_DISABLE_MKL", false, &oneDNN_disabled));
++    if (oneDNN_disabled) VLOG(2) << "TF-MKL: Disabling oneDNN";
++  });
++  return (!oneDNN_disabled);
++#else
++  // Linux: Turn oneDNN on by default for CPUs with neural network features.
++  // Windows: oneDNN is off by default.
++  // No need to guard for other platforms here because INTEL_MKL is only defined
++  // for non-mobile Linux or Windows.
++  static bool oneDNN_enabled =
++#ifdef __linux__
++      port::TestCPUFeature(port::CPUFeature::AVX512_VNNI) ||
++      port::TestCPUFeature(port::CPUFeature::AVX512_BF16) ||
++      port::TestCPUFeature(port::CPUFeature::AVX_VNNI) ||
++      port::TestCPUFeature(port::CPUFeature::AMX_TILE) ||
++      port::TestCPUFeature(port::CPUFeature::AMX_INT8) ||
++      port::TestCPUFeature(port::CPUFeature::AMX_BF16);
++#else
++      false;
++#endif  // __linux__
++  absl::call_once(once, [&] {
++    auto status = ReadBoolFromEnvVar("TF_ENABLE_ONEDNN_OPTS", oneDNN_enabled,
++                                     &oneDNN_enabled);
++    if (!status.ok()) {
++      LOG(WARNING) << "TF_ENABLE_ONEDNN_OPTS is not set to either '0', 'false',"
++                   << " '1', or 'true'. Using the default setting: "
++                   << oneDNN_enabled;
++    }
++    if (oneDNN_enabled) {
++#ifndef DNNL_AARCH64_USE_ACL
++      LOG(INFO) << "oneDNN custom operations are on. "
++                << "You may see slightly different numerical results due to "
++                << "floating-point round-off errors from different computation "
++                << "orders. To turn them off, set the environment variable "
++                << "`TF_ENABLE_ONEDNN_OPTS=0`.";
++#else
++      LOG(INFO) << "Experimental oneDNN custom operations are on. "
++                << "If you experience issues, please turn them off by setting "
++                << "the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.";
++#endif  // !DNNL_AARCH64_USE_ACL
++    }
++  });
++  return oneDNN_enabled;
++#endif  // ENABLE_MKL
+ }
+ }  // end namespace tensorflow
+diff --git a/tensorflow/core/util/util.cc b/tensorflow/core/util/util.cc
+index eef2618de91..1c12f552d7d 100644
+--- a/tensorflow/core/util/util.cc
++++ b/tensorflow/core/util/util.cc
+@@ -15,16 +15,10 @@ limitations under the License.
+
+ #include "tensorflow/core/util/util.h"
+
+-#include <string>
+-#include <vector>
+-
+-#include "absl/base/call_once.h"
+-#include "tensorflow/core/framework/device_factory.h"
+ #include "tensorflow/core/lib/gtl/inlined_vector.h"
+ #include "tensorflow/core/lib/strings/strcat.h"
+-#include "tensorflow/core/platform/cpu_info.h"
+ #include "tensorflow/core/platform/logging.h"
+-#include "tensorflow/core/util/env_var.h"
++#include "tensorflow/core/util/port.h"
+
+ namespace tensorflow {
+
+@@ -127,59 +121,7 @@ string SliceDebugString(const TensorShape& shape, const int64_t flat) {
+   return result;
+ }
+
+-bool IsMKLEnabled() {
+-#ifndef INTEL_MKL
+-  return false;
+-#endif  // !INTEL_MKL
+-  static absl::once_flag once;
+-#ifdef ENABLE_MKL
+-  // Keeping TF_DISABLE_MKL env variable for legacy reasons.
+-  static bool oneDNN_disabled = false;
+-  absl::call_once(once, [&] {
+-    TF_CHECK_OK(ReadBoolFromEnvVar("TF_DISABLE_MKL", false, &oneDNN_disabled));
+-    if (oneDNN_disabled) VLOG(2) << "TF-MKL: Disabling oneDNN";
+-  });
+-  return (!oneDNN_disabled);
+-#else
+-  // Linux: Turn oneDNN on by default for CPUs with neural network features.
+-  // Windows: oneDNN is off by default.
+-  // No need to guard for other platforms here because INTEL_MKL is only defined
+-  // for non-mobile Linux or Windows.
+-  static bool oneDNN_enabled =
+-#ifdef __linux__
+-      port::TestCPUFeature(port::CPUFeature::AVX512_VNNI) ||
+-      port::TestCPUFeature(port::CPUFeature::AVX512_BF16) ||
+-      port::TestCPUFeature(port::CPUFeature::AVX_VNNI) ||
+-      port::TestCPUFeature(port::CPUFeature::AMX_TILE) ||
+-      port::TestCPUFeature(port::CPUFeature::AMX_INT8) ||
+-      port::TestCPUFeature(port::CPUFeature::AMX_BF16);
+-#else
+-      false;
+-#endif  // __linux__
+-  absl::call_once(once, [&] {
+-    auto status = ReadBoolFromEnvVar("TF_ENABLE_ONEDNN_OPTS", oneDNN_enabled,
+-                                     &oneDNN_enabled);
+-    if (!status.ok()) {
+-      LOG(WARNING) << "TF_ENABLE_ONEDNN_OPTS is not set to either '0', 'false',"
+-                   << " '1', or 'true'. Using the default setting: "
+-                   << oneDNN_enabled;
+-    }
+-    if (oneDNN_enabled) {
+-#ifndef DNNL_AARCH64_USE_ACL
+-      LOG(INFO) << "oneDNN custom operations are on. "
+-                << "You may see slightly different numerical results due to "
+-                << "floating-point round-off errors from different computation "
+-                << "orders. To turn them off, set the environment variable "
+-                << "`TF_ENABLE_ONEDNN_OPTS=0`.";
+-#else
+-      LOG(INFO) << "Experimental oneDNN custom operations are on. "
+-                << "If you experience issues, please turn them off by setting "
+-                << "the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.";
+-#endif  // !DNNL_AARCH64_USE_ACL
+-    }
+-  });
+-  return oneDNN_enabled;
+-#endif  // ENABLE_MKL
+-}
++// TODO(penporn): Remove this function from util.cc
++bool IsMKLEnabled() { return IsMklEnabled(); }
+
+ }  // namespace tensorflow
+diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
+index 02149d89b2f..e859ec47a1e 100644
+--- a/tensorflow/python/framework/test_util.py
++++ b/tensorflow/python/framework/test_util.py
+@@ -359,8 +359,7 @@ def GpuSupportsHalfMatMulAndConv():
+
+
+ def IsMklEnabled():
+-  return (_pywrap_util_port.IsMklEnabled() or
+-          os.getenv("TF_ENABLE_ONEDNN_OPTS", "False").lower() in ["true", "1"])
++  return _pywrap_util_port.IsMklEnabled()
+
+
+ def InstallStackTraceHandler():