diff --git a/deepmd/env.py b/deepmd/env.py
index 8c6937b7f7..5f5c344031 100644
--- a/deepmd/env.py
+++ b/deepmd/env.py
@@ -1,13 +1,14 @@
 """Module that sets tensorflow working environment and exports inportant constants."""
 
-import os
-from pathlib import Path
 import logging
+import os
 import platform
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Any
-import numpy as np
-from imp import reload
 from configparser import ConfigParser
+from importlib import reload
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+import numpy as np
 
 if TYPE_CHECKING:
     from types import ModuleType
@@ -37,6 +38,7 @@
 
 SHARED_LIB_MODULE = "op"
 
+
 def set_env_if_empty(key: str, value: str, verbose: bool = True):
     """Set environment variable only if it is empty.
 
@@ -74,7 +76,8 @@ def set_mkl():
     """
     if "mkl_rt" in np.__config__.get_info("blas_mkl_info").get("libraries", []):
         set_env_if_empty("KMP_BLOCKTIME", "0")
-        set_env_if_empty("KMP_AFFINITY", "granularity=fine,verbose,compact,1,0")
+        set_env_if_empty(
+            "KMP_AFFINITY", "granularity=fine,verbose,compact,1,0")
         reload(np)
 
 
@@ -118,8 +121,10 @@ def get_tf_session_config() -> Any:
         intra_op_parallelism_threads=intra, inter_op_parallelism_threads=inter
     )
 
+
 default_tf_session_config = get_tf_session_config()
 
+
 def get_module(module_name: str) -> "ModuleType":
     """Load force module.
 
@@ -149,14 +154,59 @@ def get_module(module_name: str) -> "ModuleType":
     if not module_file.is_file():
         raise FileNotFoundError(f"module {module_name} does not exist")
     else:
-        module = tf.load_op_library(str(module_file))
+        try:
+            module = tf.load_op_library(str(module_file))
+        except tf.errors.NotFoundError as e:
+            # check whether CXX11_ABI_FLAG is compatible
+            # see https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html
+            # ABI should be the same
+            if 'CXX11_ABI_FLAG' in tf.__dict__:
+                tf_cxx11_abi_flag = tf.CXX11_ABI_FLAG
+            else:
+                tf_cxx11_abi_flag = tf.sysconfig.CXX11_ABI_FLAG
+            if TF_CXX11_ABI_FLAG != tf_cxx11_abi_flag:
+                raise RuntimeError(
+                    "This deepmd-kit package was compiled with "
+                    "CXX11_ABI_FLAG=%d, but TensorFlow runtime was compiled "
+                    "with CXX11_ABI_FLAG=%d. These two library ABIs are "
+                    "incompatible and thus an error is raised when loading %s. "
+                    "You need to rebuild deepmd-kit against this TensorFlow "
+                    "runtime." % (
+                        TF_CXX11_ABI_FLAG,
+                        tf_cxx11_abi_flag,
+                        module_name,
+                    )) from e
+
+            # different versions may cause incompatibility
+            # see #406, #447, #557, #774, and #796 for example
+            # throw a message if versions are different
+            if TF_VERSION != tf.version.VERSION:
+                raise RuntimeError(
+                    "The version of TensorFlow used to compile this "
+                    "deepmd-kit package is %s, but the version of TensorFlow "
+                    "runtime you are using is %s. These two versions are "
+                    "incompatible and thus an error is raised when loading %s. "
+                    "You need to install TensorFlow %s, or rebuild deepmd-kit "
+                    "against TensorFlow %s.\nIf you are using a wheel from "
+                    "pypi, you may consider to install deepmd-kit executing "
+                    "`pip install deepmd-kit --no-binary deepmd-kit` "
+                    "instead." % (
+                        TF_VERSION,
+                        tf.version.VERSION,
+                        module_name,
+                        TF_VERSION,
+                        tf.version.VERSION,
+                    )) from e
+            raise RuntimeError(
+                "This deepmd-kit package is inconsistent with TensorFlow "
+                "Runtime, thus an error is raised when loading %s. "
+                "You need to rebuild deepmd-kit against this TensorFlow "
+                "runtime." % (
+                    module_name,
+                )) from e
         return module
 
 
-op_module = get_module("libop_abi")
-op_grads_module = get_module("libop_grads")
-
-
 def _get_package_constants(
     config_file: Path = Path(__file__).parent / "pkg_config/run_config.ini",
 ) -> Dict[str, str]:
@@ -165,7 +215,7 @@ def _get_package_constants(
     Parameters
     ----------
     config_file : str, optional
-        path to CONFIG file, by default "config/run_config.ini"
+        path to CONFIG file, by default "pkg_config/run_config.ini"
 
     Returns
     -------
@@ -176,8 +226,14 @@ def _get_package_constants(
     config.read(config_file)
     return dict(config.items("CONFIG"))
 
+
 GLOBAL_CONFIG = _get_package_constants()
 MODEL_VERSION = GLOBAL_CONFIG["model_version"]
+TF_VERSION = GLOBAL_CONFIG["tf_version"]
+TF_CXX11_ABI_FLAG = int(GLOBAL_CONFIG["tf_cxx11_abi_flag"])
+
+op_module = get_module("libop_abi")
+op_grads_module = get_module("libop_grads")
 
 if GLOBAL_CONFIG["precision"] == "-DHIGH_PREC":
     GLOBAL_TF_FLOAT_PRECISION = tf.float64
@@ -221,5 +277,3 @@ def global_cvt_2_ener_float(xx: tf.Tensor) -> tf.Tensor:
         output tensor cast to `GLOBAL_ENER_FLOAT_PRECISION`
     """
     return tf.cast(xx, GLOBAL_ENER_FLOAT_PRECISION)
-
-
diff --git a/source/cmake/Findtensorflow.cmake b/source/cmake/Findtensorflow.cmake
index 8901c698b9..91ed0809a3 100644
--- a/source/cmake/Findtensorflow.cmake
+++ b/source/cmake/Findtensorflow.cmake
@@ -137,10 +137,27 @@ else (BUILD_CPP_IF)
   endif ()
 endif (BUILD_CPP_IF)
 
+# detect TensorFlow version
+try_run(
+  TENSORFLOW_VERSION_RUN_RESULT_VAR TENSORFLOW_VERSION_COMPILE_RESULT_VAR
+  ${CMAKE_CURRENT_BINARY_DIR}/tf_version
+  "${CMAKE_CURRENT_LIST_DIR}/tf_version.cpp"
+  LINK_LIBRARIES ${TensorFlowFramework_LIBRARY}
+  CMAKE_FLAGS "-DINCLUDE_DIRECTORIES:STRING=${TensorFlow_INCLUDE_DIRS}"
+  RUN_OUTPUT_VARIABLE TENSORFLOW_VERSION
+  COMPILE_OUTPUT_VARIABLE TENSORFLOW_VERSION_COMPILE_OUTPUT_VAR
+)
+if (NOT ${TENSORFLOW_VERSION_COMPILE_RESULT_VAR})
+  message(FATAL_ERROR "Failed to compile: \n ${TENSORFLOW_VERSION_COMPILE_OUTPUT_VAR}" )
+endif()
+if (NOT ${TENSORFLOW_VERSION_RUN_RESULT_VAR} EQUAL "0")
+  message(FATAL_ERROR "Failed to run, return code: ${TENSORFLOW_VERSION_RUN_RESULT_VAR}" )
+endif()
+
 # print message
 if (NOT TensorFlow_FIND_QUIETLY)
   message(STATUS "Found TensorFlow: ${TensorFlow_INCLUDE_DIRS}, ${TensorFlow_LIBRARY}, ${TensorFlowFramework_LIBRARY} "
-    " in ${TensorFlow_search_PATHS}")
+    " in ${TensorFlow_search_PATHS} (found version \"${TENSORFLOW_VERSION}\")")
 endif ()
 
 unset(TensorFlow_search_PATHS)
diff --git a/source/cmake/tf_version.cpp b/source/cmake/tf_version.cpp
new file mode 100644
index 0000000000..9d129aefb8
--- /dev/null
+++ b/source/cmake/tf_version.cpp
@@ -0,0 +1,10 @@
+#include <iostream>
+#include "tensorflow/core/public/version.h"
+
+int main(int argc, char * argv[])
+{
+  // See https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h
+  // TF_VERSION_STRING has been available since TensorFlow v0.6
+  std::cout << TF_VERSION_STRING;
+  return 0;
+}
diff --git a/source/config/run_config.ini b/source/config/run_config.ini
index 3f2e8cc86a..bb04319e47 100644
--- a/source/config/run_config.ini
+++ b/source/config/run_config.ini
@@ -6,5 +6,7 @@ GIT_DATE = @GIT_DATE@
 GIT_BRANCH = @GIT_BRANCH@
 TF_INCLUDE_DIR = @TensorFlow_INCLUDE_DIRS@
 TF_LIBS = @TensorFlow_LIBRARY@
+TF_VERSION = @TENSORFLOW_VERSION@
+TF_CXX11_ABI_FLAG = @OP_CXX_ABI@
 PRECISION = @PREC_DEF@
 MODEL_VERSION=@MODEL_VERSION@