From bc456e361d49d1d89a74b80060c70efb51fd7d87 Mon Sep 17 00:00:00 2001 From: Martin Wicke Date: Thu, 23 Mar 2017 12:31:16 -0800 Subject: [PATCH] Merge changes from github. Change: 151046259 --- .gitignore | 3 +- RELEASE.md | 9 + WORKSPACE | 7 +- configure | 65 +-- tensorflow/BUILD | 30 + .../xla/service/allocation_tracker.cc | 3 +- .../xla/service/generic_transfer_manager.cc | 4 +- tensorflow/compiler/xla/service/service.cc | 16 +- tensorflow/contrib/android/BUILD | 6 +- tensorflow/contrib/cmake/CMakeLists.txt | 4 +- tensorflow/contrib/cmake/README.md | 2 +- tensorflow/contrib/cmake/tf_cc_ops.cmake | 40 ++ .../contrib/cmake/tf_core_framework.cmake | 7 +- .../contrib/cmake/tf_core_kernels.cmake | 6 + tensorflow/contrib/cmake/tf_python.cmake | 107 +++- tensorflow/contrib/cmake/tf_tests.cmake | 7 + tensorflow/contrib/cmake/tf_tools.cmake | 19 + .../contrib/cmake/tools/create_def_file.py | 134 +++++ .../python/framework/checkpoint_utils.py | 7 +- .../python/framework/checkpoint_utils_test.py | 23 + .../contrib/framework/python/ops/arg_scope.py | 14 +- tensorflow/contrib/layers/__init__.py | 1 + .../contrib/layers/python/layers/layers.py | 10 +- tensorflow/contrib/learn/__init__.py | 1 + .../contrib/learn/python/learn/README.md | 17 +- .../python/learn/estimators/estimator.py | 2 +- .../learn/python/learn/estimators/linear.py | 23 +- .../dataframe/tensorflow_dataframe_test.py | 103 ++-- .../contrib/metrics/python/ops/metric_ops.py | 2 +- .../rnn/python/ops/core_rnn_cell_impl.py | 15 +- tensorflow/contrib/rnn/python/ops/lstm_ops.py | 2 +- .../contrib/seq2seq/python/ops/helper.py | 2 +- tensorflow/contrib/seq2seq/python/ops/loss.py | 3 +- tensorflow/contrib/slim/README.md | 6 +- tensorflow/contrib/util/loader.py | 27 +- tensorflow/core/BUILD | 101 ++-- .../core/common_runtime/mkl_cpu_allocator.h | 120 ++++ .../core/common_runtime/threadpool_device.cc | 9 + tensorflow/core/framework/allocator.cc | 5 +- .../core/framework/allocator_registry.cc | 66 +++ 
.../core/framework/allocator_registry.h | 77 +++ tensorflow/core/framework/type_index.h | 4 +- tensorflow/core/framework/types.h | 6 +- tensorflow/core/graph/mkl_layout_pass.cc | 548 ++++++++++++++++++ tensorflow/core/graph/mkl_layout_pass.h | 36 ++ tensorflow/core/graph/mkl_layout_pass_test.cc | 199 +++++++ tensorflow/core/graph/mkl_optimizer_merge.cc | 124 ++-- tensorflow/core/graph/mkl_optimizer_merge.h | 6 - .../core/graph/mkl_optimizer_merge_test.cc | 135 ++++- .../core/graph/mkl_tfconversion_pass.cc | 271 +++++++++ tensorflow/core/graph/mkl_tfconversion_pass.h | 36 ++ .../core/graph/mkl_tfconversion_pass_test.cc | 243 ++++++++ tensorflow/core/kernels/BUILD | 63 +- tensorflow/core/kernels/adjust_hue_op.cc | 43 +- tensorflow/core/kernels/adjust_hue_op.h | 42 ++ .../core/kernels/adjust_hue_op_gpu.cu.cc | 141 +++++ tensorflow/core/kernels/eigen_pooling.h | 1 + .../kernels/fixed_length_record_reader_op.cc | 4 +- tensorflow/core/kernels/mkl_conv_ops.cc | 457 +++++++++++++++ tensorflow/core/kernels/mkl_tfconv_op.cc | 135 +++++ tensorflow/core/kernels/mkl_transpose_op.cc | 67 +++ tensorflow/core/kernels/pooling_ops_common.cc | 2 + .../kernels/resize_nearest_neighbor_op.cc | 4 +- ...rk_test.cc => resize_op_benchmark_test.cc} | 28 +- tensorflow/core/kernels/transpose_op.cc | 15 + tensorflow/core/kernels/transpose_op.h | 11 + tensorflow/core/ops/nn_ops.cc | 41 ++ tensorflow/core/ops/ops.pbtxt | 53 ++ .../core/platform/default/build_config.bzl | 74 +-- .../platform/default/build_config_root.bzl | 18 +- .../platform/hadoop/hadoop_file_system.cc | 15 + tensorflow/core/platform/macros.h | 11 + tensorflow/core/platform/windows/cpu_info.h | 3 + .../core/platform/windows/intrinsics_port.h | 4 + .../platform/windows/windows_file_system.cc | 8 +- tensorflow/core/util/mkl_util.h | 296 ++++++++++ tensorflow/docs_src/extend/adding_an_op.md | 16 +- .../docs_src/get_started/get_started.md | 2 +- .../docs_src/get_started/mnist/mechanics.md | 8 +- 
tensorflow/docs_src/programmers_guide/faq.md | 4 +- .../docs_src/programmers_guide/variables.md | 5 + tensorflow/docs_src/tutorials/linear.md | 2 +- tensorflow/docs_src/tutorials/using_gpu.md | 10 +- tensorflow/docs_src/tutorials/wide.md | 6 +- .../docs_src/tutorials/wide_and_deep.md | 2 +- .../org/tensorflow/demo/StylizeActivity.java | 2 +- tensorflow/examples/learn/README.md | 2 +- tensorflow/examples/learn/boston.py | 9 +- tensorflow/examples/learn/iris.py | 4 +- .../examples/learn/text_classification.py | 3 +- .../tutorials/deepdream/deepdream.ipynb | 2 +- .../tutorials/word2vec/word2vec_basic.py | 6 +- tensorflow/go/genop/generate.sh | 11 +- tensorflow/java/README.md | 6 +- .../java/org/tensorflow/SavedModelBundle.java | 3 +- .../src/main/java/org/tensorflow/Tensor.java | 3 +- .../java/org/tensorflow/package-info.java | 4 +- tensorflow/python/client/session.py | 6 +- tensorflow/python/debug/BUILD | 6 +- .../python/kernel_tests/tensordot_op_test.py | 16 +- tensorflow/python/layers/pooling.py | 4 +- tensorflow/python/ops/control_flow_ops.py | 4 +- tensorflow/python/ops/image_ops_impl.py | 138 ++++- tensorflow/python/ops/image_ops_test.py | 26 +- tensorflow/python/ops/math_ops.py | 36 +- tensorflow/python/ops/metrics_impl.py | 2 +- tensorflow/python/ops/nn_ops.py | 32 +- tensorflow/python/ops/rnn_cell_impl.py | 26 +- tensorflow/python/platform/tf_logging.py | 1 + tensorflow/tensorboard/README.md | 2 +- tensorflow/tensorboard/defs.bzl | 2 +- tensorflow/tensorflow.bzl | 3 +- tensorflow/tools/benchmark/BUILD | 1 + tensorflow/tools/benchmark/benchmark_model.cc | 5 + tensorflow/tools/ci_build/Dockerfile.android | 7 +- tensorflow/tools/ci_build/Dockerfile.cmake | 3 +- tensorflow/tools/ci_build/Dockerfile.cpu | 7 +- .../ci_build/Dockerfile.debian.jessie.cpu | 5 +- tensorflow/tools/ci_build/Dockerfile.gpu | 12 +- tensorflow/tools/ci_build/Dockerfile.hadoop | 7 +- .../tools/ci_build/Dockerfile.tensorboard | 2 +- tensorflow/tools/ci_build/README.md | 12 +- 
.../tools/ci_build/builds/run_pip_tests.sh | 24 +- .../tools/ci_build/ci_parameterized_build.sh | 33 +- tensorflow/tools/ci_build/ci_sanity.sh | 2 +- .../ci_build/install/install_deb_packages.sh | 6 +- .../ci_build/install/install_pip_packages.sh | 38 +- .../install/install_python3.5_pip_packages.sh | 91 +++ .../ci_build/windows/bazel/bazel_test_lib.sh | 3 - tensorflow/tools/compatibility/README.md | 3 + tensorflow/tools/compatibility/tf_upgrade.py | 25 +- tensorflow/tools/docker/Dockerfile.devel | 5 +- tensorflow/tools/docker/Dockerfile.devel-gpu | 5 +- tensorflow/tools/git/gen_git_source.py | 6 +- .../graph_transforms/summarize_graph_main.cc | 5 +- tensorflow/tools/pip_package/BUILD | 18 +- tensorflow/tools/pip_package/MANIFEST.in | 2 + tensorflow/workspace.bzl | 4 + third_party/gpus/cuda_configure.bzl | 11 +- third_party/sycl/crosstool/computecpp.tpl | 2 +- util/python/python_config.sh | 2 +- 141 files changed, 4407 insertions(+), 602 deletions(-) mode change 100644 => 100755 tensorflow/contrib/cmake/tf_python.cmake create mode 100644 tensorflow/contrib/cmake/tools/create_def_file.py create mode 100644 tensorflow/core/common_runtime/mkl_cpu_allocator.h create mode 100644 tensorflow/core/framework/allocator_registry.cc create mode 100644 tensorflow/core/framework/allocator_registry.h create mode 100644 tensorflow/core/graph/mkl_layout_pass.cc create mode 100644 tensorflow/core/graph/mkl_layout_pass.h create mode 100644 tensorflow/core/graph/mkl_layout_pass_test.cc create mode 100644 tensorflow/core/graph/mkl_tfconversion_pass.cc create mode 100644 tensorflow/core/graph/mkl_tfconversion_pass.h create mode 100644 tensorflow/core/graph/mkl_tfconversion_pass_test.cc create mode 100644 tensorflow/core/kernels/adjust_hue_op.h create mode 100644 tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc create mode 100644 tensorflow/core/kernels/mkl_conv_ops.cc create mode 100644 tensorflow/core/kernels/mkl_tfconv_op.cc create mode 100644 
tensorflow/core/kernels/mkl_transpose_op.cc rename tensorflow/core/kernels/{resize_nearest_neighbor_op_benchmark_test.cc => resize_op_benchmark_test.cc} (64%) create mode 100644 tensorflow/core/util/mkl_util.h create mode 100755 tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh diff --git a/.gitignore b/.gitignore index 07dd1513806c13..01f06be1a909f3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS_Store .ipynb_checkpoints node_modules +/.bazelrc /bazel-* /third_party/py/numpy/numpy_include /tools/bazel.rc @@ -13,4 +14,4 @@ node_modules *.pyc __pycache__ *.swp -.vscode/ \ No newline at end of file +.vscode/ diff --git a/RELEASE.md b/RELEASE.md index b223f517303ec6..5f261a4543db80 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,10 @@ +# Release 1.0.1 + +## Bug Fixes and Other Changes +* Change GraphConstructor to not increase the version when importing, but instead take the min of all versions. +* Google Cloud Storage fixes. +* Removed `tf.core` and `tf.python` modules from the API. These were never intended to be exposed. Please use the same objects through top-level `tf` module instead. + # Release 1.0.0 ## Major Features and Improvements @@ -88,6 +95,8 @@ To help you upgrade your existing TensorFlow Python code to match the API change from the tensorflow::ops namespace to tensorflow. * Change arg order for `{softmax,sparse_softmax,sigmoid}_cross_entropy_with_logits` to be (labels, predictions), and force use of named args. * tf.nn.rnn_cell.* and most functions in tf.nn.rnn.* (with the exception of dynamic_rnn and raw_rnn) are temporarily in tf.contrib.rnn. They will be moved back into core for TF 1.1. +* `tf.nn.sampled_softmax_loss` and `tf.nn.nce_loss` have both changed their API such that you need to switch the `inputs, labels` to `labels, inputs` parameters. +* The shape keyword argument of the `SparseTensor` constructor changes its name to `dense_shape` between Tensorflow 0.12 and Tensorflow 1.0. 
## Bug Fixes and Other Changes * Numerous C++ API updates. diff --git a/WORKSPACE b/WORKSPACE index 72fa0d89494236..6ec1a7df3ec5a5 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -14,12 +14,7 @@ load("@io_bazel_rules_closure//closure:defs.bzl", "closure_repositories") closure_repositories() -load("//tensorflow:workspace.bzl", "check_version", "tf_workspace") - -# We must check the bazel version before trying to parse any other BUILD files, -# in case the parsing of those build files depends on the bazel version we -# require here. -check_version("0.4.2") +load("//tensorflow:workspace.bzl", "tf_workspace") # Uncomment and update the paths in these entries to build the Android demo. #android_sdk_repository( diff --git a/configure b/configure index 05daa23d706f66..081db20d7535d3 100755 --- a/configure +++ b/configure @@ -8,6 +8,9 @@ pushd `dirname $0` > /dev/null SOURCE_BASE_DIR=`pwd -P` popd > /dev/null +# This file contains customized config settings. +touch .bazelrc + PLATFORM="$(uname -s | tr 'A-Z' 'a-z')" function is_linux() { @@ -36,15 +39,11 @@ function is_windows() { } function bazel_clean_and_fetch() { - # bazel clean --expunge currently doesn't work on Windows - # TODO(pcloudy): Re-enable it after bazel clean --expunge is fixed. - if ! is_windows; then - bazel clean --expunge - fi if [ -z "$TF_BAZEL_TARGETS" ]; then - TF_BAZEL_TARGETS="//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..." + bazel fetch "//tensorflow/... -//tensorflow/contrib/nccl/... -//tensorflow/examples/android/..." 
+ else + bazel fetch $TF_BAZEL_TARGETS fi - bazel fetch "$TF_BAZEL_TARGETS" } function sed_hyphen_i() { @@ -102,8 +101,8 @@ if false; then # Disable building with MKL for now if [ "$TF_NEED_MKL" == "1" ]; then # TF_NEED_MKL DST=`dirname $0` - ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170110.tgz - GITHUB_RELEASE_TAG=v0.3 + ARCHIVE_BASENAME=mklml_lnx_2017.0.2.20170209.tgz + GITHUB_RELEASE_TAG=v0.5 MKLURL="https://github.com/01org/mkl-dnn/releases/download/$GITHUB_RELEASE_TAG/$ARCHIVE_BASENAME" if ! [ -e "$DST/third_party/mkl/$ARCHIVE_BASENAME" ]; then wget --no-check-certificate -P $DST/third_party/mkl/ $MKLURL @@ -182,13 +181,12 @@ else TF_NEED_JEMALLOC=0 fi -if [ "$TF_NEED_JEMALLOC" == "1" ]; then - sed_hyphen_i -e "s/WITH_JEMALLOC = False/WITH_JEMALLOC = True/" tensorflow/core/platform/default/build_config.bzl -else - sed_hyphen_i -e "s/WITH_JEMALLOC = True/WITH_JEMALLOC = False/" tensorflow/core/platform/default/build_config.bzl +sed_hyphen_i -e "/with_jemalloc/d" .bazelrc +if [[ "$TF_NEED_JEMALLOC" == "1" ]]; then + echo 'build --define with_jemalloc=true' >>.bazelrc fi -while [ "$TF_NEED_GCP" == "" ]; do +while [[ "$TF_NEED_GCP" == "" ]]; do read -p "Do you wish to build TensorFlow with "\ "Google Cloud Platform support? [y/N] " INPUT case $INPUT in @@ -202,23 +200,12 @@ while [ "$TF_NEED_GCP" == "" ]; do esac done -if [ "$TF_NEED_GCP" == "1" ]; then - ## Verify that libcurl header files are available. - # Only check Linux, since on MacOS the header files are installed with XCode. - if is_linux && [[ ! -f "/usr/include/curl/curl.h" ]]; then - echo "ERROR: It appears that the development version of libcurl is not "\ -"available. Please install the libcurl3-dev package." - exit 1 - fi - - # Update Bazel build configuration. - sed_hyphen_i -e "s/WITH_GCP_SUPPORT = False/WITH_GCP_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl -else - # Update Bazel build configuration. 
- sed_hyphen_i -e "s/WITH_GCP_SUPPORT = True/WITH_GCP_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl +sed_hyphen_i -e "/with_gcp_support/d" .bazelrc +if [[ "$TF_NEED_GCP" == "1" ]]; then + echo 'build --define with_gcp_support=true' >>.bazelrc fi -while [ "$TF_NEED_HDFS" == "" ]; do +while [[ "$TF_NEED_HDFS" == "" ]]; do read -p "Do you wish to build TensorFlow with "\ "Hadoop File System support? [y/N] " INPUT case $INPUT in @@ -232,16 +219,13 @@ while [ "$TF_NEED_HDFS" == "" ]; do esac done -if [ "$TF_NEED_HDFS" == "1" ]; then - # Update Bazel build configuration. - sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = False/WITH_HDFS_SUPPORT = True/" tensorflow/core/platform/default/build_config.bzl -else - # Update Bazel build configuration. - sed_hyphen_i -e "s/WITH_HDFS_SUPPORT = True/WITH_HDFS_SUPPORT = False/" tensorflow/core/platform/default/build_config.bzl +sed_hyphen_i -e "/with_hdfs_support/d" .bazelrc +if [[ "$TF_NEED_HDFS" == "1" ]]; then + echo 'build --define with_hdfs_support=true' >>.bazelrc fi ## Enable XLA. -while [ "$TF_ENABLE_XLA" == "" ]; do +while [[ "$TF_ENABLE_XLA" == "" ]]; do read -p "Do you wish to build TensorFlow with the XLA just-in-time compiler (experimental)? [y/N] " INPUT case $INPUT in [Yy]* ) echo "XLA JIT support will be enabled for TensorFlow"; TF_ENABLE_XLA=1;; @@ -251,12 +235,9 @@ while [ "$TF_ENABLE_XLA" == "" ]; do esac done -if [ "$TF_ENABLE_XLA" == "1" ]; then - # Update Bazel build configuration. - sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = True/" tensorflow/core/platform/default/build_config_root.bzl -else - # Update Bazel build configuration. 
- sed_hyphen_i -e "s/^WITH_XLA_SUPPORT = [FT].*/WITH_XLA_SUPPORT = False/" tensorflow/core/platform/default/build_config_root.bzl +sed_hyphen_i -e "/with_xla_support/d" .bazelrc +if [[ "$TF_ENABLE_XLA" == "1" ]]; then + echo 'build --define with_xla_support=true' >>.bazelrc fi diff --git a/tensorflow/BUILD b/tensorflow/BUILD index a2e74f40c3c336..1956cb0763a816 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -110,6 +110,34 @@ config_setting( visibility = ["//visibility:public"], ) +# TODO(jhseu): Enable on other platforms other than Linux. +config_setting( + name = "with_jemalloc", + values = { + "cpu": "k8", + "define": "with_jemalloc=true", + }, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_gcp_support", + values = {"define": "with_gcp_support=true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_hdfs_support", + values = {"define": "with_hdfs_support=true"}, + visibility = ["//visibility:public"], +) + +config_setting( + name = "with_xla_support", + values = {"define": "with_xla_support=true"}, + visibility = ["//visibility:public"], +) + package_group( name = "internal", packages = ["//tensorflow/..."], @@ -321,6 +349,8 @@ cc_binary( deps = [ "//tensorflow/c:c_api", "//tensorflow/cc:cc_ops", + "//tensorflow/cc:client_session", + "//tensorflow/cc:scope", "//tensorflow/core:tensorflow", ], ) diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 998ca7d21f12f4..8f169cd036814e 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -138,7 +138,8 @@ tensorflow::Status AllocationTracker::DeallocateShape( TF_RET_CHECK(ShapeUtil::TupleElementCount(shape) == elements.size()) << "tuple has unexpected number of elements: " << elements.size() << " != " << ShapeUtil::TupleElementCount(shape); - for (int i = 0; i < elements.size(); ++i) { + for 
(std::vector::size_type i = 0; + i < elements.size(); ++i) { VLOG(2) << "recursing onto the tuple elements"; TF_RETURN_IF_ERROR(DeallocateShape(backend, device_ordinal, &elements[i], shape.tuple_shapes(i), diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index aa512f242a3a42..715d3f33bc04db 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -118,10 +118,10 @@ GenericTransferManager::ShallowCopyTupleFromDevice( // Create a DeviceMemoryBase from each void* pointer. std::vector destination; - for (int i = 0; i < element_pointers.size(); ++i) { + for (std::vector::size_type i = 0; i < element_pointers.size(); ++i) { if (element_pointers[i] == nullptr && !ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) { - return FailedPrecondition("tuple contains nullptr at element %d", i); + return FailedPrecondition("tuple contains nullptr at element %lu", i); } int64 buffer_size = ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), /*pointer_size=*/sizeof(void*)); diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index d88315e74784f8..60593afb8c431f 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -256,7 +256,8 @@ StatusOr> Service::ResolveAndValidateArguments( tensorflow::gtl::ArraySlice arguments, const Backend* backend, int device_ordinal) { std::vector allocations; - for (int i = 0; i < arguments.size(); ++i) { + for (tensorflow::gtl::ArraySlice::size_type i = 0; + i < arguments.size(); ++i) { auto allocation_status = allocation_tracker_.Resolve(*arguments[i]); if (!allocation_status.ok()) { return Status(allocation_status.status().code(), @@ -269,7 +270,7 @@ StatusOr> Service::ResolveAndValidateArguments( if (allocation->backend() != backend || allocation->device_ordinal() != device_ordinal) { return 
InvalidArgument( - "argument %d is on device %s but computation will be executed " + "argument %lu is on device %s but computation will be executed " "on device %s", i, allocation->backend() @@ -295,13 +296,14 @@ StatusOr> Service::CreateModuleConfig( program_shape.parameters_size(), arguments.size()); } - for (int i = 0; i < arguments.size(); ++i) { + for (tensorflow::gtl::ArraySlice::size_type i = 0; + i < arguments.size(); ++i) { // Verify that shape of arguments matches the shape of the arguments in the // ProgramShape. if (!ShapeUtil::Compatible(arguments[i]->shape(), program_shape.parameters(i))) { return InvalidArgument( - "computation expects parameter %d to have shape %s, given shape %s", + "computation expects parameter %lu to have shape %s, given shape %s", i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), ShapeUtil::HumanString(arguments[i]->shape()).c_str()); } @@ -383,7 +385,8 @@ StatusOr>> Service::BuildExecutables( hlo_dumper, std::move(executors))); if (!other_directory_path.empty()) { - for (int64 i = 0; i < versioned_handles.size(); ++i) { + for (std::vector::size_type i = 0; + i < versioned_handles.size(); ++i) { executables[i]->set_session_module(std::move(session_modules[i])); } } @@ -523,7 +526,8 @@ Service::ExecuteParallelAndRegisterResult( // Asynchronously launch all executables. 
std::vector result_handles; - for (int64 i = 0; i < executables.size(); i++) { + for (tensorflow::gtl::ArraySlice::size_type i = 0; + i < executables.size(); i++) { TF_ASSIGN_OR_RETURN( perftools::gputools::DeviceMemoryBase result, executables[i]->ExecuteAsyncOnStream(&run_options[i], arguments[i])); diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD index acd82dc21ea395..952f24f34b28e8 100644 --- a/tensorflow/contrib/android/BUILD +++ b/tensorflow/contrib/android/BUILD @@ -72,13 +72,17 @@ LINKER_SCRIPT = "//tensorflow/contrib/android:jni/version_script.lds" cc_binary( name = "libtensorflow_inference.so", srcs = [], - copts = tf_copts(), + copts = tf_copts() + [ + "-ffunction-sections", + "-fdata-sections", + ], linkopts = if_android([ "-landroid", "-llog", "-lm", "-z defs", "-s", + "-Wl,--gc-sections", "-Wl,--version-script", # This line must be directly followed by LINKER_SCRIPT. LINKER_SCRIPT, ]), diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 043a69f264b729..3c8dc869afa0c7 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -56,9 +56,10 @@ mark_as_advanced(DOWNLOAD_LOCATION) set(CMAKE_POSITION_INDEPENDENT_CODE ON) add_definitions(-DEIGEN_AVOID_STL_ARRAY) if(WIN32) - add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC -D__VERSION__=\"MSVC\") + add_definitions(-DNOMINMAX -D_WIN32_WINNT=0x0A00 -DLANG_CXX11 -DCOMPILER_MSVC) add_definitions(-DWIN32 -DOS_WIN -D_MBCS -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI -DPLATFORM_WINDOWS) add_definitions(-DTENSORFLOW_USE_EIGEN_THREADPOOL -DEIGEN_HAS_C99_MATH -D_ITERATOR_DEBUG_LEVEL=0) + add_definitions(-DTF_COMPILE_LIBRARY) add_definitions(-DNDEBUG /O2) # Equivalent of -c opt in Bazel. add_definitions(/bigobj /nologo /EHsc /GF /FC /MP /Gm-) # Suppress warnings to reduce build log size. 
@@ -190,6 +191,7 @@ if (tensorflow_ENABLE_GPU) ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cuComplex.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cublas_v2.h ${CUDNN_HOME}/include/cudnn.h ${CUDA_TOOLKIT_TARGET_DIR}/include/cufft.h ${CUDA_TOOLKIT_TARGET_DIR}/include/curand.h + ${CUDA_TOOLKIT_TARGET_DIR}/include/cuda_runtime_api.h DESTINATION ${tensorflow_source_dir}/third_party/gpus/cuda/include ) include_directories(${tensorflow_source_dir}/third_party/gpus) diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md index 8e7f43b5119000..2641d5292d201e 100644 --- a/tensorflow/contrib/cmake/README.md +++ b/tensorflow/contrib/cmake/README.md @@ -13,7 +13,7 @@ Linux. Current Status -------------- -CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/get_started/os_setup.html#pip-installation-on-windows) +CMake can be used to build TensorFlow on Windows. See the [getting started documentation](https://www.tensorflow.org/install/install_windows) for instructions on how to install a pre-built TensorFlow package on Windows. 
### Current known limitations diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index bca700aca218c6..936196dd202dbb 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -120,3 +120,43 @@ list(REMOVE_ITEM tf_cc_srcs ${tf_cc_test_srcs}) add_library(tf_cc OBJECT ${tf_cc_srcs}) add_dependencies(tf_cc tf_cc_framework tf_cc_ops) + +set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib") +add_custom_target(tf_extension_ops) + +function(AddUserOps) + cmake_parse_arguments(_AT "" "" "TARGET;SOURCES;GPUSOURCES;DEPENDS;DISTCOPY" ${ARGN}) + if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES) + # if gpu build is enabled and we have gpu specific code, + # hint to cmake that this needs to go to nvcc + set (gpu_source ${_AT_GPUSOURCES}) + set (gpu_lib "${_AT_TARGET}_gpu") + set_source_files_properties(${gpu_source} PROPERTIES CUDA_SOURCE_PROPERTY_FORMAT OBJ) + cuda_compile(gpu_lib ${gpu_source}) + endif() + # create shared library from source and cuda obj + add_library(${_AT_TARGET} SHARED ${_AT_SOURCES} ${gpu_lib}) + target_link_libraries(${_AT_TARGET} ${pywrap_tensorflow_lib}) + if(WIN32) + if (tensorflow_ENABLE_GPU AND _AT_GPUSOURCES) + # some ops call out to cuda directly; need to link libs for the cuda dlls + target_link_libraries(${_AT_TARGET} ${CUDA_LIBRARIES}) + endif() + if (_AT_DISTCOPY) + add_custom_command(TARGET ${_AT_TARGET} POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy $ ${_AT_DISTCOPY}/) + endif() + endif() + if (_AT_DEPENDS) + add_dependencies(${_AT_TARGET} ${_AT_DEPENDS}) + endif() + # make sure TF_COMPILE_LIBRARY is not defined for this target + get_target_property(target_compile_flags ${_AT_TARGET} COMPILE_FLAGS) + if(target_compile_flags STREQUAL "target_compile_flags-NOTFOUND") + set(target_compile_flags "/UTF_COMPILE_LIBRARY") + else() + set(target_compile_flags "${target_compile_flags} /UTF_COMPILE_LIBRARY") + 
endif() + set_target_properties(${_AT_TARGET} PROPERTIES COMPILE_FLAGS ${target_compile_flags}) + add_dependencies(tf_extension_ops ${_AT_TARGET}) +endfunction(AddUserOps) diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake index 691dee9ef091ad..3787ac4c81d6c2 100644 --- a/tensorflow/contrib/cmake/tf_core_framework.cmake +++ b/tensorflow/contrib/cmake/tf_core_framework.cmake @@ -199,7 +199,6 @@ add_custom_command(OUTPUT COMMAND ${PYTHON_EXECUTABLE} ${tensorflow_source_dir}/tensorflow/tools/git/gen_git_source.py --raw_generate ${VERSION_INFO_CC} DEPENDS __force_rebuild) - set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.cc) ######################################################## @@ -238,3 +237,9 @@ add_dependencies(tf_core_framework tf_core_lib proto_text ) + +if(WIN32) + # Cmake > 3.6 will quote this as -D"__VERSION__=\"MSVC\"" which nvcc fails on. + # Instead of defining this globally, limit it to tf_core_framework where it's used. 
+ target_compile_definitions(tf_core_framework PRIVATE __VERSION__="MSVC") +endif() diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index dd28817b5468a6..33384eed4809bc 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -93,6 +93,12 @@ if(WIN32) "${tensorflow_source_dir}/tensorflow/core/kernels/meta_support.*" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.h" "${tensorflow_source_dir}/tensorflow/core/kernels/*quantiz*.cc" + # not in tensorflow.dll - comes from .so + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc" ) list(REMOVE_ITEM tf_core_kernels_srcs ${tf_core_kernels_windows_exclude_srcs}) endif(WIN32) diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake old mode 100644 new mode 100755 index 2c211542176ec9..2ecc08f421f97e --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -623,12 +623,7 @@ add_custom_command( COMMENT "Running SWIG to generate Python wrappers" VERBATIM ) -# pywrap_tensorflow_internal is a shared library containing all of the -# TensorFlow runtime and the standard ops and kernels. These are installed into -# tf_python/tensorflow/python/. -# TODO(mrry): Refactor this to expose a framework library that -# facilitates `tf.load_op_library()`. 
-add_library(pywrap_tensorflow_internal SHARED +set (pywrap_tensorflow_internal_src "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h" "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc" "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h" @@ -652,6 +647,55 @@ add_library(pywrap_tensorflow_internal SHARED "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc" "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.h" "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.cc" +) + +if(WIN32) + # Windows: build a static library with the same objects as tensorflow.dll. + # This can be used to build for a standalone exe and also helps us to + # find all symbols that need to be exported from the dll which is needed + # to provide the tensorflow c/c++ api in tensorflow.dll. + # From the static library we create the def file with all symbols that need to + # be exported from tensorflow.dll. Because there is a limit of 64K symbols + # that can be exported, we filter the symbols with a python script to the namespaces + # we need. 
+ # + add_library(pywrap_tensorflow_internal_static STATIC + ${pywrap_tensorflow_internal_src} + $ + $ + $ + $ + $ + $ + $<$:$> + $ + $<$:$> + $<$:$> + ) + target_include_directories(pywrap_tensorflow_internal_static PUBLIC + ${PYTHON_INCLUDE_DIR} + ${NUMPY_INCLUDE_DIR} + ) + target_link_libraries(pywrap_tensorflow_internal_static + tf_protos_cc + tf_python_protos_cc + ) + set(pywrap_tensorflow_deffile "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow.def") + set_source_files_properties(${pywrap_tensorflow_deffile} PROPERTIES GENERATED TRUE) + + add_custom_command(TARGET pywrap_tensorflow_internal_static POST_BUILD + COMMAND ${PYTHON_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/tools/create_def_file.py + --input $ + --output ${pywrap_tensorflow_deffile} + ) +endif(WIN32) + + +# pywrap_tensorflow_internal is a shared library containing all of the +# TensorFlow runtime and the standard ops and kernels. These are installed into +# tf_python/tensorflow/python/. +add_library(pywrap_tensorflow_internal SHARED + ${pywrap_tensorflow_internal_src} $ $ $ @@ -662,7 +706,13 @@ add_library(pywrap_tensorflow_internal SHARED $ $<$:$> $<$:$> + ${pywrap_tensorflow_deffile} ) + +if(WIN32) + add_dependencies(pywrap_tensorflow_internal pywrap_tensorflow_internal_static) +endif(WIN32) + target_include_directories(pywrap_tensorflow_internal PUBLIC ${PYTHON_INCLUDE_DIR} ${NUMPY_INCLUDE_DIR} @@ -675,6 +725,44 @@ target_link_libraries(pywrap_tensorflow_internal ${PYTHON_LIBRARIES} ) +if(WIN32) + # include contrib/rnn as .so + # + set(tf_gru_srcs + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops.h" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/gru_ops.cc" + ) + set(tf_gru_gpu_srcs + 
"${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/gru_ops_gpu.cu.cc" + ) + + set(tf_lstm_srcs + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/blas_gemm.h" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops.h" + "${tensorflow_source_dir}/tensorflow/contrib/rnn/ops/lstm_ops.cc" + ) + set(tf_lstm_gpu_srcs + "${tensorflow_source_dir}/tensorflow/contrib/rnn/kernels/lstm_ops_gpu.cu.cc" + ) + + AddUserOps(TARGET _gru_ops + SOURCES "${tf_gru_srcs}" + GPUSOURCES ${tf_gru_gpu_srcs} + DEPENDS pywrap_tensorflow_internal tf_python_ops + DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/) + + AddUserOps(TARGET _lstm_ops + SOURCES "${tf_lstm_srcs}" + GPUSOURCES ${tf_lstm_gpu_srcs} + DEPENDS pywrap_tensorflow_internal tf_python_ops + DISTCOPY ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/rnn/python/ops/) +endif(WIN32) + ############################################################ # Build a PIP package containing the TensorFlow runtime. 
############################################################ @@ -684,14 +772,17 @@ add_dependencies(tf_python_build_pip_package tensorboard_copy_dependencies tf_python_copy_scripts_to_destination tf_python_touchup_modules - tf_python_ops) + tf_python_ops + tf_extension_ops) add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/setup.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) if(WIN32) add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.dll - ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd) + ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.pyd + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/pywrap_tensorflow_internal.lib + ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/) else() add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake index 711b5c49f40b35..449a762a9aa4d9 100644 --- a/tensorflow/contrib/cmake/tf_tests.cmake +++ b/tensorflow/contrib/cmake/tf_tests.cmake @@ -115,7 +115,14 @@ if (tensorflow_BUILD_PYTHON_TESTS) # # include all test + if (WIN32) + file(GLOB_RECURSE tf_test_rnn_src_py + "${tensorflow_source_dir}/tensorflow/contrib/rnn/python/kernel_tests/*_test.py" + ) + endif() + file(GLOB_RECURSE tf_test_src_py + ${tf_test_rnn_src_py} "${tensorflow_source_dir}/tensorflow/python/debug/cli/*_test.py" "${tensorflow_source_dir}/tensorflow/python/debug/lib/*_test.py" "${tensorflow_source_dir}/tensorflow/python/debug/wrappers/*_test.py" diff --git a/tensorflow/contrib/cmake/tf_tools.cmake b/tensorflow/contrib/cmake/tf_tools.cmake index 
2aaa9ed53ebbf1..5151fdb444f75c 100644 --- a/tensorflow/contrib/cmake/tf_tools.cmake +++ b/tensorflow/contrib/cmake/tf_tools.cmake @@ -106,3 +106,22 @@ target_link_libraries(${compare_graphs} PUBLIC ${tf_core_gpu_kernels_lib} ${tensorflow_EXTERNAL_LIBRARIES} ) + +set(benchmark_model "benchmark_model") + +add_executable(${benchmark_model} + "${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model.cc" + "${tensorflow_source_dir}/tensorflow/tools/benchmark/benchmark_model_main.cc" + $ + $ + $ + $ + $ + $ +) + +target_link_libraries(${benchmark_model} PUBLIC + tf_protos_cc + ${tf_core_gpu_kernels_lib} + ${tensorflow_EXTERNAL_LIBRARIES} +) diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py new file mode 100644 index 00000000000000..950c8f79bc9622 --- /dev/null +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -0,0 +1,134 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +""" +create_def_file.py - tool to create a windows def file to export +symbols from tensorflow.dll to enable tf.load_library(). +Because the linker allows only 64K symbols to be exported per dll +we filter the symbols down to the essentials. The regular expressions +we use for this are specific to tensorflow. 
+ +TODO: this works fine but there is an issue with exporting +'const char * const' and importing it from a user_ops. The problem is +on the importing end and using __declspec(dllimport) works around it. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import io +import os +import re +import sys +import tempfile +from subprocess import Popen, PIPE + +# External tools we use that come with visual studio sdk and +# we assume that the caller has the correct PATH to the sdk +UNDNAME = "undname.exe" +DUMPBIN = "dumpbin.exe" + +# Exclude if matched +EXCLUDE_RE = re.compile(r"deleting destructor|::internal::") + +# Include if matched before exclude +INCLUDEPRE_RE = re.compile(r"tensorflow::internal::LogMessage|" + + r"tensorflow::internal::CheckOpMessageBuilder") + +# Include if matched after exclude +INCLUDE_RE = re.compile(r"^(TF_\w*)$|" + + r"tensorflow::|" + + r"functor::|" + + r"perftools::gputools") + + +def get_args(): + """Parse command line.""" + parser = argparse.ArgumentParser() + parser.add_argument("--input", help="input library", required=True) + parser.add_argument("--output", help="output deffile", required=True) + args = parser.parse_args() + return args + + +def main(): + """main.""" + args = get_args() + + # Pipe dumpbin to extract all linkable symbols from a lib. + # Good symbols are collected in candidates and also written to + # a temp file. 
+ candidates = [] + tmpfile = tempfile.NamedTemporaryFile(mode="w", delete=False) + proc = Popen([DUMPBIN, "/nologo", "/linkermember:1", args.input], stdout=PIPE) + for line in io.TextIOWrapper(proc.stdout, encoding="utf-8"): + cols = line.split() + if len(cols) < 2: + continue + sym = cols[1] + tmpfile.file.write(sym + "\n") + candidates.append(sym) + tmpfile.file.close() + exit_code = proc.wait() + if exit_code != 0: + print("{} failed, exit={}".format(DUMPBIN, exit_code)) + return exit_code + + # Run the symbols through undname to get their undecorated name + # so we can filter on something readable. + with open(args.output, "w") as def_fp: + # track dupes + taken = set() + + # Header for the def file. Since the tensorflow.dll is actually called + # _pywrap_tensorflow_internal.pyd in the python wheel, hint that in the def file. + def_fp.write("LIBRARY _pywrap_tensorflow_internal.pyd\n") + def_fp.write("EXPORTS\n") + def_fp.write("\t ??1OpDef@tensorflow@@UEAA@XZ\n") + + # Each symbol returned by undname matches the same position in candidates. + # We compare on undname but use the decorated name from candidates. + dupes = 0 + proc = Popen([UNDNAME, tmpfile.name], stdout=PIPE) + for idx, line in enumerate(io.TextIOWrapper(proc.stdout, encoding="utf-8")): + decorated = candidates[idx] + if decorated in taken: + # Symbol is already in output, done. 
+ dupes += 1 + continue + + if not INCLUDEPRE_RE.search(line): + if EXCLUDE_RE.search(line): + continue + if not INCLUDE_RE.search(line): + continue + + def_fp.write("\t" + decorated + "\n") + taken.add(decorated) + exit_code = proc.wait() + if exit_code != 0: + print("{} failed, exit={}".format(UNDNAME, exit_code)) + return exit_code + + os.unlink(tmpfile.name) + + print("symbols={}, taken={}, dupes={}" + .format(len(candidates), len(taken), dupes)) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py index 4cd3efafa0e71e..5d078236ac331c 100644 --- a/tensorflow/contrib/framework/python/framework/checkpoint_utils.py +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils.py @@ -280,10 +280,11 @@ def init_from_checkpoint(checkpoint_dir, assignment_map): for var_name in scope_variables: # Lookup name with specified prefix and suffix from current variable. # If tensor_name given is '/' (root), don't use it for full name. 
+ full_tensor_name = var_name[len(scopes):] + if current_var_or_name != "/": + full_tensor_name = full_tensor_name[1:] if tensor_name_in_ckpt != "/": - full_tensor_name = tensor_name_in_ckpt + var_name[len(scopes) + 1:] - else: - full_tensor_name = var_name[len(scopes) + 1:] + full_tensor_name = tensor_name_in_ckpt + full_tensor_name if full_tensor_name not in variable_map: raise ValueError( "Tensor %s (%s in %s) is not found in %s checkpoint" % ( diff --git a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py index 09eecb56dcb162..51ca5ec1251dd9 100644 --- a/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py +++ b/tensorflow/contrib/framework/python/framework/checkpoint_utils_test.py @@ -168,6 +168,29 @@ def testInitFromRootCheckpoint(self): self.assertAllEqual(my3.eval(session), v3) self.assertAllEqual(my4.eval(session), v4) + def testInitToRootCheckpoint(self): + checkpoint_dir = self.get_temp_dir() + with self.test_session() as session: + v1, v2, v3, v4 = _create_checkpoints(session, checkpoint_dir) + + # New graph and session. 
+ with ops.Graph().as_default() as g: + with self.test_session(graph=g) as session: + my1 = variable_scope.get_variable("var1", [1, 10]) + my2 = variable_scope.get_variable("var2", [10, 10]) + my3 = variable_scope.get_variable("var3", [100, 100]) + with variable_scope.variable_scope("useful_scope"): + my4 = variable_scope.get_variable("var4", [9, 9]) + + checkpoint_utils.init_from_checkpoint(checkpoint_dir, + {"/": "/",}) + + session.run(variables.global_variables_initializer()) + self.assertAllEqual(my1.eval(session), v1) + self.assertAllEqual(my2.eval(session), v2) + self.assertAllEqual(my3.eval(session), v3) + self.assertAllEqual(my4.eval(session), v4) + def testInitFromPartitionVar(self): checkpoint_dir = self.get_temp_dir() with self.test_session() as session: diff --git a/tensorflow/contrib/framework/python/ops/arg_scope.py b/tensorflow/contrib/framework/python/ops/arg_scope.py index b7ec9ba9369dfb..ad84cd681aa2cc 100644 --- a/tensorflow/contrib/framework/python/ops/arg_scope.py +++ b/tensorflow/contrib/framework/python/ops/arg_scope.py @@ -30,11 +30,15 @@ net = layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', scope='conv1') net = layers.conv2d(net, 256, [5, 5], scope='conv2') ``` - The first call to conv2d will use predefined args: - layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', ..., scope='conv1') + The first call to conv2d will behave as follows: + layers.conv2d(inputs, 64, [11, 11], 4, padding='VALID', + initializer=layers.variance_scaling_initializer(), + regularizer=layers.l2_regularizer(0.05), scope='conv1') - The second call to conv2d will overwrite padding: - layers.conv2d(inputs, 256, [5, 5], padding='SAME', ..., scope='conv2') + The second call to conv2d will also use the arg_scope's default for padding: + layers.conv2d(inputs, 256, [5, 5], padding='SAME', + initializer=layers.variance_scaling_initializer(), + regularizer=layers.l2_regularizer(0.05), scope='conv2') Example of how to reuse an arg_scope: @@ -49,7 +53,7 @@ net = 
layers.conv2d(net, 256, [5, 5], scope='conv2') ``` - Example of how to use tf.contrib.framework.add_arg_scope: + Example of how to use tf.contrib.framework.add_arg_scope to enable your function to be called within an arg_scope later: @tf.contrib.framework.add_arg_scope def conv2d(*args, **kwargs) diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py index e746107e361d05..6ba8f7e8aec2cf 100644 --- a/tensorflow/contrib/layers/__init__.py +++ b/tensorflow/contrib/layers/__init__.py @@ -40,6 +40,7 @@ @@softmax @@stack @@unit_norm +@@bow_encoder @@embed_sequence @@apply_regularization diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index adbbcea02fa008..07be8e9990ff1e 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -160,9 +160,8 @@ def _fused_batch_norm( they need to be added as a dependency to the `train_op`, example: update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - if update_ops: - updates = tf.group(*update_ops) - total_loss = control_flow_ops.with_dependencies([updates], total_loss) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(loss) One can set updates_collections=None to force the updates in place, but that can have speed penalty, especially in distributed settings. @@ -393,9 +392,8 @@ def batch_norm(inputs, they need to be added as a dependency to the `train_op`, example: update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) - if update_ops: - updates = tf.group(*update_ops) - total_loss = control_flow_ops.with_dependencies([updates], total_loss) + with tf.control_dependencies(update_ops): + train_op = optimizer.minimize(loss) One can set updates_collections=None to force the updates in place, but that can have speed penalty, especially in distributed settings. 
diff --git a/tensorflow/contrib/learn/__init__.py b/tensorflow/contrib/learn/__init__.py index 85cef3d8db2c38..bd56066b1bc95b 100644 --- a/tensorflow/contrib/learn/__init__.py +++ b/tensorflow/contrib/learn/__init__.py @@ -33,6 +33,7 @@ @@DNNLinearCombinedRegressor @@DNNLinearCombinedEstimator @@DNNLinearCombinedClassifier +@@DynamicRnnEstimator @@LinearClassifier @@LinearEstimator @@LinearRegressor diff --git a/tensorflow/contrib/learn/python/learn/README.md b/tensorflow/contrib/learn/python/learn/README.md index f412c83a97793e..0aae178e9ac6d6 100644 --- a/tensorflow/contrib/learn/python/learn/README.md +++ b/tensorflow/contrib/learn/python/learn/README.md @@ -20,18 +20,17 @@ Optionally you can install [scikit-learn](http://scikit-learn.org/stable/) and [ ### Tutorials -- [TF Learn Quickstart](../../../../g3doc/tutorials/tflearn/index.md). Build, +- [TF Learn Quickstart](https://www.tensorflow.org/get_started/tflearn). Build, train, and evaluate a neural network with just a few lines of code. -- [Input Functions](../../../../g3doc/tutorials/input_fn/index.md). Learn how +- [Input Functions](https://www.tensorflow.org/get_started/input_fn). Learn how to create input functions to feed data into your models. -- [Linear Model](../../../../g3doc/tutorials/wide/index.md). Learn the basics +- [Linear Model](https://www.tensorflow.org/tutorials/wide). Learn the basics of building linear models. -- [Wide and Deep - Learning](../../../../g3doc/tutorials/wide_and_deep/index.md). Jointly train - a linear model and a deep neural network. -- [Logging and Monitoring](../../../../g3doc/tutorials/monitors/index.md). Use - the Monitor API to audit training of a neural network. -- [Custom Estimators](../../../../g3doc/tutorials/estimators/index.md). Learn +- [Wide and Deep Learning](https://www.tensorflow.org/tutorials/wide_and_deep). + Jointly train a linear model and a deep neural network. +- [Logging and Monitoring](https://www.tensorflow.org/get_started/monitors). 
+ Use the Monitor API to audit training of a neural network. +- [Custom Estimators](https://www.tensorflow.org/extend/estimators). Learn how to create a custom estimator. - More coming soon. diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py index 6d591c42c65bf0..7a9529694513e1 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py +++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py @@ -1108,7 +1108,7 @@ def _call_model_fn(self, features, labels, mode): if isinstance(model_fn_results, model_fn_lib.ModelFnOps): return model_fn_results - # Here model_fn_ops should be a tuple with 3 elements. + # Here model_fn_results should be a tuple with 3 elements. if len(model_fn_results) != 3: raise ValueError('Unrecognized value returned by model_fn, ' 'please return ModelFnOps.') diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index faf78a36752f33..d7f1017a46a8d8 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -149,21 +149,16 @@ def _linear_model_fn(features, labels, mode, params, config=None): values=tuple(six.itervalues(features)), partitioner=partitioner) as scope: if joint_weights: - logits, _, _ = ( - layers.joint_weighted_sum_from_feature_columns( - columns_to_tensors=features, - feature_columns=feature_columns, - num_outputs=head.logits_dimension, - weight_collections=[parent_scope], - scope=scope)) + layer_fn = layers.joint_weighted_sum_from_feature_columns else: - logits, _, _ = ( - layers.weighted_sum_from_feature_columns( - columns_to_tensors=features, - feature_columns=feature_columns, - num_outputs=head.logits_dimension, - weight_collections=[parent_scope], - scope=scope)) + layer_fn = layers.weighted_sum_from_feature_columns + + logits, _, _ = layer_fn( + 
columns_to_tensors=features, + feature_columns=feature_columns, + num_outputs=head.logits_dimension, + weight_collections=[parent_scope], + scope=scope) def _train_op_fn(loss): global_step = contrib_variables.get_global_step() diff --git a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py index c164a12b1d2325..09f19ad2748ba3 100644 --- a/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py +++ b/tensorflow/contrib/learn/python/learn/tests/dataframe/tensorflow_dataframe_test.py @@ -63,57 +63,54 @@ def _assert_df_equals_dict(expected_df, actual_dict): actual_dict[col])) -def _make_test_csv(): - f = tempfile.NamedTemporaryFile( - dir=test.get_temp_dir(), delete=False, mode="w") - w = csv.writer(f) - w.writerow(["int", "float", "bool", "string"]) - for _ in range(100): - intvalue = np.random.randint(-10, 10) - floatvalue = np.random.rand() - boolvalue = int(np.random.rand() > 0.3) - stringvalue = "S: %.4f" % np.random.rand() - - row = [intvalue, floatvalue, boolvalue, stringvalue] - w.writerow(row) - f.close() - return f.name - - -def _make_test_csv_sparse(): - f = tempfile.NamedTemporaryFile( - dir=test.get_temp_dir(), delete=False, mode="w") - w = csv.writer(f) - w.writerow(["int", "float", "bool", "string"]) - for _ in range(100): - # leave columns empty; these will be read as default value (e.g. 
0 or NaN) - intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else "" - floatvalue = np.random.rand() if np.random.rand() > 0.5 else "" - boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else "" - stringvalue = (("S: %.4f" % np.random.rand()) if np.random.rand() > 0.5 else - "") - - row = [intvalue, floatvalue, boolvalue, stringvalue] - w.writerow(row) - f.close() - return f.name - - -def _make_test_tfrecord(): - f = tempfile.NamedTemporaryFile(dir=test.get_temp_dir(), delete=False) - w = tf_record.TFRecordWriter(f.name) - for i in range(100): - ex = example_pb2.Example() - ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3))) - ex.features.feature["fixed_len_float"].float_list.value.extend( - [float(i), 2 * float(i)]) - w.write(ex.SerializeToString()) - return f.name - - class TensorFlowDataFrameTestCase(test.TestCase): """Tests for `TensorFlowDataFrame`.""" + def _make_test_csv(self): + f = tempfile.NamedTemporaryFile( + dir=self.get_temp_dir(), delete=False, mode="w") + w = csv.writer(f) + w.writerow(["int", "float", "bool", "string"]) + for _ in range(100): + intvalue = np.random.randint(-10, 10) + floatvalue = np.random.rand() + boolvalue = int(np.random.rand() > 0.3) + stringvalue = "S: %.4f" % np.random.rand() + + row = [intvalue, floatvalue, boolvalue, stringvalue] + w.writerow(row) + f.close() + return f.name + + def _make_test_csv_sparse(self): + f = tempfile.NamedTemporaryFile( + dir=self.get_temp_dir(), delete=False, mode="w") + w = csv.writer(f) + w.writerow(["int", "float", "bool", "string"]) + for _ in range(100): + # leave columns empty; these will be read as default value (e.g. 
0 or NaN) + intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else "" + floatvalue = np.random.rand() if np.random.rand() > 0.5 else "" + boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else "" + stringvalue = (("S: %.4f" % np.random.rand()) if np.random.rand() > 0.5 else + "") + + row = [intvalue, floatvalue, boolvalue, stringvalue] + w.writerow(row) + f.close() + return f.name + + def _make_test_tfrecord(self): + f = tempfile.NamedTemporaryFile(dir=self.get_temp_dir(), delete=False) + w = tf_record.TFRecordWriter(f.name) + for i in range(100): + ex = example_pb2.Example() + ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3))) + ex.features.feature["fixed_len_float"].float_list.value.extend( + [float(i), 2 * float(i)]) + w.write(ex.SerializeToString()) + return f.name + def _assert_pandas_equals_tensorflow(self, pandas_df, tensorflow_df, num_batches, batch_size): self.assertItemsEqual( @@ -190,7 +187,7 @@ def testFromCSV(self): batch_size = 8 enqueue_size = 7 - data_path = _make_test_csv() + data_path = self._make_test_csv() default_values = [0, 0.0, 0, ""] pandas_df = pd.read_csv(data_path) @@ -211,7 +208,7 @@ def testFromCSVLimitEpoch(self): num_epochs = 17 expected_num_batches = (num_epochs * 100) // batch_size - data_path = _make_test_csv() + data_path = self._make_test_csv() default_values = [0, 0.0, 0, ""] tensorflow_df = df.TensorFlowDataFrame.from_csv( @@ -234,7 +231,7 @@ def testFromCSVWithFeatureSpec(self): num_batches = 100 batch_size = 8 - data_path = _make_test_csv_sparse() + data_path = self._make_test_csv_sparse() feature_spec = { "int": parsing_ops.FixedLenFeature(None, dtypes.int16, np.nan), "float": parsing_ops.VarLenFeature(dtypes.float16), @@ -270,7 +267,7 @@ def testFromExamples(self): enqueue_size = 11 batch_size = 13 - data_path = _make_test_tfrecord() + data_path = self._make_test_tfrecord() features = { "fixed_len_float": parsing_ops.FixedLenFeature( @@ -318,7 +315,7 @@ def 
testSplitString(self): num_epochs = 17 expected_num_batches = (num_epochs * 100) // batch_size - data_path = _make_test_csv() + data_path = self._make_test_csv() default_values = [0, 0.0, 0, ""] tensorflow_df = df.TensorFlowDataFrame.from_csv( diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py index 1e5795d0356b7e..c1ba9d4eadf54f 100644 --- a/tensorflow/contrib/metrics/python/ops/metric_ops.py +++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py @@ -261,7 +261,7 @@ def streaming_false_negatives(predictions, labels, weights=None, metrics_collections=None, updates_collections=None, name=None): - """Computes the total number of false positives. + """Computes the total number of false negatives. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. diff --git a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py index 755ebd048b5892..f44302638eb994 100644 --- a/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py +++ b/tensorflow/contrib/rnn/python/ops/core_rnn_cell_impl.py @@ -13,7 +13,14 @@ # limitations under the License. # ============================================================================== -"""Module implementing RNN Cells.""" +"""Module implementing RNN Cells. + +This module provides a number of basic commonly used RNN cells, such as LSTM +(Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number of +operators that allow adding dropouts, projections, or embeddings for inputs. +Constructing multi-layer cells is supported by the class `MultiRNNCell`, or by +calling the `rnn` ops several times. +""" from __future__ import absolute_import from __future__ import division @@ -146,12 +153,12 @@ def __call__(self, inputs, state, scope=None): with _checked_scope(self, scope or "gru_cell", reuse=self._reuse): with vs.variable_scope("gates"): # Reset gate and update gate. 
# We start with bias of 1.0 to not reset and not update. + value = sigmoid(_linear( + [inputs, state], 2 * self._num_units, True, 1.0)) r, u = array_ops.split( - value=_linear( - [inputs, state], 2 * self._num_units, True, 1.0), + value=value, num_or_size_splits=2, axis=1) - r, u = sigmoid(r), sigmoid(u) with vs.variable_scope("candidate"): c = self._activation(_linear([inputs, r * state], self._num_units, True)) diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py index c1ec46d763773d..318b552f4a7128 100644 --- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py +++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py @@ -70,7 +70,7 @@ def _lstm_block_cell(x, cs = ci .* i + cs_prev .* f cs = clip(cs, cell_clip) - o = sigmoid(cs * wco + f) + o = sigmoid(cs * wco + o) co = tanh(cs) h = co .* o ``` diff --git a/tensorflow/contrib/seq2seq/python/ops/helper.py b/tensorflow/contrib/seq2seq/python/ops/helper.py index 34367db01bfc9a..616de3199c2001 100644 --- a/tensorflow/contrib/seq2seq/python/ops/helper.py +++ b/tensorflow/contrib/seq2seq/python/ops/helper.py @@ -486,7 +486,7 @@ def sample(self, time, outputs, state, name=None): # Outputs are logits, use argmax to get the most probable id if not isinstance(outputs, ops.Tensor): raise TypeError("Expected outputs to be a single Tensor, got: %s" % - outputs) + type(outputs)) sample_ids = math_ops.cast( math_ops.argmax(outputs, axis=-1), dtypes.int32) return sample_ids diff --git a/tensorflow/contrib/seq2seq/python/ops/loss.py b/tensorflow/contrib/seq2seq/python/ops/loss.py index e14f07bc09999c..61852eda4f07e6 100644 --- a/tensorflow/contrib/seq2seq/python/ops/loss.py +++ b/tensorflow/contrib/seq2seq/python/ops/loss.py @@ -44,8 +44,7 @@ def sequence_loss(logits, targets, weights, sequence. When using weights as masking set all valid timesteps to 1 and all padded timesteps to 0. 
average_across_timesteps: If set, sum the cost across the sequence - dimension and divide by the cost by the total label weight across - timesteps. + dimension and divide the cost by the total label weight across timesteps. average_across_batch: If set, sum the cost across the batch dimension and divide the returned cost by the batch size. softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch diff --git a/tensorflow/contrib/slim/README.md b/tensorflow/contrib/slim/README.md index 898d3a11d0017c..94b0263ae8693f 100644 --- a/tensorflow/contrib/slim/README.md +++ b/tensorflow/contrib/slim/README.md @@ -229,7 +229,7 @@ net = ... net = slim.conv2d(net, 256, [3, 3], scope='conv3_1') net = slim.conv2d(net, 256, [3, 3], scope='conv3_2') net = slim.conv2d(net, 256, [3, 3], scope='conv3_3') -net = slim.max_pool2d(net, [2, 2], scope='pool3') +net = slim.max_pool2d(net, [2, 2], scope='pool2') ``` One way to reduce this code duplication would be via a `for` loop: @@ -238,14 +238,14 @@ One way to reduce this code duplication would be via a `for` loop: net = ... 
for i in range(3): net = slim.conv2d(net, 256, [3, 3], scope='conv3_' % (i+1)) -net = slim.max_pool2d(net, [2, 2], scope='pool3') +net = slim.max_pool2d(net, [2, 2], scope='pool2') ``` This can be made even cleaner by using TF-Slim's `repeat` operation: ```python net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], scope='conv3') -net = slim.max_pool(net, [2, 2], scope='pool2') +net = slim.max_pool2d(net, [2, 2], scope='pool2') ``` Notice that the `slim.repeat` not only applies the same argument in-line, it diff --git a/tensorflow/contrib/util/loader.py b/tensorflow/contrib/util/loader.py index 95657217a00ce8..c2ae425b56a9da 100644 --- a/tensorflow/contrib/util/loader.py +++ b/tensorflow/contrib/util/loader.py @@ -21,6 +21,7 @@ from __future__ import print_function import os +import re from tensorflow.python.framework import load_library from tensorflow.python.platform import resource_loader @@ -29,9 +30,9 @@ def load_op_library(path): """Loads a contrib op library from the given path. - NOTE(mrry): On Windows, we currently assume that contrib op + NOTE(mrry): On Windows, we currently assume that some contrib op libraries are statically linked into the main TensorFlow Python - extension DLL. + extension DLL - use dynamically linked ops if the .so is present. Args: path: An absolute path to a shared object file. @@ -40,11 +41,17 @@ def load_op_library(path): A Python module containing the Python wrappers for Ops defined in the plugin. """ - if os.name != 'nt': - path = resource_loader.get_path_to_datafile(path) - ret = load_library.load_op_library(path) - assert ret, 'Could not load %s' % path - return ret - else: - # NOTE(mrry): - return None + if os.name == 'nt': + # To avoid making every user_ops aware of windows, re-write + # the file extension from .so to .dll. + path = re.sub('\.so$', '.dll', path) + + # TODO: currently we have only some user_ops as .dll's on windows - don't try + # to load them if the dll is not found. 
Once we have all of them + # this check should be removed. + if not os.path.exists(path): + return None + path = resource_loader.get_path_to_datafile(path) + ret = load_library.load_op_library(path) + assert ret, 'Could not load %s' % path + return ret diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index ea434c3eb20813..79d44c5a0c7a91 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -339,6 +339,7 @@ tf_cuda_library( hdrs = [ "example/feature_util.h", "framework/allocator.h", + "framework/allocator_registry.h", "framework/attr_value_util.h", "framework/bfloat16.h", "framework/cancellation.h", @@ -408,7 +409,9 @@ tf_cuda_library( "util/memmapped_file_system.h", "util/memmapped_file_system_writer.h", ], - }), + }) + if_mkl([ + "util/mkl_util.h", + ]), visibility = ["//visibility:public"], deps = [":framework_internal"], ) @@ -707,7 +710,9 @@ cc_library( "//tensorflow/core/kernels:math_not_windows", "//tensorflow/core/kernels:quantized_ops", ]) + if_mkl([ - "//tensorflow/core/kernels:mkl_ops", + "//tensorflow/core/kernels:mkl_conv_op", + "//tensorflow/core/kernels:mkl_matmul_op", + "//tensorflow/core/kernels:mkl_tfconv_op", ]), ) @@ -772,7 +777,7 @@ cc_library( "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:ops_testutil", "//tensorflow/core/kernels:ops_util", - "//tensorflow/core/platform/default/build_config:gtest", # + if_sycl([":sycl_runtime"]), + "//tensorflow/core/platform/default/build_config:gtest", # + if_sycl([":sycl_runtime"]) ], ) @@ -1393,7 +1398,7 @@ tf_cuda_library( ":version_lib", "//tensorflow/core/kernels:bounds_check", "//third_party/eigen3", - ], + ] + if_mkl(["//third_party/mkl:intel_binary_blob"]), alwayslink = 1, ) @@ -1482,20 +1487,21 @@ tf_cuda_library( ), copts = tf_copts(), deps = [ - ":framework", - ":framework_internal", - ":function_ops_op_lib", - ":functional_grad", - ":functional_ops_op_lib", - ":lib", - ":lib_internal", - ":proto_text", - ":protos_all_cc", - 
"//tensorflow/core/grappler:grappler_item", - "//tensorflow/core/grappler/optimizers:meta_optimizer", - "//third_party/eigen3", - "//tensorflow/core/kernels:required", - ] + tf_additional_core_deps(), + ":framework", + ":framework_internal", + ":function_ops_op_lib", + ":functional_grad", + ":functional_ops_op_lib", + ":lib", + ":lib_internal", + ":proto_text", + ":protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/optimizers:meta_optimizer", + "//third_party/eigen3", + "//tensorflow/core/kernels:required", + ] + if_mkl(["//third_party/mkl:intel_binary_blob"]) + + tf_additional_core_deps(), alwayslink = 1, ) @@ -2037,33 +2043,38 @@ tf_cc_tests( ], ) -if_mkl( - tf_cc_test_mkl( - name = "mkl_related_tests", - size = "small", - srcs = ["graph/mkl_optimizer_merge_test.cc"], - linkstatic = tf_kernel_tests_linkstatic(), - deps = [ - ":core", - ":core_cpu", - ":core_cpu_internal", - ":direct_session_internal", - ":framework", - ":framework_internal", - ":lib", - ":lib_internal", - ":ops", - ":protos_all_cc", # under if_mkl - ":test", - ":test_main", - ":testlib", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:scope", - "//tensorflow/cc:sendrecv_ops", - "//tensorflow/core/kernels:ops_util", - "//third_party/eigen3", - ], - ), +tf_cc_test_mkl( + name = "mkl_related_tests", + size = "small", + srcs = [ + "graph/mkl_layout_pass_test.cc", + "graph/mkl_optimizer_merge_test.cc", + "graph/mkl_tfconversion_pass_test.cc", + ], + linkstatic = tf_kernel_tests_linkstatic(), + deps = [ + ":core", + ":core_cpu", + ":core_cpu_internal", + ":direct_session_internal", + ":framework", + ":framework_internal", + ":lib", + ":lib_internal", + ":ops", + ":protos_all_cc", + ":test", + ":test_main", + ":testlib", + "//tensorflow/cc:cc_ops", + "//tensorflow/cc:scope", + "//tensorflow/cc:sendrecv_ops", + "//tensorflow/core/kernels:mkl_conv_op", + "//tensorflow/core/kernels:mkl_matmul_op", + "//tensorflow/core/kernels:mkl_tfconv_op", + 
"//tensorflow/core/kernels:ops_util", + "//third_party/eigen3", + ], ) tf_cc_tests_gpu( diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h new file mode 100644 index 00000000000000..41bf23be27083e --- /dev/null +++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h @@ -0,0 +1,120 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A simple CPU allocator that intercepts malloc/free calls from MKL library +// and redirects them to Tensorflow allocator + +#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ +#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ + +#ifdef INTEL_MKL + +#include +#include "tensorflow/core/common_runtime/bfc_allocator.h" +#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/platform/mem.h" + +#include "third_party/mkl/include/i_malloc.h" + +namespace tensorflow { + +class MklSubAllocator : public SubAllocator { + public: + ~MklSubAllocator() override {} + + void* Alloc(size_t alignment, size_t num_bytes) override { + return port::AlignedMalloc(num_bytes, alignment); + } + void Free(void* ptr, size_t num_bytes) override { port::AlignedFree(ptr); } +}; + +/// CPU allocator for MKL that wraps BFC allocator and intercepts +/// and redirects memory allocation calls from MKL. 
+class MklCPUAllocator : public Allocator { + public: + // Constructor and other standard functions + + MklCPUAllocator() { + VLOG(2) << "MklCPUAllocator: In MklCPUAllocator"; + allocator_ = + new BFCAllocator(new MklSubAllocator, kMaxMemSize, kAllowGrowth, kName); + + // For redirecting all allocations from MKL to this allocator + // From: http://software.intel.com/en-us/node/528565 + i_malloc = MallocHook; + i_calloc = CallocHook; + i_realloc = ReallocHook; + i_free = FreeHook; + } + + ~MklCPUAllocator() override { delete allocator_; } + + inline string Name() override { return kName; } + + inline void* AllocateRaw(size_t alignment, size_t num_bytes) override { + return allocator_->AllocateRaw(alignment, num_bytes); + } + + inline void DeallocateRaw(void* ptr) override { + allocator_->DeallocateRaw(ptr); + } + + private: + // Hooks provided by this allocator for memory allocation routines from MKL + + static inline void* MallocHook(size_t size) { + VLOG(2) << "MklCPUAllocator: In MallocHook"; + return cpu_allocator()->AllocateRaw(kAlignment, size); + } + + static inline void FreeHook(void* ptr) { + VLOG(2) << "MklCPUAllocator: In FreeHook"; + cpu_allocator()->DeallocateRaw(ptr); + } + + static inline void* CallocHook(size_t num, size_t size) { + Status s = Status(error::Code::UNIMPLEMENTED, + "Unimplemented case for hooking MKL function."); + TF_CHECK_OK(s); // way to assert with an error message + } + + static inline void* ReallocHook(void* ptr, size_t size) { + Status s = Status(error::Code::UNIMPLEMENTED, + "Unimplemented case for hooking MKL function."); + TF_CHECK_OK(s); // way to assert with an error message + } + + // TODO(jbobba): We should ideally move this into CPUOptions in config.proto. 
+ /// Memory limit - 64GB + static const size_t kMaxMemSize = + static_cast(64) * 1024 * 1024 * 1024; + + /// Do we allow growth in BFC Allocator + static const bool kAllowGrowth = true; + + /// Name + static constexpr const char* kName = "mklcpu"; + + /// The alignment that we need for the allocations + static const size_t kAlignment = 64; + + Allocator* allocator_; // owned by this class +}; + +} // namespace tensorflow + +#endif // INTEL_MKL + +#endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 686bc6885e06c9..ca6ba7970f0352 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -17,6 +17,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/allocator_registry.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor.pb_text.h" @@ -27,6 +28,10 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session_options.h" +#ifdef INTEL_MKL +#include "tensorflow/core/common_runtime/mkl_cpu_allocator.h" +#endif + namespace tensorflow { ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, @@ -70,4 +75,8 @@ Status ThreadPoolDevice::MakeTensorFromProto( ProtoDebugString(tensor_proto)); } +#ifdef INTEL_MKL +REGISTER_MEM_ALLOCATOR("MklCPUAllocator", 200, MklCPUAllocator); +#endif + } // namespace tensorflow diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc index ff31ad965b0a8b..943dcab36269db 100644 --- a/tensorflow/core/framework/allocator.cc +++ b/tensorflow/core/framework/allocator.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/allocator_registry.h" #include "tensorflow/core/framework/log_memory.h" #include "tensorflow/core/framework/tracking_allocator.h" #include "tensorflow/core/lib/strings/stringprintf.h" @@ -119,11 +120,13 @@ Allocator* MakeCpuAllocator() { } // namespace Allocator* cpu_allocator() { - static Allocator* cpu_alloc = MakeCpuAllocator(); + static Allocator* cpu_alloc = AllocatorRegistry::Global()->GetAllocator(); if (cpu_allocator_collect_full_stats && !cpu_alloc->TracksAllocationSizes()) { cpu_alloc = new TrackingAllocator(cpu_alloc, true); } return cpu_alloc; } +REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator", 100, CPUAllocator); + } // namespace tensorflow diff --git a/tensorflow/core/framework/allocator_registry.cc b/tensorflow/core/framework/allocator_registry.cc new file mode 100644 index 00000000000000..792b1ceb5ad300 --- /dev/null +++ b/tensorflow/core/framework/allocator_registry.cc @@ -0,0 +1,66 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/core/framework/allocator_registry.h" +#include "tensorflow/core/platform/logging.h" + + +namespace tensorflow { + +// static +AllocatorRegistry* AllocatorRegistry::Global() { + static AllocatorRegistry* global_allocator_registry = new AllocatorRegistry; + return global_allocator_registry; +} + +bool AllocatorRegistry::CheckForDuplicates(const string& name, int priority) { + for (auto entry : allocators_) { + if (!name.compare(entry.name) && priority == entry.priority) { + return true; + } + } + return false; +} + +void AllocatorRegistry::Register(const string& name, int priority, + Allocator* allocator) { + CHECK(!name.empty()) << "Need a valid name for Allocator"; + CHECK_GE(priority, 0) << "Priority needs to be non-negative"; + CHECK(!CheckForDuplicates(name, priority)) << "Allocator with name: [" << name + << "] and priority: [" << priority + << "] already registered"; + + AllocatorRegistryEntry tmp_entry; + tmp_entry.name = name; + tmp_entry.priority = priority; + tmp_entry.allocator = allocator; + + allocators_.push_back(tmp_entry); + int high_pri = -1; + for (auto entry : allocators_) { + if (high_pri < entry.priority) { + m_curr_allocator_ = entry.allocator; + high_pri = entry.priority; + } + } +} + +Allocator* AllocatorRegistry::GetAllocator() { + return CHECK_NOTNULL(m_curr_allocator_); +} + +} // namespace tensorflow diff --git a/tensorflow/core/framework/allocator_registry.h b/tensorflow/core/framework/allocator_registry.h new file mode 100644 index 00000000000000..c419366ae1aa6f --- /dev/null +++ b/tensorflow/core/framework/allocator_registry.h @@ -0,0 +1,77 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Classes to maintain a static registry of memory allocators +#ifndef TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ +#define TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ + +#include +#include + +#include "tensorflow/core/framework/allocator.h" + +namespace tensorflow { + +// A global AllocatorRegistry is used to hold allocators for CPU backends +class AllocatorRegistry { + public: + // Add an allocator to the registry. + void Register(const string& name, int priority, Allocator* allocator); + + // Return allocator with highest priority + // If multiple allocators have the same high priority, return one of them + Allocator* GetAllocator(); + + // Returns the global registry of allocators. 
+ static AllocatorRegistry* Global(); + + private: + typedef struct { + string name; + int priority; + Allocator* allocator; // not owned + } AllocatorRegistryEntry; + + bool CheckForDuplicates(const string& name, int priority); + + std::vector allocators_; + Allocator* m_curr_allocator_; // not owned +}; + +namespace allocator_registration { + +class AllocatorRegistration { + public: + AllocatorRegistration(const string& name, int priority, + Allocator* allocator) { + AllocatorRegistry::Global()->Register(name, priority, allocator); + } +}; + +} // namespace allocator_registration + +#define REGISTER_MEM_ALLOCATOR(name, priority, allocator) \ + REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(__COUNTER__, name, priority, allocator) + +#define REGISTER_MEM_ALLOCATOR_UNIQ_HELPER(ctr, name, priority, allocator) \ + REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) + +#define REGISTER_MEM_ALLOCATOR_UNIQ(ctr, name, priority, allocator) \ + static allocator_registration::AllocatorRegistration \ + register_allocator_##ctr(name, priority, new allocator) + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_FRAMEWORK_ALLOCATOR_REGISTRY_H_ diff --git a/tensorflow/core/framework/type_index.h b/tensorflow/core/framework/type_index.h index dfde25c21e97b5..b978d90fa80013 100644 --- a/tensorflow/core/framework/type_index.h +++ b/tensorflow/core/framework/type_index.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_FRAMEWORK_TYPE_INDEX_H_ #include -#ifdef __GXX_RTTI +#if defined(__GXX_RTTI) || defined(_CPPRTTI) #include #include #endif // __GXX_RTTI @@ -30,7 +30,7 @@ namespace tensorflow { // binary sizes. The following #ifdef section provides a non-RTTI // replacement for std::type_index (with a minimal set of functions needed by // the TensorFlow framework, and more can be added if necessary). -#ifndef __GXX_RTTI +#if !defined(__GXX_RTTI) && !defined(_CPPRTTI) // A thin TypeIndex class that mimics std::type_index but does not use RTTI. 
As // a result, it does not provide the actual name of the type, and only returns a diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h index 589730baf10789..932d788f230bcc 100644 --- a/tensorflow/core/framework/types.h +++ b/tensorflow/core/framework/types.h @@ -68,9 +68,9 @@ class DeviceType { std::ostream& operator<<(std::ostream& os, const DeviceType& d); // Convenient constants that can be passed to a DeviceType constructor -extern const char* const DEVICE_CPU; // "CPU" -extern const char* const DEVICE_GPU; // "GPU" -extern const char* const DEVICE_SYCL; // "SYCL" +TF_EXPORT extern const char* const DEVICE_CPU; // "CPU" +TF_EXPORT extern const char* const DEVICE_GPU; // "GPU" +TF_EXPORT extern const char* const DEVICE_SYCL; // "SYCL" typedef gtl::InlinedVector MemoryTypeVector; typedef gtl::ArraySlice MemoryTypeSlice; diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc new file mode 100644 index 00000000000000..87850b3e9abd89 --- /dev/null +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -0,0 +1,548 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifdef INTEL_MKL + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +#include "tensorflow/core/graph/mkl_layout_pass.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { + +// This pass implements rewriting of graph for propagating Mkl +// layout as an additional output tensor (we will loosely call a +// tensor that carries Mkl layout as Mkl tensor henceforth.) +// from every Mkl supported NN layer. +// +// As an example, consider Relu layer. Current definition of Relu +// layer looks like: +// +// O = Relu(A) +// +// Relu has 1 input (A), and 1 output (O). +// +// This rewrite pass will generate a new graph node for Relu +// (new node is called MklRelu) as: +// +// O, O_m = MklRelu(A, A_m) +// +// MklRelu has 2 inputs (A and A_m) and 2 outputs (O and O_m). +// Here A input is same as A input of Relu; O output is same +// as O output of Relu. O_m is the additional output tensor +// that will be set by MklRelu, and it represents Mkl tensor +// corresponding to O -- in other words, O_m is some kind of +// metadata for O. A_m is additional input of MklRelu, and it +// represents metadata for A - as O_m is metadata for O, A_m +// is metadata for A. MklRelu receives this metadata from +// previous layer (in the graph). +// +// When previous layer in the graph is Mkl layer, A_m will +// represent a valid Mkl tensor.
But when previous layer +// is not an Mkl layer, then A_m represents a dummy Mkl tensor. +// +// Rewriting rules: +// - Selection of an op for rewriting happens by registering +// an op with this pass. If an op is not registered, then +// it is not rewritten. +// - Number of inputs after rewriting: +// Since for every input Tensorflow tensor, the rewritten +// layer gets Mkl tensor, rewritten op gets 2*N inputs, +// where N is the number of inputs for original op. +// - Number of outputs after rewriting: +// Since for every output Tensorflow tensor, the rewritten +// layer generates Mkl tensor, rewritten op generates 2*N +// outputs, where N is the number of outputs of original op. +// - Ordering of Tensorflow tensors and Mkl tensors: +// Since every op generates twice the number of inputs and +// outputs, one could imagine different ordering among +// Tensorflow tensors and Mkl tensors. E.g., let's assume +// an op 'Conv2D' takes (A, B) as input, then new op +// 'MklConv2D' can take (A, A_m, B, B_m) as input or it +// can also take (A, B, A_m, B_m) as input. Among N inputs +// one can get N! permutations. +// +// So the question is: which one do we follow? Currently, +// we follow an intuitive order where Mkl tensor follows a +// corresponding Tensorflow tensor immediately. In the +// context of above example, it will be: (A, A_m, B, B_m). +// We follow same ordering rule for output tensors. +// +// NOTE: Current rewriting approach rewrites an op to Mkl op without +// any conditions. But in the future, it may be possible to +// consider conditions such as input shapes and sizes to rewrite +// an op. +// +// Graph rewrite algorithm: +// Algorithm: Graph Rewrite +// Input: Graph G, Names of nodes to rewrite and their new nodes +// Output: Modified Graph G' if nodes are modified, G otherwise. +// Start: +// N = Topological_Sort(G) // N is set of nodes in toposort order.
+// foreach node n in N +// do +// if (Is_MKL_Layer(n)) // Can this layer accept Mkl layout as input. +// then +// E = set of <incoming edge and its src_output slot> of n +// E' = {} // new set of edges for rewritten node +// foreach <e,s> in E +// do +// E' U {<e,s>} // First copy edge which generates Tensorflow +// // tensor as it is +// m = Source node of edge e +// if Is_Rewritten(m) // Did we rewrite this node in this pass? +// then +// E' U {<m,s+1>} // If yes, then m will generate Mkl tensor +// // as output. +// else +// d = Generate_Dummy_Mkl_Tensor() // If not, generate dummy +// // Mkl tensor. +// E' U {<d,0>} // Dummy Mkl tensor has only 1 output slot. +// fi +// done +// n' = Build_New_Node(G,new_name,E') +// Mark_Rewritten(n') // Mark new node as being rewritten. +// fi +// done +// +// Explanation: +// For graph rewrite, we visit nodes of the graph in the topological +// sort order. With this ordering, we visit nodes in top-to-bottom +// fashion. We need this order because while visiting a node we want +// all of its input nodes (parents) visited (and rewritten if +// applicable). This is because if we need to rewrite a current node +// then all of its input nodes need to be fixed (in other words they +// cannot be removed later.) +// +// While visiting each node, we first check if it is Mkl layer. If +// it is, then we rewrite that node after constructing new inputs to +// the node. If it is not Mkl layer, then we do not rewrite the node. +// +class MklLayoutRewritePass : public GraphOptimizationPass { + public: + MklLayoutRewritePass() { + csinfo_.conv2d = "Conv2D"; + + ninfo_.push_back({csinfo_.conv2d, GetMklOpName(csinfo_.conv2d), + 2, CopyAttrsConv2D}); + } + + // Standard interface to run pass + Status Run(const GraphOptimizationPassOptions& options); + + // Helper function which does most of heavy lifting for rewriting + // Mkl nodes to propagate Mkl tensor as additional output + // + // Extracts common functionality between Run public interface and + // test interface.
+ // + // @return true, if and only if graph is mutated; false otherwise. + bool RunPass(std::unique_ptr* g); + + private: + /// Structure to specify name of original op, its new name after rewrite, + /// the number of inputs to the original op, and the function to be used + /// to copy attributes for the op + typedef struct { + string name; // Original name of the op in the graph + string newname; // New name of op in the graph + int numins; // Number of inputs to the original op + std::function copyattrs; // Function handler + // to copy attributes from old node to new node. + } NodesInfo; + + /// Structure to store all constant strings + struct { + string relu; + string relugrad; + string conv2d; + } csinfo_; + + /// Maintain info about nodes to rewrite + std::vector ninfo_; + + /// Hash table to maintain nodes visited in the graph. + std::unordered_set visited_nodes_; + + private: + // Predicate to check if we rewrote node 'n' + // + // If we rewrote the node, then the rewritten node will produce + // Mkl tensor as output. If we did not rewrite the node, then + // we need to insert dummy Mkl node on the input side. + // + // Returns true if node is rewritten, false otherwise. + inline bool IsRewrittenNode(Node* n) const { + return visited_nodes_.find(n) != visited_nodes_.end(); + } + + // Mark the node as rewritten + inline void MarkRewrittenNode(Node* n) { + visited_nodes_.insert(n); + } + + // Get the name of Mkl op from original TensorFlow op + // We prefix 'Mkl' to the original op to get Mkl op. + // TODO(nhasabni) We should move this to mkl_util.h. + inline string GetMklOpName(const string& name) const { + // Prefix that we add to Tensorflow op name to construct Mkl op name. + const char* const kMklOpPrefix = "Mkl"; + return string(kMklOpPrefix) + name; + } + + // Setup new inputs using old inputs 'inputs' for the rewritten node in 'nb' + // in graph 'g'. Original node is input in 'orign'. 
+ // + // For details, refer to 'Number of inputs after rewriting' section in the + // documentation above. + // + // Returns Status::OK() if setting up inputs is successful, otherwise + // returns appropriate status code. + Status SetUpInputs(std::unique_ptr* g, + const gtl::InlinedVector, 4>& inputs, + NodeBuilder* nb, Node* orign); + + // Rewrite Node 'n' in graph 'g' with rewrite information specified in 'ni' + // Returns Status::OK() if node rewrite is successful, otherwise returns + // appropriate error status + Status RewriteNode(std::unique_ptr* g, Node* n, const NodesInfo& ni); + + // Functions specific to operators to copy attributes + // We need operator-specific function to copy attributes because the framework + // does not provide any generic function for it. + static void CopyAttrsConv2D(Node* orign, NodeBuilder* nb); + + // Generate a graph node in graph 'g' representing a dummy Mkl tensor node, + // using node for original node 'orign' and return it in '*out'. + // TODO(nhasabni) We should move this to mkl_util.h + void GetDummyMklTensorNode(std::unique_ptr* g, Node** out, + Node* orign); +}; + + +// We register Mkl rewrite pass for phase 1 in pre-placement group. +// Do not change the ordering of the Mkl passes. 
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1, + MklLayoutRewritePass); + + +static void FillInputs(const Node* n, + gtl::InlinedVector* control_edges, + gtl::InlinedVector, 4>* in) { + DCHECK_EQ(in->size(), n->num_inputs()); + control_edges->clear(); + for (const Edge* e : n->in_edges()) { + if (e->IsControlEdge()) { + control_edges->push_back(e->src()); + } else { + (*in)[e->dst_input()] = std::make_pair(e->src(), e->src_output()); + } + } + std::sort(control_edges->begin(), control_edges->end()); + if (n->op_def().is_commutative()) { + // For commutative inputs, we sort the input by the input Node* + // to get a canonical ordering (so that add(a,b) and add(b, a) will + // hash to the same value if is_commutative is true for 'add'). + std::sort(in->begin(), in->end()); + } +} + +////////////////////////////////////////////////////////////////////////// + +// Macros to build new node with different number of inputs. +// We need this way because we need to specify all the inputs when +// building a node. Comment at core/graph/node_builder.h, line 85-86. + +#define SETUP_INPUTS1(nb, op1) do { \ + nb->Input(op1.node, op1.index); \ +}while(0) + +#define SETUP_INPUTS2(nb, op1, op2) do { \ + nb->Input(op1.node, op1.index); \ + nb->Input(op2.node, op2.index); \ +}while(0) + +#define SETUP_INPUTS3(nb, op1, op2, op3) do { \ + nb->Input(op1.node, op1.index); \ + nb->Input(op2.node, op2.index); \ + nb->Input(op3.node, op3.index); \ +}while(0) + +#define SETUP_INPUTS4(nb, op1, op2, op3, op4) do { \ + nb->Input(op1.node, op1.index); \ + nb->Input(op2.node, op2.index); \ + nb->Input(op3.node, op3.index); \ + nb->Input(op4.node, op4.index); \ +}while(0) + +#define SETUP_INPUTS5(nb, op1, op2, op3, op4, op5) do {\ + nb->Input(op1.node, op1.index); \ + nb->Input(op2.node, op2.index); \ + nb->Input(op3.node, op3.index); \ + nb->Input(op4.node, op4.index); \ + nb->Input(op5.node, op5.index); \ +}while(0) + +// TODO(nhasabni) We should move this to mkl_util.h. 
+void MklLayoutRewritePass::GetDummyMklTensorNode( + std::unique_ptr* g, Node** out, Node* orign) { + // We use a tensor of shape {8} and value 0,0,0,0,0,0,0,0 to represent + // dummy Mkl tensor. 8 = 2*size_t. + const DataType dt = DataTypeToEnum::v(); + TensorProto proto; + proto.set_dtype(dt); + uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + proto.set_tensor_content(const_cast( + static_cast(&zero)), 8); + TensorShape dummy_shape({8}); + dummy_shape.AsProto(proto.mutable_tensor_shape()); + TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") + .Attr("value", proto) + .Attr("dtype", dt) + .Device(orign->def().device()) // We place this node on same + // device as device of original + // node. + .Finalize(&**g, out)); +} + +Status MklLayoutRewritePass::SetUpInputs(std::unique_ptr* g, + const gtl::InlinedVector, 4>& inputs, + NodeBuilder* nb, Node* orign) { + std::vector new_inputs; + + // 1. Let's setup inputs for the new node. + for (int i = 0; i < inputs.size(); i++) { + Node* n = inputs[i].first; + // First let's copy original TF tensor input as it is. + new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second)); + + // Second, let's add edge to propagate Mkl tensors from input Mkl layers, + // or generate a dummy Mkl tensor representing not-mkl-tensor case. + if (IsRewrittenNode(n)) { + // If we have visited this node and rewritten it, then it will generate + // an edge that will receive Mkl tensor from a node. + // First, let's assert that this op is Mkl layer. + DataType T; + TF_CHECK_OK(GetNodeAttr(n->def(), "T", &T)); + // If this op has been rewritten, then its name must have been same as + // Mkl op. + CHECK_EQ(mkl_layer_registry::IsMklLayer(n->type_string()), true); + // src slot number for Mkl tensor would be the one next to TF tensor + // slot number. 
+ new_inputs.push_back(NodeBuilder::NodeOut(n, inputs[i].second+1)); + } else { + // If we have not visited the node and rewritten it, then we need + // to create a dummy node that will feed a non-Mkl tensor to this node. + // DummyMklTensor node has no input and generates only 1 output + // (dummy Mkl tensor) as output slot number 0. + Node* dmt = nullptr; + GetDummyMklTensorNode(g, &dmt, orign); + CHECK_NOTNULL(dmt); + new_inputs.push_back(NodeBuilder::NodeOut(dmt, 0)); + } + } + + // The total number of inputs to new node _must_ be 2 times the number + // of inputs to the original node: N original Tensorflow tensors and + // N for Mkl tensors corresponding to each Tensorflow tensors. + CHECK_EQ(new_inputs.size(), inputs.size() * 2); + + // 2. Let's build the node with new inputs. + switch (new_inputs.size()) { + case 0: // We don't need to do anything for no input as we have + // already built node. + break; + case 1: SETUP_INPUTS1(nb, new_inputs[0]); break; + case 2: SETUP_INPUTS2(nb, new_inputs[0], + new_inputs[1]); break; + case 3: SETUP_INPUTS3(nb, new_inputs[0], + new_inputs[1], + new_inputs[2]); break; + case 4: SETUP_INPUTS4(nb, new_inputs[0], + new_inputs[1], + new_inputs[2], + new_inputs[3]); break; + case 5: SETUP_INPUTS5(nb, new_inputs[0], + new_inputs[1], + new_inputs[2], + new_inputs[3], + new_inputs[4]); break; + default: { + return Status(error::Code::UNIMPLEMENTED, + "Could not create node with given number of inputs"); + } + } + + return Status::OK(); +} + +void MklLayoutRewritePass::CopyAttrsConv2D(Node* orign, NodeBuilder* nb) { + DataType T; + string data_format; + string padding; + std::vector strides; + bool use_cudnn_on_gpu; + + // Get all attributes from old node. 
+ TF_CHECK_OK(GetNodeAttr(orign->def(), "T", &T)); + TF_CHECK_OK(GetNodeAttr(orign->def(), "strides", &strides)); + TF_CHECK_OK(GetNodeAttr(orign->def(), "padding", &padding)); + TF_CHECK_OK(GetNodeAttr(orign->def(), "data_format", &data_format)); + TF_CHECK_OK(GetNodeAttr(orign->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu)); + + // Add attributes to new node. + nb->Attr("T", T); + nb->Attr("strides", strides); + nb->Attr("padding", padding); + nb->Attr("data_format", data_format); + nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu); +} + +Status MklLayoutRewritePass::RewriteNode( + std::unique_ptr* g, Node* orign, const NodesInfo& ni) { + VLOG(1) << "MKLLayoutRewritePass: Original node:" << orign->DebugString(); + + // Get all inputs. + const int num = orign->num_inputs(); + CHECK_EQ(num, ni.numins); + gtl::InlinedVector control_edges; + gtl::InlinedVector, 4> inputs(num); + FillInputs(orign, &control_edges, &inputs); + + // Build new node. We use same name as original node, but change the op name. + NodeBuilder nb(orign->name().c_str(), ni.newname.c_str()); + // Copy user-specified device assigned to original node to new node. + nb.Device(orign->def().device()); + // Set up new inputs to the rewritten node. + Status s = SetUpInputs(g, inputs, &nb, orign); + if (s != Status::OK()) { + return s; + } + // Copy attributes from original node to new node. + ni.copyattrs(orign, &nb); + // Set the Mkl layer label for this op. + nb.Attr("_kernel", mkl_layer_registry::kMklLayerLabel); + Node* newn = nullptr; + + // Finalize graph and get new node. + TF_CHECK_OK(nb.Finalize(&**g, &newn)); + CHECK_NOTNULL(newn); + + // Incoming edges of 'orign' were already set up on 'newn' by SetUpInputs + // above. Copy outgoing edges from 'orign' node to new 'newn' node. + for (const Edge* e : orign->out_edges()) { + (*g)->AddEdge(newn, e->src_output(), e->dst(), e->dst_input()); + } + + // Copy the runtime device assigned from original node to new node.
+ newn->set_assigned_device_name(orign->assigned_device_name()); + + // Delete original node and mark new node as rewritten. + (*g)->RemoveNode(orign); + MarkRewrittenNode(newn); + + VLOG(1) << "MKLLayoutRewritePass: New node:" << newn->DebugString(); + return Status::OK(); +} + +bool MklLayoutRewritePass::RunPass( + std::unique_ptr* g) { + bool result = false; + CHECK_NOTNULL(g); + + DumpGraph("Before running MklLayoutRewritePass", &**g); + + std::vector order; + GetReversePostOrder(**g, &order); // This gives us a topological ordering. + + for (Node* n : order) { + if (!n->IsOp()) { + continue; + } + + for (const NodesInfo& ni : ninfo_) { + DataType dtype = DT_INVALID; + // An op must have a data type (T) attribute, its corresponding Mkl op + // name must be registered, and its type must match this NodesInfo entry. + if (GetNodeAttr(n->def(), "T", &dtype) == Status::OK() && + mkl_layer_registry::IsMklLayer(GetMklOpName(n->type_string())) && + n->type_string().compare(ni.name) == 0) { + string node_name = n->name(); + string op_name = n->type_string(); + + VLOG(1) << "MKLLayoutRewritePass: Scheduled node " << node_name + << " with op " << op_name << " for rewrite using" + << " layout optimization."; + + if (RewriteNode(g, n, ni) == Status::OK()) { + VLOG(1) << "MKLLayoutRewritePass: Successfully rewrote node " + << node_name << " with op " << op_name + << " for Mkl layout optimization."; + result = true; + break; // 'n' was replaced and removed by RewriteNode; stop searching.
+ } + } + } + } + + DumpGraph("After running MklLayoutRewritePass", &**g); + + return result; +} + +/////////////////////////////////////////////////////////////////////////////// +// Run function for the pass +/////////////////////////////////////////////////////////////////////////////// + +bool RunMklLayoutRewritePass(std::unique_ptr* g) { + return MklLayoutRewritePass().RunPass(g); +} + +Status MklLayoutRewritePass::Run(const GraphOptimizationPassOptions& options) { + if (options.graph == nullptr) { + return Status::OK(); + } + + // 'g' aliases options.graph; std::move on a raw pointer only copies it. + std::unique_ptr* g = std::move(options.graph); + + RunPass(g); + + // Re-seat the graph released through 'g' in options.graph (same object). + options.graph->reset(g->release()); + + return Status::OK(); +} + +} // namespace tensorflow + +#endif diff --git a/tensorflow/core/graph/mkl_layout_pass.h b/tensorflow/core/graph/mkl_layout_pass.h new file mode 100644 index 00000000000000..ffe5c1ecfcdef0 --- /dev/null +++ b/tensorflow/core/graph/mkl_layout_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +// A graph pass that rewrites graph for propagating MKL layout as a tensor + +#ifndef TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_ +#define TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_ + +#ifdef INTEL_MKL + +#include +#include +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// Interface to invoke the pass for unit test +// +// Returns true if and only if 'g' is mutated. +extern bool RunMklLayoutRewritePass(std::unique_ptr* g); +} // namespace tensorflow + +#endif + +#endif // TENSORFLOW_GRAPH_MKL_LAYOUT_PASS_H_ diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc new file mode 100644 index 00000000000000..10671ee2e9612d --- /dev/null +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -0,0 +1,199 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifdef INTEL_MKL + +#include "tensorflow/core/graph/mkl_layout_pass.h" +#include "tensorflow/core/util/mkl_util.h" + +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +static void InitGraph(const string& s, Graph* graph) { + GraphDef graph_def; + + auto parser = protobuf::TextFormat::Parser(); + // parser.AllowRelaxedWhitespace(true); + CHECK(parser.MergeFromString(s, &graph_def)) << s; + GraphConstructorOptions opts; + TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph)); +} + +class MklLayoutPassTest : public ::testing::Test { + public: + MklLayoutPassTest() : graph_(OpRegistry::Global()) {} + + void InitGraph(const string& s) { + ::tensorflow::InitGraph(s, &graph_); + original_ = CanonicalGraphString(&graph_); + } + + static bool IncludeNode(const Node* n) { return n->IsOp(); } + + static string EdgeId(const Node* n, int index) { + if (index == 0) { + return n->name(); + } else if (index == Graph::kControlSlot) { + return strings::StrCat(n->name(), ":control"); + } else { + return strings::StrCat(n->name(), ":", index); + } + } + + string CanonicalGraphString(Graph* g) { + std::vector nodes; + std::vector edges; + for (const Node* n : g->nodes()) { + if (IncludeNode(n)) { + nodes.push_back(strings::StrCat(n->name(), "(", n->type_string(), ")")); + } +
} + for (const Edge* e : g->edges()) { + if (IncludeNode(e->src()) && IncludeNode(e->dst())) { + edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->", + EdgeId(e->dst(), e->dst_input()))); + } + } + // Canonicalize: sort so the string does not depend on iteration order. + std::sort(nodes.begin(), nodes.end()); + std::sort(edges.begin(), edges.end()); + return strings::StrCat(str_util::Join(nodes, ";"), "|", + str_util::Join(edges, ";")); + } + + string DoMklLayoutOptimizationPass() { + string before = CanonicalGraphString(&graph_); + LOG(ERROR) << "Before MKL layout rewrite pass: " << before; + + std::unique_ptr* ug = new std::unique_ptr(&graph_); + RunMklLayoutRewritePass(ug); + + string result = CanonicalGraphString(&graph_); + LOG(ERROR) << "After MKL layout rewrite pass: " << result; + return result; + } + + const string& OriginalGraph() const { return original_; } + + Graph graph_; + string original_; +}; + +REGISTER_OP("Input").Output("o: float").SetIsStateful(); + +// Single Conv2D Op; No Mkl layer on the input and on the output. +// We will generate dummy Mkl tensors as the 2nd and 4th inputs of MklConv2D. +TEST_F(MklLayoutPassTest, Conv2D_Basic) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'C' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + " input: ['B', 'C'] }"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Input);C(MklConv2D);D(Mul);DMT/_0(Const);DMT/_1(Const)|" + "A->C;B->C:2;B->D;C->D:1;DMT/_0->C:1;DMT/_1->C:3"); +} + +// 2 Conv2D Ops in sequence. Both should get transformed and 1st Conv2D will +// have 2 outputs, both of which will be inputs to next Conv2D.
+TEST_F(MklLayoutPassTest, Conv2D_Positive1) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'C' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'C']}" + "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'D'] }"); + EXPECT_EQ(DoMklLayoutOptimizationPass(), + "A(Input);B(Input);C(MklConv2D);D(MklConv2D);DMT/_0(Const);" + "DMT/_1(Const);DMT/_2(Const);E(Mul)|A->C;A->D;B->C:2;C->D:2;C->E;" + "C:1->D:3;D->E:1;DMT/_0->C:1;DMT/_1->C:3;DMT/_2->D:1"); +} + +static void BM_MklLayoutRewritePass(int iters, int op_nodes) { + testing::StopTiming(); + string s; + for (int in = 0; in < 10; in++) { + s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in); + } + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + for (int op = 0; op < op_nodes; op++) { + s += strings::Printf( + "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { " + "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }", + op, rnd.Uniform(10), rnd.Uniform(10)); + } + + bool first = true; + while (iters > 0) { + Graph* graph = new Graph(OpRegistry::Global()); + InitGraph(s, graph); + int N = graph->num_node_ids(); + if (first) { + testing::SetLabel(strings::StrCat("Per graph node.
Nodes: ", N)); + first = false; + } + { + testing::StartTiming(); + std::unique_ptr ug(graph); + RunMklLayoutRewritePass(&ug); + testing::StopTiming(); + } + iters -= N; // Our benchmark units are individual graph nodes, + // not whole graphs + // 'graph' is owned by 'ug' above and freed when 'ug' leaves its scope. + } +} +BENCHMARK(BM_MklLayoutRewritePass)->Arg(1000)->Arg(10000); + +} // namespace +} // namespace tensorflow + +#endif /* INTEL_MKL */ diff --git a/tensorflow/core/graph/mkl_optimizer_merge.cc b/tensorflow/core/graph/mkl_optimizer_merge.cc index 98fc268d284a90..bc5915eda2bc90 100644 --- a/tensorflow/core/graph/mkl_optimizer_merge.cc +++ b/tensorflow/core/graph/mkl_optimizer_merge.cc @@ -22,6 +22,8 @@ limitations under the License. #include #include #include +#include +#include #include "tensorflow/core/graph/mkl_optimizer_merge.h" @@ -33,6 +35,8 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" namespace tensorflow { @@ -58,8 +62,8 @@ static size_t kNodeMergeContextMaxDepth = 10; class NodeMergeRewritePass : public GraphOptimizationPass { public: NodeMergeRewritePass() { - csinfo_.conv2d = "Conv2D"; - csinfo_.conv2dwithbias = "Conv2DWithBias"; + csinfo_.conv2d = "MklConv2D"; + csinfo_.conv2dwithbias = "MklConv2DWithBias"; csinfo_.conv2dwithbiasbackpropbias = "Conv2DWithBiasBackpropBias"; csinfo_.biasadd = "BiasAdd"; csinfo_.matmul = "MatMul"; @@ -72,6 +76,9 @@ class NodeMergeRewritePass : public GraphOptimizationPass { // maxhops in backward data-flow graph. Since input of forward nodes // (Conv2D) directly goes to backward nodes, we do not expect the // hop-distance would be more than few nodes. + // TODO(nhasabni) Temporarily disabling rewrite of BiasAddGrad. + // Will enable it once we support Conv2DWithBiasBackpropBias op.
+#if 0 rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias, {csinfo_.conv2dwithbias, kNodeMergeContextMaxDepth}}); rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.conv2dwithbiasbackpropbias, @@ -80,6 +87,7 @@ class NodeMergeRewritePass : public GraphOptimizationPass { // because we do not have a separate Op for MatMulwithBias. rinfo_.push_back({csinfo_.biasaddgrad, csinfo_.biasaddgrad, {csinfo_.matmul, kNodeMergeContextMaxDepth}}); +#endif } // Standard interface to run optimization pass @@ -182,10 +190,16 @@ class NodeMergeRewritePass : public GraphOptimizationPass { // @return Matching rewriteinfo in case a match is found; null otherwise. const RewriteInfo* FindMatchingRewriteInfo(const Node* n, const Node** fwdn) const; + + // Generate a graph node in graph 'g' representing a dummy Mkl tensor node, + // and return it in '*out'. + // TODO(nhasabni) We should move this to mkl_util.h + void GetDummyMklTensorNode(std::unique_ptr* g, Node** out); }; -/// We register merge optimizer for phase 1 and MKLToTF insertion for phase 2. -REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 1, +// We register merge optimizer for phase 2 in pre-placement group. +// Do not change the ordering of the Mkl passes. 
+REGISTER_OPTIMIZATION(OptimizationPassRegistry::PRE_PLACEMENT, 2, NodeMergeRewritePass); static void FillInputs(const Node* n, @@ -219,8 +233,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const { } } - VLOG(1) << "FindNodeForMerge: " << a->type_string(); - for (const MergeInfo* mi : matching_mi) { const int N_in = a->num_inputs(); if (mi->op >= N_in) { @@ -240,8 +252,6 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const { continue; } - VLOG(1) << " FindNode: " << b->type_string(); - gtl::InlinedVector b_control_edges; gtl::InlinedVector, 4> b_in(N_in); FillInputs(b, &b_control_edges, &b_in); @@ -258,6 +268,22 @@ Node* NodeMergeRewritePass::FindNodeForMerge(const Node* a) const { return nullptr; } +void NodeMergeRewritePass::GetDummyMklTensorNode( + std::unique_ptr* g, Node** out) { + const DataType dt = DataTypeToEnum::v(); + TensorProto proto; + proto.set_dtype(dt); + uint8 zero[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + proto.set_tensor_content(const_cast( + static_cast(&zero)), 8); + TensorShape dummy_shape({8}); + dummy_shape.AsProto(proto.mutable_tensor_shape()); + TF_CHECK_OK(NodeBuilder((*g)->NewName("DMT"), "Const") + .Attr("value", proto) + .Attr("dtype", dt) + .Finalize(&**g, out)); +} + Status NodeMergeRewritePass::MergeNode(std::unique_ptr* g, Node* succ, Node* pred) { CHECK_NOTNULL(succ); @@ -271,7 +297,6 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr* g, std::vector strides; string data_format_pred, data_format_succ; bool use_cudnn_on_gnu; - int groups = 1; TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred)); TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ)); TF_CHECK_OK(GetNodeAttr(pred->def(), "padding", &padding)); @@ -280,25 +305,28 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr* g, TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ)); TF_CHECK_OK(GetNodeAttr(pred->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu)); - // Groups attribute may not be there on the input node. 
So we do not - // check for error in GetNodeAttr call. - GetNodeAttr(pred->def(), "groups", &groups); // We check to ensure that data formats of both succ and pred are same. // We expect them to be same, so we can enforce this as assert. // But assert can be too strict, so we enforce this as a check. // If the check fails, then we do not merge two nodes. + // We also do same check for devices. if (data_format_pred != data_format_succ || - T_pred != T_succ) { + T_pred != T_succ || + pred->assigned_device_name() != succ->assigned_device_name() || + pred->def().device() != succ->def().device()) { return Status(error::Code::INVALID_ARGUMENT, - "data_format or T attribute of Conv2D and BiasAdd" - "do not match. Will skip node merge optimization"); + "data_format or T attribute or devices of Conv2D and " + "BiasAdd do not match. Will skip node merge optimization"); } // 2. Get inputs from both the nodes. // Find the 2 inputs from the conv and the bias from the add Bias. Node* oper1 = nullptr; + Node* oper1_mkl = nullptr; // Mkl tensor corresponding to oper1 Node* oper2 = nullptr; + Node* oper2_mkl = nullptr; // Mkl tensor corresponding to oper2 Node* oper3 = nullptr; + Node* oper3_mkl = nullptr; // Mkl tensor corresponding to oper3 const int succ_num = succ->num_inputs(); gtl::InlinedVector succ_control_edges; @@ -326,24 +354,35 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr* g, } } - // Get operand 0, 1 of conv2D - oper1 = pred_in[0].first; - oper2 = pred_in[1].first; + // Get operand 0, 1 of conv2D and their Mkl tensors. + CHECK_EQ(pred->in_edges().size(), 4); // MklConv2D must have 4 inputs. 
+ oper1 = pred_in[0].first; + oper1_mkl = pred_in[1].first; + oper2 = pred_in[2].first; + oper2_mkl = pred_in[3].first; // Get operand 1 of add_bias - oper3 = succ_in[1].first; + // BiasAdd must have 2 inputs: Conv, bias + CHECK_EQ(succ->in_edges().size(), 2); + oper3 = succ_in[1].first; + GetDummyMklTensorNode(g, &oper3_mkl); // Get dummy Mkl tensor node + // as BiasAdd does not have Mkl tensor as input. + CHECK_NOTNULL(oper3_mkl); Node* ret; // We will use the node name of BiasAdd as the name of new node TF_CHECK_OK(NodeBuilder(succ->name(), csinfo_.conv2dwithbias) .Input(oper1) + .Input(oper1_mkl) .Input(oper2) + .Input(oper2_mkl) .Input(oper3) + .Input(oper3_mkl) .Attr("T", T_pred) .Attr("strides", strides) .Attr("padding", padding) .Attr("data_format", data_format_pred) .Attr("use_cudnn_on_gpu", use_cudnn_on_gnu) - .Attr("groups", groups) + .Device(succ->def().device()) .Finalize(&**g, &ret)); CHECK_NOTNULL(ret); @@ -352,6 +391,15 @@ Status NodeMergeRewritePass::MergeNode(std::unique_ptr* g, (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input()); } + // Copy device assigned to old node to new node. + // It's ok to use pred or succ as we have enforced a check that + // both have same device assigned. + ret->set_assigned_device_name(pred->assigned_device_name()); + + VLOG(1) << "NodeMergeRewritePass: Merged old node:" << pred->DebugString() + << ", and node: " << succ->DebugString() << ", into node:" + << ret->DebugString(); + (*g)->RemoveNode(succ); (*g)->RemoveNode(pred); @@ -369,13 +417,14 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr* g, Node *n) { const Node* fwdn = nullptr; const RewriteInfo* ri = FindMatchingRewriteInfo(n, &fwdn); if (ri == nullptr || fwdn == nullptr) { - VLOG(1) << "Rewriteinfo not found for: " << n->type_string(); + VLOG(2) << "NodeMergeRewritePass: Rewriteinfo not found for: " + << n->type_string(); return Status(error::Code::INVALID_ARGUMENT, "Rewrite info not found for the node." 
"Will skip node rewrite optimization"); } - VLOG(1) << "Rewrite called for: " << n->type_string(); + VLOG(1) << "NodeMergeRewritePass: Rewrite called for: " << n->type_string(); if (n->type_string() == csinfo_.biasaddgrad && ri->node == csinfo_.biasaddgrad && @@ -407,6 +456,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr* g, Node *n) { .Attr("T", T) .Attr("data_format", data_format) .Attr("strides", strides) + .Device(n->def().device()) .Finalize(&**g, &ret)); } else { CHECK_EQ(ri->rewrite, csinfo_.biasaddgrad); @@ -414,6 +464,7 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr* g, Node *n) { .Input(op) .Attr("T", T) .Attr("data_format", data_format) + .Device(n->def().device()) .Finalize(&**g, &ret)); } @@ -424,7 +475,11 @@ Status NodeMergeRewritePass::RewriteNode(std::unique_ptr* g, Node *n) { (*g)->AddEdge(ret, e->src_output(), e->dst(), e->dst_input()); } - VLOG(1) << "Rewrite node: " << n->type_string() << " successful"; + // Copy device assigned to old node to new node. + ret->set_assigned_device_name(n->assigned_device_name()); + + VLOG(1) << "MKLOptimizerMergePass: Rewrote old node:" << n->DebugString() + << ", into node:" << ret->DebugString(); (*g)->RemoveNode(n); return Status::OK(); @@ -450,7 +505,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n, } } - VLOG(1) << "Searching graph for: " << n->type_string() << " in backwards."; + VLOG(1) << "NodeMergeRewritePass: Searching graph for: " + << n->type_string() << " in backwards."; // Now we will check for forward op name for rewrite info in data // flow graph. 
Get the max hops we should search for the fwd node @@ -473,7 +529,8 @@ NodeMergeRewritePass::FindMatchingRewriteInfo(const Node* n, curr_depth = curr_pair.second; CHECK_NOTNULL(curr_node); - VLOG(1) << "Visiting node: " << curr_node->type_string() + VLOG(1) << "NodeMergeRewritePass: Visiting node: " + << curr_node->type_string() << " at depth: " << curr_depth << " for node: " << n->type_string(); @@ -528,17 +585,16 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr* g) { std::vector> nodes_to_be_merged; std::vector nodes_to_be_rewritten; - VLOG(1) << "Running NodeMerge Optimization"; - for (Node* n : order) { if (!n->IsOp()) continue; Node* n1 = nullptr; if ((n1 = FindNodeForMerge(n)) != nullptr) { - VLOG(1) << "Scheduled nodes " << n->name() << " and " - << n1->name() << " for merging"; + VLOG(1) << "NodeMergeRewritePass: Scheduled nodes " + << n->name() << " and " << n1->name() << " for merging"; nodes_to_be_merged.push_back(std::make_pair(n, n1)); } else if (IsApplicableRewriteNode(n)) { - VLOG(1) << "Scheduled node " << n->name() << " for rewrite"; + VLOG(1) << "NodeMergeRewritePass: Scheduled node " << n->name() + << " for rewrite"; nodes_to_be_rewritten.push_back(n); } } @@ -549,7 +605,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr* g) { string n1_name = i.first->name(); string n2_name = i.second->name(); if (MergeNode(g, i.first, i.second) == Status::OK()) { - VLOG(1) << "Merged nodes " << n1_name << " and " << n2_name; + VLOG(1) << "NodeMergeRewritePass: Merged nodes " << n1_name + << " and " << n2_name; result = true; } } @@ -559,7 +616,8 @@ bool NodeMergeRewritePass::RunPass(std::unique_ptr* g) { for (Node* i : nodes_to_be_rewritten) { string name = i->name(); if (RewriteNode(g, i) == Status::OK()) { - VLOG(1) << "Rewrite node: " << name << " successful."; + VLOG(1) << "NodeMergeRewritePass: Rewrite node: " + << name << " successful."; result = true; } } @@ -574,8 +632,6 @@ bool OptimizeNodeMerge(std::unique_ptr* g) { } Status 
NodeMergeRewritePass::Run(const GraphOptimizationPassOptions& options) { - // Currently checking only for two cases - Conv2D+Bias and Matmul+Bias. - // It is possible to extend it to other operators in future. if (options.graph == nullptr) { return Status::OK(); } diff --git a/tensorflow/core/graph/mkl_optimizer_merge.h b/tensorflow/core/graph/mkl_optimizer_merge.h index 554709e9dd6902..b2caec58aff311 100644 --- a/tensorflow/core/graph/mkl_optimizer_merge.h +++ b/tensorflow/core/graph/mkl_optimizer_merge.h @@ -21,20 +21,14 @@ limitations under the License. #ifdef INTEL_MKL #include -#include -#include #include #include "tensorflow/core/graph/graph.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/common_runtime/optimization_registry.h" namespace tensorflow { - // Interface to invoke the pass for unit test // // Returns true if and only if 'g' is mutated. extern bool OptimizeNodeMerge(std::unique_ptr* g); - } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/graph/mkl_optimizer_merge_test.cc b/tensorflow/core/graph/mkl_optimizer_merge_test.cc index da3b01955cc39b..5aae61ad197674 100644 --- a/tensorflow/core/graph/mkl_optimizer_merge_test.cc +++ b/tensorflow/core/graph/mkl_optimizer_merge_test.cc @@ -105,6 +105,7 @@ class OptimizerMergeTest : public ::testing::Test { }; REGISTER_OP("Input").Output("o: float").SetIsStateful(); +REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful(); TEST_F(OptimizerMergeTest, Basic) { InitGraph( @@ -121,8 +122,38 @@ TEST_F(OptimizerMergeTest, Basic) { // Test set 1: Conv2D + AddBias -// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y) +// C=MklConv2D(A,M,B,N); E=BiasAdd(C,D); Z=Sub(E,Y) TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { 
key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'M', 'B', 'N']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'BiasAdd'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Sub'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ(DoNodeMerge(), + "A(Input);B(Input);D(Input);DMT/_0(Const);E(MklConv2DWithBias);" + "M(MklInput);N(MklInput);Y(Input);Z(Sub)|A->E;B->E:2;D->E:4;" + "DMT/_0->E:5;E->Z;M->E:1;N->E:3;Y->Z:1"); +} + +// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y); +// We do not merge in this case as op is Conv2D and not MklConv2D. +TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoMklConv2D) { InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Input'}" @@ -143,63 +174,69 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Positive) { " attr {key: 'T' value { type: DT_FLOAT } }" " input: ['E', 'Y']}"); EXPECT_EQ(DoNodeMerge(), - "A(Input);B(Input);D(Input);E(Conv2DWithBias);Y(Input);Z(Sub)|" - "A->E;B->E:1;D->E:2;E->Z;Y->Z:1"); + "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|" + "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1"); } -// Graph contains only Conv2D, no AddBias. +// Graph contains only MklConv2D, no AddBias. 
TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_NoAddBias) { InitGraph( "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Conv2D'" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'use_cudnn_on_gpu' value { b: false } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" - " input: ['A', 'B']}"); + " input: ['A', 'M', 'B', 'N']}"); EXPECT_EQ(DoNodeMerge(), - "A(Input);B(Input);C(Conv2D)|" - "A->C;B->C:1"); + "A(Input);B(Input);C(MklConv2D);M(MklInput);N(MklInput)|" + "A->C;B->C:2;M->C:1;N->C:3"); } -// Conv2D output does not go to BiasAdd. +// MklConv2D output does not go to BiasAdd. TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow1) { InitGraph( "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Conv2D'" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'use_cudnn_on_gpu' value { b: false } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" - " input: ['A', 'B']}" + " input: ['A', 'M', 'B', 'N']}" "node { name: 'D' op: 'Input'}" "node { name: 'E' op: 'Input'}" "node { name: 'F' op: 'BiasAdd'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" - " input: ['D', 'E'] }"); // Output of Conv2D does not go to BiasAdd. + " input: ['D', 'E'] }"); // Output of MklConv2D does not go to BiasAdd. 
EXPECT_EQ(DoNodeMerge(), - "A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd)|" - "A->C;B->C:1;D->F;E->F:1"); + "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);" + "M(MklInput);N(MklInput)|A->C;B->C:2;D->F;E->F:1;M->C:1;N->C:3"); } -// Conv2D has two outgoing edges: BiasAdd and some other dummy node (Add). +// MklConv2D has two outgoing edges: BiasAdd and some other dummy node (Add). // Merge should not be done in such case. TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) { InitGraph( "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Conv2D'" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'use_cudnn_on_gpu' value { b: false } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" - " input: ['A', 'B']}" + " input: ['A', 'M', 'B', 'N']}" "node { name: 'D' op: 'Input'}" "node { name: 'E' op: 'Input'}" "node { name: 'F' op: 'BiasAdd'" @@ -211,8 +248,9 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) { " attr { key: 'T' value { type: DT_FLOAT } }" " input: ['C', 'E'] }"); EXPECT_EQ(DoNodeMerge(), - "A(Input);B(Input);C(Conv2D);D(Input);E(Input);F(BiasAdd);G(Add)|" - "A->C;B->C:1;C->G;D->F;E->F:1;E->G:1"); + "A(Input);B(Input);C(MklConv2D);D(Input);E(Input);F(BiasAdd);" + "G(Add);M(MklInput);N(MklInput)|A->C;B->C:2;C->G;D->F;" + "E->F:1;E->G:1;M->C:1;N->C:3"); } // data_format attribute value mismatch. 
Merge should not be done @@ -220,28 +258,63 @@ TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_Dataflow2) { TEST_F(OptimizerMergeTest, Conv2DWithBias_Negative_AttrMismatch) { InitGraph( "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" "node { name: 'B' op: 'Input'}" - "node { name: 'C' op: 'Conv2D'" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NCHW' } }" " attr { key: 'use_cudnn_on_gpu' value { b: false } }" " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" " attr { key: 'padding' value { s: 'SAME' } }" - " input: ['A', 'B']}" + " input: ['A', 'M', 'B', 'N']}" "node { name: 'D' op: 'Input'}" "node { name: 'E' op: 'BiasAdd'" " attr { key: 'T' value { type: DT_FLOAT } }" " attr { key: 'data_format' value { s: 'NHCW' } }" " input: ['C', 'D'] }"); EXPECT_EQ(DoNodeMerge(), - "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd)|" - "A->C;B->C:1;C->E;D->E:1"); + "A(Input);B(Input);C(MklConv2D);D(Input);E(BiasAdd);M(MklInput);" + "N(MklInput)|A->C;B->C:2;C->E;D->E:1;M->C:1;N->C:3"); } -// Test set 2: Conv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias rewrite tests +#if 0 +// This test set is disabled temporarily as we do not enable node rewrite. +// This test set will be enabled when we support Mkl-specific kernels for +// backward bias. 
+// +// Test set 2: MklConv2D..BiasAddGrad -> Conv2DWithBiasBackpropBias +// rewrite tests -// C=Conv2D(A,B); D=Sub(C,A); F=BiasAddGrad(D) +// C=MklConv2D(A,M,B,N); D=Sub(C,A); E=BiasAddGrad(D) TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'M', 'B', 'N']}" + "node { name: 'D' op: 'Sub'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'A']}" + "node { name: 'E' op: 'BiasAddGrad'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " input: ['D'] }"); + EXPECT_EQ(DoNodeMerge(), + "A(Input);B(Input);C(MklConv2D);D(Sub);E(Conv2DWithBiasBackpropBias);" + "M(MklInput);N(MklInput)|A->C;A->D:1;B->C:2;C->D;D->E;M->C:1;N->C:3"); +} + +// No MklConv2D in context, but Conv2D in context. No rewrite should happen. +// C=Conv2D(A,B); D=Sub(C,A); E=BiasAddGrad(D) +TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoMklConv2D) { InitGraph( "node { name: 'A' op: 'Input'}" "node { name: 'B' op: 'Input'}" @@ -260,12 +333,12 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Positive) { " attr { key: 'data_format' value { s: 'NCHW' } }" " input: ['D'] }"); EXPECT_EQ(DoNodeMerge(), - "A(Input);B(Input);C(Conv2D);D(Sub);E(Conv2DWithBiasBackpropBias)|" + "A(Input);B(Input);C(Conv2D);D(Sub);E(BiasAddGrad)|" "A->C;A->D:1;B->C:1;C->D;D->E"); } // No Conv2D in the context for BiasAddGrad. No rewrite should happen. 
-// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E) +// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D) TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -287,7 +360,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D) { // No Conv2D in the context for BiasAddGrad, but MatMul in context. // Rewrite should happen, but name of BiasAddGrad does not change. -// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E) +// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D) TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -310,7 +383,7 @@ TEST_F(OptimizerMergeTest, Conv2DBackprop_Negative_NoConv2D_MatMul) { } // Test set 3: MatMul..BiasAddGrad -> BiasAddGrad rewrite tests -// C=MatMul(A,B); D=Sub(C,A); F=BiasAddGrad(D,E) +// C=MatMul(A,B); D=Sub(C,A); E=BiasAddGrad(D) TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -333,7 +406,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Positive) { } // No MatMul in the context for BiasAddGrad. No rewrite should happen. -// C=Add(A,B); D=Sub(C,A); F=BiasAddGrad(D,E) +// C=Add(A,B); D=Sub(C,A); E=BiasAddGrad(D) TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) { InitGraph( "node { name: 'A' op: 'Input'}" @@ -352,7 +425,7 @@ TEST_F(OptimizerMergeTest, MatMulBiasAddGrad_Negative_NoMatMul) { "A(Input);B(Input);C(Add);D(Sub);E(BiasAddGrad)|" "A->C;A->D:1;B->C:1;C->D;D->E"); } - +#endif static void BM_NodeMerge(int iters, int op_nodes) { testing::StopTiming(); diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.cc b/tensorflow/core/graph/mkl_tfconversion_pass.cc new file mode 100644 index 00000000000000..1e7b5e70942090 --- /dev/null +++ b/tensorflow/core/graph/mkl_tfconversion_pass.cc @@ -0,0 +1,271 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/graph/algorithm.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/lib/gtl/map_util.h" +#include "tensorflow/core/lib/hash/hash.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/common_runtime/optimization_registry.h" + +#include "tensorflow/core/graph/mkl_tfconversion_pass.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { + +// This pass inserts Mkl to Tf tensor conversion nodes (represented by C) +// in the graph in between A and B, where A and B match any one +// of the following +// cases: +// 1) A = layer/Op that generates output in Mkl format and, +// B = layer/Op that does not accept input in Mkl format and, +// A -> B (there is a direct edge between A and B), then +// We will insert C such that A->C->B. +// +// 2) A = layer/Op that generates output in Mkl format and, +// B = NULL (in other words, A is the last layer in the graph), then +// We will insert C such that A->C. (C will be the last layer.) +// +// Note that case 1 applies to all outputs of A that are input to B.
+// In other words, the conversions will be required for every output +// of A that is input to B. For example, let us say the output of A +// is A1, A2, A3, of which A1 and A2 are in Mkl format, but A3 is not +// in Mkl format, and all of them are input to B. In such case, we will +// do the conversion for A1 and A2 only. We do not need to do any conversion +// for A3. +// +// This pass relies on layers registering their Mkl compliance. +// A Mkl-compliant layer can accept inputs in Mkl format and produce output in +// Mkl format. A non-compliant layer takes its inputs and produces outputs in +// TensorFlow format. +// +class MklToTfConversionPass : public GraphOptimizationPass { + public: + MklToTfConversionPass() {} + Status Run(const GraphOptimizationPassOptions& options); + + // Insert layout conversion nodes in the graph pointed to by g. + // The function scans the graph for candidate edges where we + // need to insert conversion nodes. + // + // @return true if at least one conversion node is inserted; + // false, otherwise. + bool RunPass(std::unique_ptr* g); + + + private: + // Is the input Op supported by Mkl-specific layout? + // + // @input op_name string of the op + // @return true if op is Mkl supported; false, otherwise. + inline bool IsMklSupportedOp(const string& op_name) const { + return mkl_layer_registry::IsMklLayer(op_name); + } + + // Insert layout conversion node on the edge pointed to by 'e' in graph 'g'. + // + // The edge is removed from the graph once a call to this function succeeds. + // Any attempt to use the edge after this call + // will lead to undefined behavior. + // + // @return Status::OK() if insertion is successful, otherwise returns + // appropriate error status code. + Status InsertConversionNodeOnEdge(std::unique_ptr* g, Edge*); +}; + +// We register MklToTf insertion for phase 1 in post-partition grouping. +// We register this pass after partitioning so that we get a complete +// picture of inputs and outputs of the nodes in the graphs.
+const OptimizationPassRegistry::Grouping kMklTfConvPassGroup = + OptimizationPassRegistry::POST_PARTITIONING; +REGISTER_OPTIMIZATION(kMklTfConvPassGroup, 1, MklToTfConversionPass); + +Status MklToTfConversionPass::InsertConversionNodeOnEdge( + std::unique_ptr* g, Edge *e) { + CHECK_NOTNULL(e); + + Node* src = e->src(); + Node* dst = e->dst(); + + CHECK_NOTNULL(src); + CHECK_NOTNULL(dst); + + Node* conversion_node = nullptr; + DataType src_datatype = DT_INVALID; + DataType dst_datatype = DT_INVALID; + string data_format; + + TF_CHECK_OK(GetNodeAttr(src->def(), "T", &src_datatype)); + // The destination is a non-Mkl op and may not carry a "T" attribute at all, + // so a failed lookup is not fatal; types are compared only when both exist. + bool dst_ok = GetNodeAttr(dst->def(), "T", &dst_datatype) == Status::OK(); + if (dst_ok && src_datatype != dst_datatype) { + string err_msg = "T attribute of " + src->name() + " and " + + dst->name() + " do not match. Will not insert" + + " MklToTf node in such case."; + return Status(error::Code::INVALID_ARGUMENT, err_msg.c_str()); + } + + // Lets build the conversion node and specify src as input. + TF_CHECK_OK(NodeBuilder((*g)->NewName("Mkl2Tf"), "MklToTf") + .Input(src, e->src_output()) + .Input(src, e->src_output()+1) // Mkl tensor immediately + // follows Tf tensor. + .Device(src->def().device()) // We want to get conversion node + // on same device as source node. + .Attr("T", src_datatype) + .Finalize(&**g, &conversion_node)); + + CHECK_NOTNULL(conversion_node); + if (GetNodeAttr(src->def(), "data_format", &data_format) == Status::OK()) { + conversion_node->AddAttr("data_format", data_format); + } + + // Get assigned device from source node and apply it to conversion node. + // We want conversion node to be on the same device as the source node. + conversion_node->set_assigned_device_name(src->assigned_device_name()); + + // Set the Mkl layer label for this op. + conversion_node->AddAttr("_kernel", mkl_layer_registry::kMklLayerLabel); + + // Now that we have added edge from src->conversion_node, let's add edge from + // output of conversion_node to the dest node.
Since conversion_node + // has only 1 output, the src_output of conversion_node is 0. + CHECK_NOTNULL((*g)->AddEdge(conversion_node, 0, dst, e->dst_input())); + + VLOG(1) << "MklToTfConversionPass: Inserting Conversion node on: " + << src->type_string() << " and " << dst->type_string() + << " successful."; + + // Remove src->dst edge now. + (*g)->RemoveEdge(e); + return Status::OK(); +} + +bool MklToTfConversionPass::RunPass(std::unique_ptr* g) { + bool result = false; + + CHECK_NOTNULL(g); + + DumpGraph("Before MklToTfConversionPass", &**g); + + // Since we are looking for mkl-supported op node immediately + // followed by non-mkl op node, we will just iterate over edge + // set of the graph. + // Vector of candidate edges, i.e. edges whose source and destination + // qualify for inserting a conversion node. + std::vector candidate_edges; + + for (const Edge *e : (*g)->edges()) { + Node* src = e->src(); + Node* dst = e->dst(); + + // We skip control edges. + if (e->IsControlEdge()) { + continue; + } + + VLOG(1) << "MklToTfConversionPass: InsertConversionNodes: " + << src->type_string() << " and " << dst->type_string(); + + // Only sources that carry a "T" attribute are considered. + DataType src_datatype = DT_INVALID; + if (GetNodeAttr(src->def(), "T", &src_datatype) != Status::OK()) { + continue; + } + // The destination may be a non-Mkl node without a "T" attribute, so its + // datatype is only checked later, in InsertConversionNodeOnEdge. + + // Check if src is Mkl-compliant, while dst is not Mkl-compliant. + if (IsMklSupportedOp(src->type_string()) && + !IsMklSupportedOp(dst->type_string())) { + VLOG(1) << "MklToTfConversionPass: Scheduled nodes " << src->name() + << " and " << dst->name() << " for inserting conversion nodes"; + candidate_edges.push_back(const_cast(e)); + } + } + + // Process all candidate edges and insert conversion nodes on them.
+ for (Edge* e : candidate_edges) { + // Even if we insert conversion node on a single edge, we + // need to return true. + string src_name = e->src()->name(); + string dst_name = e->dst()->name(); + if (InsertConversionNodeOnEdge(g, e) == Status::OK()) { + VLOG(1) << "MklToTfConversionPass: Inserted conversion " + << "node on edge between " << src_name << " and " << dst_name; + result = true; + } + } + + DumpGraph("After MklToTfConversionPass", &**g); + + // We need to return true even if we insert one conversion node + // anywhere in the graph. + return result; +} + +////////////////////////////////////////////////////////////////////////////// +// Run function for the pass +////////////////////////////////////////////////////////////////////////////// + +bool InsertMklToTfConversionNodes(std::unique_ptr* g) { + return MklToTfConversionPass().RunPass(g); +} + +Status MklToTfConversionPass::Run( + const GraphOptimizationPassOptions& options) { + if (options.graph == nullptr && options.partition_graphs == nullptr) { + return Status::OK(); + } + + auto process_graph = [&](std::unique_ptr* g) { + // RunPass rewrites the graph in place, so no ownership hand-off is + // needed; missing or empty graphs are skipped rather than dereferenced. + if (g != nullptr && *g != nullptr) { + RunPass(g); + } + }; + + if (kMklTfConvPassGroup != OptimizationPassRegistry::POST_PARTITIONING) { + // For any pre-partitioning phase, graph is stored in options.graph. + process_graph(options.graph); + } else if (options.partition_graphs != nullptr) { + // For post partitioning phase, graphs are stored in + // options.partition_graphs. + for (auto& pg : *options.partition_graphs) { + process_graph(&pg.second); + } + } + + return Status::OK(); +} + +} // namespace tensorflow + +#endif diff --git a/tensorflow/core/graph/mkl_tfconversion_pass.h b/tensorflow/core/graph/mkl_tfconversion_pass.h new file mode 100644 index 00000000000000..0562d8b3cd4ffa --- /dev/null +++ b/tensorflow/core/graph/mkl_tfconversion_pass.h @@ -0,0 +1,36 @@ +/* Copyright 2017 The TensorFlow Authors.
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// An optimization pass that inserts MklToTf conversion nodes in the graph + +#ifndef TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_ +#define TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_ + +#ifdef INTEL_MKL + +#include +#include +#include "tensorflow/core/graph/graph.h" + +namespace tensorflow { +// Interface to invoke the pass for unit test +// +// Returns true if and only if 'g' is mutated. +extern bool InsertMklToTfConversionNodes(std::unique_ptr* g); +} // namespace tensorflow + +#endif + +#endif // TENSORFLOW_CORE_GRAPH_MKL_TFCONVERSION_PASS_H_ diff --git a/tensorflow/core/graph/mkl_tfconversion_pass_test.cc b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc new file mode 100644 index 00000000000000..103ff295b3cc63 --- /dev/null +++ b/tensorflow/core/graph/mkl_tfconversion_pass_test.cc @@ -0,0 +1,243 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include "tensorflow/core/graph/mkl_tfconversion_pass.h" + +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/graph/testlib.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/random/simple_philox.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/stringprintf.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +namespace tensorflow { +namespace { + +class MklToTfConversionPass : public ::testing::Test { + public: + MklToTfConversionPass() : graph_(OpRegistry::Global()) {} + + static void InitGraph(const string& s, Graph* graph) { + GraphDef graph_def; + + auto parser = protobuf::TextFormat::Parser(); + CHECK(parser.MergeFromString(s, &graph_def)) << s; + GraphConstructorOptions opts; + TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, graph)); + } + + void InitGraph(const string& s) { + InitGraph(s, &graph_); + original_ = CanonicalGraphString(&graph_); + } + + static bool IncludeNode(const Node* n) { return n->IsOp(); } + + static string EdgeId(const Node* n, int index) { + if (index == 0) { + return n->name(); + } else if (index == Graph::kControlSlot) { + return strings::StrCat(n->name(), ":control"); + } else { + return strings::StrCat(n->name(), ":", index); + } + } + + string CanonicalGraphString(Graph* g) { + std::vector nodes; + std::vector edges; + for (const Node* n : g->nodes()) { + if (IncludeNode(n)) { + nodes.push_back(strings::StrCat(n->name(), "(", 
n->type_string(), ")")); + } + } + for (const Edge* e : g->edges()) { + if (IncludeNode(e->src()) && IncludeNode(e->dst())) { + edges.push_back(strings::StrCat(EdgeId(e->src(), e->src_output()), "->", + EdgeId(e->dst(), e->dst_input()))); + } + } + // Canonicalize + std::sort(nodes.begin(), nodes.end()); + std::sort(edges.begin(), edges.end()); + return strings::StrCat(str_util::Join(nodes, ";"), "|", + str_util::Join(edges, ";")); + } + + string DoRunMklToTfConversionPass() { + string before = CanonicalGraphString(&graph_); + LOG(ERROR) << "Before MklToTf conversion pass: " << before; + + std::unique_ptr* ug = new std::unique_ptr(&graph_); + InsertMklToTfConversionNodes(ug); + + string result = CanonicalGraphString(&graph_); + LOG(ERROR) << "After MklToTf conversion pass: " << result; + return result; + } + + const string& OriginalGraph() const { return original_; } + + Graph graph_; + string original_; +}; + +REGISTER_OP("Input").Output("o: float").SetIsStateful(); +REGISTER_OP("HalfInput").Output("o: half").SetIsStateful(); +REGISTER_OP("MklInput").Output("o: uint8").SetIsStateful(); + +TEST_F(MklToTfConversionPass, Basic) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'C' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + " input: ['A', 'B'] }" + "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }" + " input: ['A', 'B'] }"); + EXPECT_EQ(DoRunMklToTfConversionPass(), + "A(Input);B(Input);C(Mul);D(Mul)|" + "A->C;A->D;B->C:1;B->D:1"); +} + +// MklConv2D followed by Non-Mkl layer +// C=MklConv2D(A,M,B,N); E=Sub(C,D) +TEST_F(MklToTfConversionPass, Positive) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'M' op: 'MklInput'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: 
false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'M', 'B', 'N']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'Sub'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['C', 'D']}"); + EXPECT_EQ(DoRunMklToTfConversionPass(), + "A(Input);B(Input);C(MklConv2D);D(Input);E(Sub);M(MklInput);" + "Mkl2Tf/_0(MklToTf);N(MklInput)|A->C;B->C:2;C->Mkl2Tf/_0;" + "C:1->Mkl2Tf/_0:1;D->E:1;M->C:1;Mkl2Tf/_0->E;N->C:3"); +} + +// MklConv2D followed by Non-Mkl layer, and MklConv2D uses half type +// C=MklConv2D(A,M,B,N); E=Sub(C,D) +// MklToTf node should be inserted. +TEST_F(MklToTfConversionPass, Positive_Type) { + InitGraph( + "node { name: 'A' op: 'HalfInput'}" + "node { name: 'M' op: 'MklInput'}" + "node { name: 'B' op: 'HalfInput'}" + "node { name: 'N' op: 'MklInput'}" + "node { name: 'C' op: 'MklConv2D'" + " attr { key: 'T' value { type: DT_HALF } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'M', 'B', 'N']}" + "node { name: 'D' op: 'HalfInput'}" + "node { name: 'E' op: 'Sub'" + " attr {key: 'T' value { type: DT_HALF } }" + " input: ['C', 'D']}"); + EXPECT_EQ(DoRunMklToTfConversionPass(), + "A(HalfInput);B(HalfInput);C(MklConv2D);D(HalfInput);" + "E(Sub);M(MklInput);Mkl2Tf/_0(MklToTf);N(MklInput)|" + "A->C;B->C:2;C->Mkl2Tf/_0;C:1->Mkl2Tf/_0:1;D->E:1;" + "M->C:1;Mkl2Tf/_0->E;N->C:3"); +} + +// C=Conv2D(A,B); E=BiasAdd(C,D); Z=Sub(E,Y); +// There is no Mkl layer so no conversion op should be inserted. 
+TEST_F(MklToTfConversionPass, Negative_NoMklLayer) { + InitGraph( + "node { name: 'A' op: 'Input'}" + "node { name: 'B' op: 'Input'}" + "node { name: 'C' op: 'Conv2D'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " attr { key: 'use_cudnn_on_gpu' value { b: false } }" + " attr { key: 'strides' value { list: {i: 1, i:1, i:1, i:1} } }" + " attr { key: 'padding' value { s: 'SAME' } }" + " input: ['A', 'B']}" + "node { name: 'D' op: 'Input'}" + "node { name: 'E' op: 'BiasAdd'" + " attr { key: 'T' value { type: DT_FLOAT } }" + " attr { key: 'data_format' value { s: 'NCHW' } }" + " input: ['C', 'D'] }" + "node { name: 'Y' op: 'Input'}" + "node { name: 'Z' op: 'Sub'" + " attr {key: 'T' value { type: DT_FLOAT } }" + " input: ['E', 'Y']}"); + EXPECT_EQ(DoRunMklToTfConversionPass(), + "A(Input);B(Input);C(Conv2D);D(Input);E(BiasAdd);Y(Input);Z(Sub)|" + "A->C;B->C:1;C->E;D->E:1;E->Z;Y->Z:1"); +} + +static void BM_RunMklToTfConversionPass(int iters, int op_nodes) { + testing::StopTiming(); + string s; + for (int in = 0; in < 10; in++) { + s += strings::Printf("node { name: 'in%04d' op: 'Input'}", in); + } + random::PhiloxRandom philox(301, 17); + random::SimplePhilox rnd(&philox); + for (int op = 0; op < op_nodes; op++) { + s += strings::Printf( + "node { name: 'op%04d' op: 'Mul' attr { key: 'T' value { " + "type: DT_FLOAT } } input: ['in%04d', 'in%04d' ] }", + op, rnd.Uniform(10), rnd.Uniform(10)); + } + + bool first = true; + while (iters > 0) { + Graph* graph = new Graph(OpRegistry::Global()); + MklToTfConversionPass::InitGraph(s, graph); + int N = graph->num_node_ids(); + if (first) { + testing::SetLabel(strings::StrCat("Per graph node. 
Nodes: ", N)); + first = false; + } + { + testing::StartTiming(); + std::unique_ptr ug(graph); + InsertMklToTfConversionNodes(&ug); + testing::StopTiming(); + } + iters -= N; // Our benchmark units are individual graph nodes, + // not whole graphs + // delete graph; + } +} +BENCHMARK(BM_RunMklToTfConversionPass)->Arg(1000)->Arg(10000); + +} // namespace +} // namespace tensorflow + +#endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 9740f96a6d36f9..3b79d4c3db58fd 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -688,8 +688,15 @@ tf_kernel_library( tf_kernel_library( name = "transpose_op", - prefix = "transpose_op", - deps = ARRAY_DEPS, + srcs = [ + "transpose_op.cc", + ] + if_mkl([ + "mkl_transpose_op.cc", + ]), + hdrs = ["transpose_op.h"], + deps = ARRAY_DEPS + if_mkl([ + "//third_party/mkl:intel_binary_blob", + ]), ) tf_kernel_library( @@ -1735,6 +1742,22 @@ tf_cuda_cc_test( ], ) +tf_cuda_cc_test( + name = "resize_benchmark_test", + srcs = ["resize_op_benchmark_test.cc"], + deps = [ + ":image", + ":ops_testutil", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + ], +) + cc_library( name = "io", deps = [ @@ -4376,7 +4399,7 @@ tf_cc_test( if_mkl( tf_kernel_library( - name = "mkl_ops", + name = "mkl_matmul_op", prefix = "mkl_matmul", deps = [ ":math", @@ -4385,6 +4408,40 @@ if_mkl( ), ) +if_mkl( + tf_kernel_library( + name = "mkl_conv_op", + prefix = "mkl_conv", + deps = [ + ":bounds_check", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:nn_ops_op_lib", + "//third_party/mkl:intel_binary_blob", + ], + ), +) + +if_mkl( + tf_kernel_library( + name = "mkl_tfconv_op", + prefix = "mkl_tfconv", + deps = [ + 
":bounds_check", + ":ops_util", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:nn_ops_op_lib", + "//third_party/mkl:intel_binary_blob", + ], + ), +) + # ----------------------------------------------------------------------------- # Google-internal targets. These must be at the end for syncrepo. diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc index 09300737c768cb..e8f32693f7a794 100644 --- a/tensorflow/core/kernels/adjust_hue_op.cc +++ b/tensorflow/core/kernels/adjust_hue_op.cc @@ -1,5 +1,4 @@ /* Copyright 2016 The TensorFlow Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -12,16 +11,24 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#define EIGEN_USE_THREADS + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU +#endif + #include -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_types.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/kernels/adjust_hue_op.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/work_sharder.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -77,6 +84,7 @@ template class AdjustHueOp; namespace internal { + // Helper function to convert a RGB color to H-and-V-range. H is in the range // of [0, 6] instead of the normal [0, 1] static void rgb_to_hv_range(float r, float g, float b, float* h, float* v_min, @@ -185,6 +193,7 @@ static void hv_range_to_rgb(float h, float v_min, float v_max, float* r, } } // namespace internal + template <> class AdjustHueOp : public AdjustHueOpBase { public: @@ -237,4 +246,34 @@ class AdjustHueOp : public AdjustHueOpBase { REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_CPU), AdjustHueOp); +#if GOOGLE_CUDA +template <> +class AdjustHueOp : public AdjustHueOpBase { + public: + explicit AdjustHueOp(OpKernelConstruction* context) + : AdjustHueOpBase(context) {} + + virtual void DoCompute(OpKernelContext* context, const ComputeOptions& options) override { + const Tensor* input = options.input; + const Tensor* delta = options.delta; + Tensor* output = options.output; + const int64 number_of_elements = input->NumElements(); + GPUDevice device = context->eigen_gpu_device(); + const auto stream = device.stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream 
available.")); + if (number_of_elements > 0) { + const float* input_data = input->flat().data(); + const float* delta_h = delta->flat().data(); + float* const output_data = output->flat().data(); + functor::AdjustHueGPU()(&device, number_of_elements, input_data, delta_h, + output_data); + } + } +}; + +REGISTER_KERNEL_BUILDER(Name("AdjustHue").Device(DEVICE_GPU), AdjustHueOp); + +#endif + +//} // namespace functor } // namespace tensorflow diff --git a/tensorflow/core/kernels/adjust_hue_op.h b/tensorflow/core/kernels/adjust_hue_op.h new file mode 100644 index 00000000000000..5b30bd85405b8f --- /dev/null +++ b/tensorflow/core/kernels/adjust_hue_op.h @@ -0,0 +1,42 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_ +#define TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_ + +#if GOOGLE_CUDA +#define EIGEN_USE_GPU + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { + +typedef Eigen::GpuDevice GPUDevice; + +namespace functor { +// Launches the hue-adjustment CUDA kernel; only delta[0] is read as the offset. +struct AdjustHueGPU { + void operator()( + GPUDevice* device, + const int64 number_of_elements, + const float* const input, + const float* const delta, + float* const output + ); +}; + +} // namespace functor +} // namespace tensorflow + +#endif // GOOGLE_CUDA +#endif // TENSORFLOW_CORE_KERNELS_ADJUST_HUE_OP_H_ diff --git a/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc new file mode 100644 index 00000000000000..2fc69ed101c3a1 --- /dev/null +++ b/tensorflow/core/kernels/adjust_hue_op_gpu.cu.cc @@ -0,0 +1,141 @@ +/* Copyright 2016 The TensorFlow Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/kernels/adjust_hue_op.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +namespace tensorflow { +namespace internal { + +namespace { + typedef struct RgbTuple { + float r; + float g; + float b; + } RgbTuple; + + typedef struct HsvTuple { + float h; + float s; + float v; + } HsvTuple; +} // anon namespace + +__device__ HsvTuple rgb2hsv_cuda(const float r, const float g, const float b) +{ + HsvTuple tuple; + const float M = fmaxf(r, fmaxf(g, b)); + const float m = fminf(r, fminf(g, b)); + const float chroma = M - m; + float h = 0.0f, s = 0.0f; + // hue, scaled from the [0, 6] sextant range down to [0, 1] + if (chroma > 0.0f) { + if (M == r) { + const float num = (g - b) / chroma; + const float sign = copysignf(1.0f, num); + h = ((sign < 0.0f) * 6.0f + sign * fmodf(sign * num, 6.0f)) / 6.0f; + } else if (M == g) { + h = ((b - r) / chroma + 2.0f) / 6.0f; + } else { + h = ((r - g) / chroma + 4.0f) / 6.0f; + } + } else { + h = 0.0f; + } + // saturation (zero for black, where M == 0) + if (M > 0.0f) { + s = chroma / M; + } else { + s = 0.0f; + } + tuple.h = h; + tuple.s = s; + tuple.v = M; + return tuple; +} + +__device__ RgbTuple hsv2rgb_cuda(const float h, const float s, const float v) +{ + RgbTuple tuple; + const float new_h = h * 6.0f; + const float chroma = v * s; + const float x = chroma * (1.0f - fabsf(fmodf(new_h, 2.0f) - 1.0f)); + const float new_m = v - chroma; + const bool between_0_and_1 = new_h >= 0.0f && new_h < 1.0f; + const bool between_1_and_2 = new_h >= 1.0f && new_h < 2.0f; + const bool between_2_and_3 = new_h >= 2.0f && new_h < 3.0f; + const bool between_3_and_4 = new_h >= 3.0f && new_h < 4.0f; + const bool between_4_and_5 = new_h >= 4.0f && new_h < 5.0f; + const bool between_5_and_6 = new_h >= 5.0f && new_h < 6.0f; + tuple.r = chroma * (between_0_and_1 || between_5_and_6) + + x * (between_1_and_2 || between_4_and_5) +
new_m; + tuple.g = chroma * (between_1_and_2 || between_2_and_3) + + x * (between_0_and_1 || between_3_and_4) + new_m; + tuple.b = chroma * (between_3_and_4 || between_4_and_5) + + x * (between_2_and_3 || between_5_and_6) + new_m; + return tuple; +} + +__global__ void adjust_hue_nhwc(const int64 number_elements, + const float * const __restrict__ input, + float * const output, + const float * const hue_delta) +{ + // One thread per pixel (3 contiguous floats, NHWC); widen to int64 before scaling. + const int64 idx = (static_cast<int64>(blockDim.x) * blockIdx.x + threadIdx.x) * 3; + // bounds check: the whole RGB triple must lie inside the buffer + if (idx + 2 >= number_elements) { + return; + } + const float delta = hue_delta[0]; + const HsvTuple hsv = rgb2hsv_cuda(input[idx], input[idx + 1], input[idx + 2]); + // hue adjustment, wrapped back into [0, 1) + float new_h = fmodf(hsv.h + delta, 1.0f); + if (new_h < 0.0f) { + new_h = fmodf(1.0f + new_h, 1.0f); + } + const RgbTuple rgb = hsv2rgb_cuda(new_h, hsv.s, hsv.v); + output[idx] = rgb.r; + output[idx + 1] = rgb.g; + output[idx + 2] = rgb.b; +} +} // namespace internal + + +namespace functor { + +void AdjustHueGPU::operator()( + GPUDevice* device, + const int64 number_of_elements, + const float* const input, + const float* const delta, + float* const output +) { + const auto stream = device->stream(); + const CudaLaunchConfig config = GetCudaLaunchConfig(number_of_elements, *device); + const int threads_per_block = config.thread_per_block; + const int block_count = (number_of_elements + threads_per_block - 1) / threads_per_block; + internal::adjust_hue_nhwc<<>>( + number_of_elements, input, output, delta + ); +} +} // namespace functor +} // namespace tensorflow +#endif // GOOGLE_CUDA diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h index 492c358a526297..f93921d4a59a8f 100644 --- a/tensorflow/core/kernels/eigen_pooling.h +++ b/tensorflow/core/kernels/eigen_pooling.h @@ -338,6 +338,7 @@ struct AvgPoolMeanReducer { // In the case below, 0xd8 implies (false_mask) ?
(b) : (a) // For details, refer to the vpternlogd instruction table at // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-vol-2c-manual.pdf + #define psel(a, b, false_mask) \ _mm512_castsi512_ps(_mm512_ternarylogic_epi32( \ _mm512_castps_si512(a), _mm512_castps_si512(b), \ diff --git a/tensorflow/core/kernels/fixed_length_record_reader_op.cc b/tensorflow/core/kernels/fixed_length_record_reader_op.cc index 008ea110170b16..637a6cef95da5e 100644 --- a/tensorflow/core/kernels/fixed_length_record_reader_op.cc +++ b/tensorflow/core/kernels/fixed_length_record_reader_op.cc @@ -40,8 +40,8 @@ class FixedLengthRecordReader : public ReaderBase { // On success: // * input_buffer_ != nullptr, - // * input_buffer_->Tell() == footer_bytes_ - // * file_pos_limit_ == file size - header_bytes_ + // * input_buffer_->Tell() == header_bytes_ + // * file_pos_limit_ == file size - footer_bytes_ Status OnWorkStartedLocked() override { record_number_ = 0; uint64 file_size = 0; diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc new file mode 100644 index 00000000000000..93791851b1e1cd --- /dev/null +++ b/tensorflow/core/kernels/mkl_conv_ops.cc @@ -0,0 +1,457 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/nn_ops.cc. 
+#ifdef INTEL_MKL + +#include +#include +#include +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/tensor_slice.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/strings/numbers.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "tensorflow/core/util/mkl_util.h" +#include "third_party/mkl/include/mkl_dnn.h" +#include "third_party/mkl/include/mkl_dnn_types.h" + +namespace tensorflow { + +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +template +class MklConv2DOp : public OpKernel { + public: + ~MklConv2DOp() {} + + explicit MklConv2DOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); + string data_format; + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); + OP_REQUIRES(context, FormatFromString(data_format, &data_format_), + errors::InvalidArgument("Invalid data format")); + OP_REQUIRES(context, strides_.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + + const int64 stride_n = GetTensorDim(strides_, data_format_, 'N'); + const int64 stride_c = GetTensorDim(strides_, data_format_, 'C'); + OP_REQUIRES( + context, stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + 
OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); + } + + void Compute(OpKernelContext* context) override { + const Tensor& input = MklGetInput(context, 0); + GetMklShape(context, 0, &(mkl_params_.input_shape)); + bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor(); + + const Tensor& filter = MklGetInput(context, 1); + MklShape mkl_filter_shape; + GetMklShape(context, 1, &mkl_filter_shape); + CHECK(!mkl_filter_shape.IsMklTensor()) + << "Conv filter should not be in MKL Layout"; + + if (biasEnabled) { + const Tensor& bias = MklGetInput(context, 2); + OP_REQUIRES(context, bias.dims() == 1, + errors::InvalidArgument("bias must be 1-dimensional: ", + bias.shape().DebugString())); + } + + if (!input_in_mkl_format) { + OP_REQUIRES(context, input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().DebugString())); + } + + OP_REQUIRES(context, filter.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional: ", + filter.shape().DebugString())); + + for (int i = 0; i < 3; i++) { + OP_REQUIRES(context, FastBoundsCheck(filter.dim_size(i), + std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); + } + + const int64 input_depth = input_in_mkl_format + ? mkl_params_.input_shape.GetSizes()[2] + : GetTensorDim(input, data_format_, 'C'); + OP_REQUIRES( + context, input_depth == filter.dim_size(2), + errors::InvalidArgument("input and filter must have the same depth: ", + input_depth, " vs ", filter.dim_size(2))); + // The last dimension for filter is out_depth. + const int out_depth = static_cast(filter.dim_size(3)); + + // The second dimension for input is rows/height. + // The first dimension for filter is rows/height. + const int64 input_rows_raw = input_in_mkl_format + ? 
mkl_params_.input_shape.GetSizes()[1] + : GetTensorDim(input, data_format_, 'H'); + OP_REQUIRES(context, FastBoundsCheck(input_rows_raw, + std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); + const int input_rows = static_cast(input_rows_raw); + const int filter_rows = static_cast(filter.dim_size(0)); + + // The third dimension for input is columns/width. + // The second dimension for filter is columns/width. + const int64 input_cols_raw = input_in_mkl_format + ? mkl_params_.input_shape.GetSizes()[0] + : GetTensorDim(input, data_format_, 'W'); + OP_REQUIRES(context, FastBoundsCheck(input_cols_raw, + std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); + const int input_cols = static_cast(input_cols_raw); + const int filter_cols = static_cast(filter.dim_size(1)); + + // The first dimension for input is batch. + const int64 input_batch_raw = input_in_mkl_format + ? mkl_params_.input_shape.GetSizes()[3] + : GetTensorDim(input, data_format_, 'N'); + OP_REQUIRES(context, FastBoundsCheck(input_batch_raw, + std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); + const int batch = static_cast(input_batch_raw); + + // For now we take the stride from the second and third dimensions only (we + // do not support striding on the batch or depth dimension). 
+ const int stride_rows = GetTensorDim(strides_, data_format_, 'H'); + const int stride_cols = GetTensorDim(strides_, data_format_, 'W'); + + int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; + OP_REQUIRES_OK(context, + GetWindowedOutputSize(input_rows, filter_rows, stride_rows, + padding_, &out_rows, &pad_rows)); + OP_REQUIRES_OK(context, + GetWindowedOutputSize(input_cols, filter_cols, stride_cols, + padding_, &out_cols, &pad_cols)); + TensorShape out_shape = + ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth); + + // Output tensor is of the following dimensions: + // [ in_batch, out_rows, out_cols, out_depth ] + Tensor* output = nullptr; + + // If there is nothing to compute, return. + if (out_shape.num_elements() == 0) { + // TODO(jbobba): Verify correctness here + // Need semantics for Null MKL tensor + return; + } + + if (batch == 0) { + // Nothing to do, allocate output tensor and return + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(false); + AllocateOutputSetMklshape(context, 0, &output, input.shape(), + mkl_output_mkl_shape); + return; + } + + // Create MKL convolution primitives + mkl_params_.in_dims = input_in_mkl_format + ? 
mkl_params_.input_shape.GetDimension() + : input.dims(); + mkl_params_.filter_dims = filter.dims(); + mkl_params_.in_sizes[0] = static_cast(input_cols); + mkl_params_.in_sizes[1] = static_cast(input_rows); + mkl_params_.in_sizes[2] = static_cast(input_depth); + mkl_params_.in_sizes[3] = static_cast(batch); + mkl_params_.out_sizes[0] = static_cast(out_cols); + mkl_params_.out_sizes[1] = static_cast(out_rows); + mkl_params_.out_sizes[2] = static_cast(out_depth); + mkl_params_.out_sizes[3] = static_cast(batch); + mkl_params_.input_offset[0] = static_cast(-pad_cols); + mkl_params_.input_offset[1] = static_cast(-pad_rows); + mkl_params_.conv_stride[0] = static_cast(stride_cols); + mkl_params_.conv_stride[1] = static_cast(stride_rows); + + GetStridesFromSizes(data_format_, mkl_params_.out_strides, + mkl_params_.out_sizes); + GetStridesFromSizes(data_format_, mkl_params_.in_strides, + mkl_params_.in_sizes); + + // TF filter dimension order (out_depth, in_depth, cols, rows) -> + // MKL filter dimension order (out_depth, in_depth, rows, cols) + mkl_params_.filter_sizes[0] = filter.dim_size(1); // cols + mkl_params_.filter_sizes[1] = filter.dim_size(0); // rows + mkl_params_.filter_sizes[2] = filter.dim_size(2); // in_depth + mkl_params_.filter_sizes[3] = filter.dim_size(3); // out_depth + + // TF filter layout - (rows, cols, in_depth, out_depth) + mkl_params_.filter_strides[0] = + filter.dim_size(2) * filter.dim_size(3); // cols + mkl_params_.filter_strides[1] = + filter.dim_size(1) * filter.dim_size(2) * filter.dim_size(3); // rows + mkl_params_.filter_strides[2] = filter.dim_size(3); // in_depth + mkl_params_.filter_strides[3] = 1; // out_depth + + if (biasEnabled) { + const Tensor& bias = MklGetInput(context, 2); + mkl_params_.bias_sizes[0] = {static_cast(bias.dim_size(0))}; + mkl_params_.bias_strides[0] = {1}; + } + + // Create Convolution Primitive + if (biasEnabled) { + CHECK_EQ(dnnConvolutionCreateForwardBias_F32( + &mkl_prim_convolution_fwd_, nullptr, + 
dnnAlgorithmConvolutionDirect, mkl_params_.in_dims, + mkl_params_.in_sizes, mkl_params_.out_sizes, + mkl_params_.filter_sizes, mkl_params_.conv_stride, + mkl_params_.input_offset, dnnBorderZeros), + E_SUCCESS); + } else { + CHECK_EQ(dnnConvolutionCreateForward_F32( + &mkl_prim_convolution_fwd_, nullptr, + dnnAlgorithmConvolutionDirect, mkl_params_.in_dims, + mkl_params_.in_sizes, mkl_params_.out_sizes, + mkl_params_.filter_sizes, mkl_params_.conv_stride, + mkl_params_.input_offset, dnnBorderZeros), + E_SUCCESS); + } + + TensorShape mkl_output_tf_shape; + MklShape mkl_output_mkl_shape; + mkl_output_mkl_shape.SetMklTensor(true); + mkl_output_mkl_shape.SetMklLayout(mkl_prim_convolution_fwd_, + dnnResourceDst); + mkl_output_mkl_shape.SetTfLayout(mkl_params_.in_dims, mkl_params_.out_sizes, + mkl_params_.out_strides); + mkl_output_tf_shape.AddDim( + dnnLayoutGetMemorySize_F32( + static_cast(mkl_output_mkl_shape.GetMklLayout())) / + sizeof(T)); + AllocateOutputSetMklshape(context, 0, &output, mkl_output_tf_shape, + mkl_output_mkl_shape); + mkl_conv_res_[dnnResourceDst] = + static_cast(output->flat().data()); + + MklCreateInputLayouts(context); + + Tensor mkl_tmp_input_buf_tensor, mkl_tmp_filter_buf_tensor, + mkl_tmp_bias_buf_tensor; // Temp tensor used to allocate tmp + // buffers + MklPrepareConvolutionInputs(context, &mkl_tmp_input_buf_tensor, + &mkl_tmp_filter_buf_tensor, + &mkl_tmp_bias_buf_tensor); + + // Execute convolution + CHECK_EQ(dnnExecute_F32(mkl_prim_convolution_fwd_, mkl_conv_res_), + E_SUCCESS); + + MklCleanup(); + } + + private: + typedef struct { + int in_dims; + size_t in_sizes[4]; + size_t in_strides[4]; + size_t out_sizes[4]; + size_t out_strides[4]; + int filter_dims; + size_t filter_sizes[4]; + size_t filter_strides[4]; + size_t bias_sizes[1]; + size_t bias_strides[1]; + int input_offset[2]; + size_t conv_stride[2]; + MklShape input_shape; + } MklConv2DOpParams; + + // Create MKL dnnLayout_t objects for tensors coming into the layer + void 
MklCreateInputLayouts(OpKernelContext* context) { + bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor(); + if (input_in_mkl_format) { + mkl_lt_input_ = + static_cast(mkl_params_.input_shape.GetCurLayout()); + } else { + CHECK_EQ( + dnnLayoutCreate_F32(&mkl_lt_input_, mkl_params_.in_dims, + mkl_params_.in_sizes, mkl_params_.in_strides), + E_SUCCESS); + } + + CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_filter_, mkl_params_.filter_dims, + mkl_params_.filter_sizes, + mkl_params_.filter_strides), + E_SUCCESS); + + if (biasEnabled) { + CHECK_EQ(dnnLayoutCreate_F32(&mkl_lt_bias_, 1, mkl_params_.bias_sizes, + mkl_params_.bias_strides), + E_SUCCESS); + } + } + + // Compare incoming tensor layouts with MKL preferred layouts and convert + // data to the preferred layout if necessary + void MklPrepareConvolutionInputs(OpKernelContext* context, + Tensor* mkl_tmp_input_buf_tensor, + Tensor* mkl_tmp_filter_buf_tensor, + Tensor* mkl_tmp_bias_buf_tensor) { + bool mkl_convert_input, mkl_convert_filter, mkl_convert_bias; + dnnPrimitive_t mkl_prim_convert_filter, mkl_prim_convert_bias, + mkl_prim_convert_input; + dnnLayout_t mkl_lt_internal_filter, mkl_lt_internal_bias, + mkl_lt_internal_input; + void *mkl_buf_convert_input, *mkl_buf_convert_filter, + *mkl_buf_convert_bias; + mkl_prim_convert_filter = nullptr; + mkl_prim_convert_bias = nullptr; + mkl_prim_convert_input = nullptr; + mkl_lt_internal_filter = nullptr; + mkl_lt_internal_bias = nullptr; + mkl_lt_internal_input = nullptr; + mkl_buf_convert_input = nullptr; + mkl_buf_convert_filter = nullptr; + mkl_buf_convert_bias = nullptr; + + // Compare with internal layouts and convert if needed + const Tensor& input = MklGetInput(context, 0); + void* mkl_buf_input = + const_cast(static_cast(input.flat().data())); + CHECK_EQ( + dnnLayoutCreateFromPrimitive_F32( + &mkl_lt_internal_input, mkl_prim_convolution_fwd_, dnnResourceSrc), + E_SUCCESS); + mkl_convert_input = + !dnnLayoutCompare_F32(mkl_lt_internal_input, mkl_lt_input_); + 
if (mkl_convert_input) { + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_input, mkl_lt_input_, + mkl_lt_internal_input), + E_SUCCESS); + AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, mkl_lt_internal_input, + &mkl_buf_convert_input); + CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_input, mkl_buf_input, + mkl_buf_convert_input), + E_SUCCESS); + dnnDelete_F32(mkl_prim_convert_input); + } + dnnLayoutDelete_F32(mkl_lt_internal_input); + + mkl_conv_res_[dnnResourceSrc] = + (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input; + + const Tensor& filter = MklGetInput(context, 1); + void* mkl_buf_filter = + const_cast(static_cast(filter.flat().data())); + CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_filter, + mkl_prim_convolution_fwd_, + dnnResourceFilter), + E_SUCCESS); + mkl_convert_filter = + !dnnLayoutCompare_F32(mkl_lt_internal_filter, mkl_lt_filter_); + if (mkl_convert_filter) { + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_filter, mkl_lt_filter_, + mkl_lt_internal_filter), + E_SUCCESS); + AllocTmpBuffer(context, mkl_tmp_filter_buf_tensor, mkl_lt_internal_filter, + &mkl_buf_convert_filter); + CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_filter, mkl_buf_filter, + mkl_buf_convert_filter), + E_SUCCESS); + dnnDelete_F32(mkl_prim_convert_filter); + } + dnnLayoutDelete_F32(mkl_lt_internal_filter); + + mkl_conv_res_[dnnResourceFilter] = + (mkl_convert_filter) ? 
mkl_buf_convert_filter : mkl_buf_filter; + + if (biasEnabled) { + const Tensor& bias = MklGetInput(context, 2); + void* mkl_buf_bias = + const_cast(static_cast(bias.flat().data())); + CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&mkl_lt_internal_bias, + mkl_prim_convolution_fwd_, + dnnResourceBias), + E_SUCCESS); + mkl_convert_bias = + !dnnLayoutCompare_F32(mkl_lt_internal_bias, mkl_lt_bias_); + if (mkl_convert_bias) { + CHECK_EQ(dnnConversionCreate_F32(&mkl_prim_convert_bias, mkl_lt_bias_, + mkl_lt_internal_bias), + E_SUCCESS); + AllocTmpBuffer(context, mkl_tmp_bias_buf_tensor, mkl_lt_internal_bias, + &mkl_buf_convert_bias); + CHECK_EQ(dnnConversionExecute_F32(mkl_prim_convert_bias, mkl_buf_bias, + mkl_buf_convert_bias), + E_SUCCESS); + dnnDelete_F32(mkl_prim_convert_bias); + } + dnnLayoutDelete_F32(mkl_lt_internal_bias); + + mkl_conv_res_[dnnResourceBias] = + (mkl_convert_bias) ? mkl_buf_convert_bias : mkl_buf_bias; + } + } + + void MklCleanup() { + bool input_in_mkl_format = mkl_params_.input_shape.IsMklTensor(); + dnnDelete_F32(mkl_prim_convolution_fwd_); + if (!input_in_mkl_format) dnnLayoutDelete_F32(mkl_lt_input_); + dnnLayoutDelete_F32(mkl_lt_filter_); + if (biasEnabled) dnnLayoutDelete_F32(mkl_lt_bias_); + } + + std::vector strides_; + Padding padding_; + TensorFormat data_format_; + + MklConv2DOpParams mkl_params_; + dnnPrimitive_t mkl_prim_convolution_fwd_ = nullptr; + void* mkl_conv_res_[dnnResourceNumber]; + dnnLayout_t mkl_lt_filter_ = nullptr, mkl_lt_bias_ = nullptr, + mkl_lt_input_ = nullptr; + + +}; + +#define REGISTER_MKL_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MklConv2D").Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_layer_registry::kMklLayerLabel), \ + MklConv2DOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("MklConv2DWithBias").Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_layer_registry::kMklLayerLabel), \ + MklConv2DOp); + +TF_CALL_float(REGISTER_MKL_CPU); + +} // namespace tensorflow +#endif // INTEL_MKL diff --git 
a/tensorflow/core/kernels/mkl_tfconv_op.cc b/tensorflow/core/kernels/mkl_tfconv_op.cc new file mode 100644 index 00000000000000..5925a5b7c11b09 --- /dev/null +++ b/tensorflow/core/kernels/mkl_tfconv_op.cc @@ -0,0 +1,135 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifdef INTEL_MKL + +#include +#include +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/numeric_op.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/core/kernels/ops_util.h" +#include "tensorflow/core/platform/macros.h" + +#include "third_party/mkl/include/mkl_dnn_types.h" +#include "third_party/mkl/include/mkl_dnn.h" +#include "tensorflow/core/util/mkl_util.h" + +namespace tensorflow { + typedef Eigen::ThreadPoolDevice CPUDevice; + +/////////////////////////////////////////////////////////// +// Op kernel +/////////////////////////////////////////////////////////// + +template +class MklToTfOp : public OpKernel { + public: + explicit MklToTfOp(OpKernelConstruction* context) : OpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format_str)); + OP_REQUIRES_OK(context, context->GetAttr("T", 
&op_data_type)); + } + + void Compute(OpKernelContext* context) override { + // 1. Check that input tensor is in MKL format. + const Tensor& input_tensor = MklGetInput(context, 0); + MklShape input_shape; + GetMklShape(context, 0, &input_shape); + + // if input is already in Tf format, then just copy input tensor to output. + if (!input_shape.IsMklTensor()) { + context->set_output(0, input_tensor); + VLOG(1) << "MKLToTFConversion: No conversion needed, " + << "copying input to output"; + return; + } + + // Check that input data type is same as operator data type and that it is + // same as output data type. + DataType input_data_type = input_type(0); + DataType output_data_type = output_type(0); + CHECK_EQ(op_data_type, input_data_type); + CHECK_EQ(op_data_type, output_data_type); + + // We need to recreate Tf tensor shape based on sizes and strides. + // Ideally, we should know what the data_format is, but that attribute + // to this op is not reliable. So below, we rely on sorting logic where + // we sort strides first and then sizes. + TensorShape output_shape; + std::vector> shape_size; + for (size_t i = 0; i < input_shape.GetDimension(); i++) { + VLOG(1) << "Size: " << input_shape.GetSizes()[i] + << ", Strides: " << input_shape.GetStrides()[i]; + shape_size.push_back(std::make_pair(input_shape.GetSizes()[i], + input_shape.GetStrides()[i])); + } + + std::sort(shape_size.begin(), shape_size.end(), []( + std::pair a, std::pair b) { + return (a.second > b.second) || + (a.second == b.second && a.first > b.first); + }); + + for (std::pair s_s : shape_size) { + VLOG(1) << "Added dimension: " << s_s.first; + output_shape.AddDim(s_s.first); + } + + // Allocate output tensor. + Tensor* output_tensor = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, output_shape, &output_tensor)); + + // 3. Get input and output layout pointers. + dnnLayout_t output_layout = static_cast( + input_shape.GetTfLayout()); + + // 4. Execute DNNConversion.
+ void *input_buffer = static_cast(const_cast( + input_tensor.flat().data())); + void *output_buffer = static_cast(const_cast( + output_tensor->flat().data())); + input_shape.GetConvertedFlatData(output_layout, input_buffer, + output_buffer); + + VLOG(1) << "MKLToTFConversion complete successfully."; + } + + private: + /// Data format of the operation + string data_format_str; + + /// Data type of the operation + DataType op_data_type; +}; + +/////////////////////////////////////////////////////////// +// Register kernel +/////////////////////////////////////////////////////////// + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("MklToTf").Device(DEVICE_CPU).TypeConstraint("T") \ + .Label(mkl_layer_registry::kMklLayerLabel), \ + MklToTfOp); + +TF_CALL_float(REGISTER_CPU); +#undef REGISTER_CPU +} // namespace tensorflow +#endif /* INTEL_MKL */ diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc new file mode 100644 index 00000000000000..c00674d72f1fc0 --- /dev/null +++ b/tensorflow/core/kernels/mkl_transpose_op.cc @@ -0,0 +1,67 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/array_ops.cc. 
+ +#ifdef INTEL_MKL +#define EIGEN_USE_THREADS + +#include "tensorflow/core/kernels/transpose_op.h" +#include "tensorflow/core/kernels/transpose_functor.h" +#include "third_party/mkl/include/mkl_trans.h" + +namespace tensorflow { + +// output = TransposeOp(T input, T perm) takes a tensor +// of type T and rank N, and a permutation of 0, 1, ..., N-1. It +// shuffles the dimensions of the input tensor according to permutation. +// +// Specifically, the returned tensor output meets the following condition: +// 1) output.dims() == input.dims(); +// 2) output.dim_size(i) == input.dim_size(perm[i]); +// 3) output.tensor(i_0, i_1, ..., i_N-1) == +// input.tensor(j_0, j_1, ..., j_N-1), +// where i_s == j_{perm[s]} +// +// REQUIRES: perm is a vector of int32. +// REQUIRES: input.dims() == perm.size(). +// REQUIRES: perm is a permutation. + +Status MklTransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, + Tensor* out) { + if (in.dims() == 2 && in.dtype() == DT_FLOAT) { + float* user_o = out->flat().data(); + const float* user_i = in.flat().data(); + + // Documentation here: https://software.intel.com/en-us/node/520863 + // Parameters: (ordering:row-major, operation:transpose, num_rows, num_cols, + // alpha (for scaling), array, dist_bet_adjacent_cols/rows + // (source), array, dist_bet_adjacent_cols/rows (dest)) + mkl_somatcopy('R', 'T', in.dim_size(0), in.dim_size(1), 1, + user_i, in.dim_size(1), + user_o, in.dim_size(0)); + + return Status::OK(); + } + + // Fall back to Eigen if the transpose parameters are not supported by MKL + typedef Eigen::ThreadPoolDevice CPUDevice; + return ::tensorflow::DoTranspose(ctx->eigen_device(), in, perm, + out); +} // MklTransposeCpuOp::DoTranspose +} // namespace tensorflow + +#endif // INTEL_MKL diff --git a/tensorflow/core/kernels/pooling_ops_common.cc b/tensorflow/core/kernels/pooling_ops_common.cc index ddc9c9823b13fc..3fe16c66b853b1 100644 --- a/tensorflow/core/kernels/pooling_ops_common.cc +++
b/tensorflow/core/kernels/pooling_ops_common.cc @@ -64,6 +64,8 @@ PoolParameters::PoolParameters(OpKernelContext* context, OP_REQUIRES_OK( context, GetWindowedOutputSize(tensor_in_cols, window_cols, col_stride, padding, &out_width, &pad_cols)); + pad_depth = 0; + out_depth = depth; } else { // Our current version of depthwise max pooling does not support // any padding, and expects the depth_window to equal the diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc index 9bfbe2a61a1b2b..f1627135c58550 100644 --- a/tensorflow/core/kernels/resize_nearest_neighbor_op.cc +++ b/tensorflow/core/kernels/resize_nearest_neighbor_op.cc @@ -66,9 +66,7 @@ class ResizeNearestNeighborOp : public OpKernel { const int64 in_x = std::min(static_cast(floorf(x * st.width_scale)), (st.in_width - 1)); - for (int c = 0; c < st.channels; ++c) { - output_data(b, y, x, c) = input_data(b, in_y, in_x, c); - } + std::copy_n(&input_data(b, in_y, in_x, 0), st.channels, &output_data(b, y, x, 0)); } } } diff --git a/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc b/tensorflow/core/kernels/resize_op_benchmark_test.cc similarity index 64% rename from tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc rename to tensorflow/core/kernels/resize_op_benchmark_test.cc index 07cf653c2fe5db..4d0805a737f77b 100644 --- a/tensorflow/core/kernels/resize_nearest_neighbor_op_benchmark_test.cc +++ b/tensorflow/core/kernels/resize_op_benchmark_test.cc @@ -21,7 +21,8 @@ limitations under the License. 
namespace tensorflow { -static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) { +static Graph* BM_Resize(const char* algorithm, + int batches, int width, int height) { Graph* g = new Graph(OpRegistry::Global()); Tensor in(DT_FLOAT, TensorShape({batches, width, height, 3})); in.flat().setRandom(); @@ -32,21 +33,26 @@ static Graph* BM_ResizeNearestNeighbor(int batches, int width, int height) { out_size_flat(1) = height * 2; Node* ret; - NodeBuilder(g->NewName("n"), "ResizeNearestNeighbor") - .Input(test::graph::Constant(g, in)) - .Input(test::graph::Constant(g, out_size)) - .Finalize(g, &ret); + Status s = NodeBuilder(g->NewName("n"), algorithm) + .Input(test::graph::Constant(g, in)) + .Input(test::graph::Constant(g, out_size)) + .Finalize(g, &ret); + assert(s.ok()); return g; } -#define BM_ResizeNearestNeighborDev(DEVICE, B, W, H) \ - static void BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H(int iters) { \ +#define BM_ResizeDev(DEVICE, ALGORITHM, B, W, H) \ + static void BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H(int iters) { \ testing::ItemsProcessed(iters* B* W* H * 3); \ - test::Benchmark(#DEVICE, BM_ResizeNearestNeighbor(B, W, H)).Run(iters); \ + test::Benchmark(#DEVICE, BM_Resize(#ALGORITHM, B, W, H)).Run(iters); \ } \ - BENCHMARK(BM_ResizeNearestNeighbor_##DEVICE##_##B##_##W##_##H) + BENCHMARK(BM_Resize_##ALGORITHM##_##DEVICE##_##B##_##W##_##H) -BM_ResizeNearestNeighborDev(cpu, 1, 499, 499); -BM_ResizeNearestNeighborDev(gpu, 1, 499, 499); +BM_ResizeDev(cpu, ResizeNearestNeighbor, 10, 499, 499); +BM_ResizeDev(gpu, ResizeNearestNeighbor, 10, 499, 499); + +BM_ResizeDev(cpu, ResizeBilinear, 10, 499, 499); +BM_ResizeDev(gpu, ResizeBilinear, 10, 499, 499); } // namespace tensorflow + diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc index 4d303f0173206f..fb2ceb4a4a8212 100644 --- a/tensorflow/core/kernels/transpose_op.cc +++ b/tensorflow/core/kernels/transpose_op.cc @@ -180,6 +180,20 @@ 
Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, out); } +#ifdef INTEL_MKL +#define REGISTER(T) \ + REGISTER_KERNEL_BUILDER(Name("Transpose") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("Tperm") \ + .HostMemory("perm"), \ + MklTransposeCpuOp); +TF_CALL_ALL_TYPES(REGISTER); +REGISTER(bfloat16); +#undef REGISTER + +#else // INTEL_MKL + #define REGISTER(T) \ REGISTER_KERNEL_BUILDER(Name("Transpose") \ .Device(DEVICE_CPU) \ @@ -190,6 +204,7 @@ Status TransposeCpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, TF_CALL_ALL_TYPES(REGISTER) REGISTER(bfloat16); #undef REGISTER +#endif // INTEL_MKL #if GOOGLE_CUDA Status TransposeGpuOp::DoTranspose(OpKernelContext* ctx, const Tensor& in, diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h index 5f40bcecc18dd0..a69eecc2f83226 100644 --- a/tensorflow/core/kernels/transpose_op.h +++ b/tensorflow/core/kernels/transpose_op.h @@ -41,6 +41,17 @@ class TransposeCpuOp : public TransposeOp { gtl::ArraySlice perm, Tensor* out) override; }; +#ifdef INTEL_MKL +class MklTransposeCpuOp : public TransposeOp { + public: + explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} + + protected: + Status DoTranspose(OpKernelContext* ctx, const Tensor& in, + gtl::ArraySlice perm, Tensor* out) override; +}; +#endif // INTEL_MKL + class TransposeGpuOp : public TransposeOp { public: explicit TransposeGpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {} diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index eee9961b288275..e56b27b0c01e7b 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -2502,4 +2502,45 @@ scale_after_normalization: A bool indicating whether the resulted tensor needs to be multiplied with gamma. 
)doc"); +#ifdef INTEL_MKL +REGISTER_OP("MklConv2D") + .Input("input: T") + .Input("mkl_input: uint8") + .Input("filter: T") + .Input("mkl_filter: uint8") + .Output("output: T") + .Output("mkl_output: uint8") + .Attr("T: {half, float, double}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .SetShapeFn(shape_inference::Conv2DShape) + .Doc(R"doc( +MKL version of Conv2D +)doc"); + +REGISTER_OP("MklConv2DWithBias") + .Input("input: T") + .Input("mkl_input: uint8") + .Input("filter: T") + .Input("mkl_filter: uint8") + .Input("bias: T") + .Input("mkl_bias: uint8") + .Output("output: T") + .Output("mkl_output: uint8") + .Attr("T: {half, float, double}") + .Attr("strides: list(int)") + .Attr("use_cudnn_on_gpu: bool = true") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnetDataFormatAttrString()); + +REGISTER_OP("MklToTf") + .Input("input: T") + .Input("mkl_input: uint8") + .Output("output: T") + .Attr("T: {half, float, double}") + .Attr(GetConvnetDataFormatAttrString()); +#endif // INTEL_MKL + } // namespace tensorflow diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt index d17b52306d1298..aa2177dba4174e 100644 --- a/tensorflow/core/ops/ops.pbtxt +++ b/tensorflow/core/ops/ops.pbtxt @@ -25758,6 +25758,59 @@ op { summary: "Computes the sum along segments of a tensor." description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n`(output[i] = sum_{j...} data[j...]` where the sum is over tuples `j...` such\nthat `segment_ids[j...] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\nrange of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n
\n\n
" } +op { + name: "UnsortedSegmentSum" + input_arg { + name: "data" + type_attr: "T" + } + input_arg { + name: "segment_ids" + description: "A tensor whose shape is a prefix of `data.shape`." + type_attr: "Tindices" + } + input_arg { + name: "num_segments" + type: DT_INT32 + } + output_arg { + name: "output" + description: "Has same shape as data, except for the first `segment_ids.rank`\ndimensions, which are replaced with a single dimension which has size\n`num_segments`." + type_attr: "T" + } + attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_FLOAT + type: DT_DOUBLE + type: DT_INT64 + type: DT_INT32 + type: DT_UINT8 + type: DT_UINT16 + type: DT_INT16 + type: DT_INT8 + type: DT_QINT8 + type: DT_QUINT8 + type: DT_QINT32 + type: DT_HALF + } + } + } + attr { + name: "Tindices" + type: "type" + allowed_values { + list { + type: DT_INT32 + type: DT_INT64 + } + } + } + summary: "Computes the max along segments of a tensor." + description: "Read [the section on\nSegmentation](../../api_docs/python/math_ops.md#segmentation) for an explanation\nof segments.\n\nComputes a tensor such that\n\\\\(output_i = \\sum_j data_j\\\\) where sum is over `j` such\nthat `segment_ids[j] == i`. Unlike `SegmentSum`, `segment_ids`\nneed not be sorted and need not cover all values in the full\n range of valid values.\n\nIf the sum is empty for a given segment ID `i`, `output[i] = 0`.\n\n`num_segments` should equal the number of distinct segment IDs.\n\n
\n\n
" +} op { name: "Unstage" output_arg { diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 5db8b68048e3f2..f21a646ca1c530 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -4,11 +4,6 @@ load("@protobuf//:protobuf.bzl", "cc_proto_library") load("@protobuf//:protobuf.bzl", "py_proto_library") load("//tensorflow:tensorflow.bzl", "if_not_mobile") -# configure may change the following lines -WITH_GCP_SUPPORT = False -WITH_HDFS_SUPPORT = False -WITH_JEMALLOC = True - # Appends a suffix to a list of deps. def tf_deps(deps, suffix): tf_deps = [] @@ -196,61 +191,54 @@ def tf_additional_test_srcs(): def tf_kernel_tests_linkstatic(): return 0 -# jemalloc only enabled on Linux for now. -# TODO(jhseu): Enable on other platforms. def tf_additional_lib_defines(): - defines = [] - if WITH_JEMALLOC: - defines += select({ - "//tensorflow:linux_x86_64": [ - "TENSORFLOW_USE_JEMALLOC" - ], - "//conditions:default": [], - }) - return defines + return select({ + "//tensorflow:with_jemalloc": ["TENSORFLOW_USE_JEMALLOC"], + "//conditions:default": [], + }) def tf_additional_lib_deps(): - deps = [] - if WITH_JEMALLOC: - deps += select({ - "//tensorflow:linux_x86_64": ["@jemalloc"], - "//conditions:default": [], - }) - return deps + return select({ + "//tensorflow:with_jemalloc": ["@jemalloc"], + "//conditions:default": [], + }) def tf_additional_core_deps(): - deps = [] - if WITH_GCP_SUPPORT: - deps.append("//tensorflow/core/platform/cloud:gcs_file_system") - if WITH_HDFS_SUPPORT: - deps.append("//tensorflow/core/platform/hadoop:hadoop_file_system") - return deps + return select({ + "//tensorflow:with_gcp_support": [ + "//tensorflow/core/platform/cloud:gcs_file_system", + ], + "//conditions:default": [], + }) + select({ + "//tensorflow:with_hdfs_support": [ + "//tensorflow/core/platform/hadoop:hadoop_file_system", + ], + "//conditions:default": [], + 
}) # TODO(jart, jhseu): Delete when GCP is default on. def tf_additional_cloud_op_deps(): - deps = [] - if WITH_GCP_SUPPORT: - deps = select({ + return select({ "//tensorflow:windows": [], "//tensorflow:android": [], "//tensorflow:ios": [], - "//conditions:default": - ["//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib"], - }) - return deps + "//tensorflow:with_gcp_support": [ + "//tensorflow/contrib/cloud:bigquery_reader_ops_op_lib", + ], + "//conditions:default": [], + }) # TODO(jart, jhseu): Delete when GCP is default on. def tf_additional_cloud_kernel_deps(): - deps = [] - if WITH_GCP_SUPPORT: - deps = select({ + return select({ "//tensorflow:windows": [], "//tensorflow:android": [], "//tensorflow:ios": [], - "//conditions:default": - ["//tensorflow/contrib/cloud/kernels:bigquery_reader_ops"], - }) - return deps + "//tensorflow:with_gcp_support": [ + "//tensorflow/contrib/cloud/kernels:bigquery_reader_ops", + ], + "//conditions:default": [], + }) def tf_lib_proto_parsing_deps(): return [ diff --git a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index 23a7b9065a62ad..79f97c12347b8c 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -2,8 +2,6 @@ # The functions in this file might be referred by tensorflow.bzl. They have to # be separate to avoid cyclic references. 
-WITH_XLA_SUPPORT = False - def tf_cuda_tests_tags(): return ["local"] @@ -11,16 +9,16 @@ def tf_sycl_tests_tags(): return ["local"] def tf_additional_plugin_deps(): - deps = [] - if WITH_XLA_SUPPORT: - deps.append("//tensorflow/compiler/jit") - return deps + return select({ + "//tensorflow:with_xla_support": ["//tensorflow/compiler/jit"], + "//conditions:default": [], + }) def tf_additional_xla_deps_py(): return [] def tf_additional_license_deps(): - licenses = [] - if WITH_XLA_SUPPORT: - licenses.append("@llvm//:LICENSE.TXT") - return licenses + return select({ + "//tensorflow:with_xla_support": ["@llvm//:LICENSE.TXT"], + "//conditions:default": [], + }) diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc index 1d0c9dc8cdf1fe..66bda85b2f690b 100644 --- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc +++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc @@ -58,6 +58,7 @@ class LibHDFS { std::function hdfsBuilderConnect; std::function hdfsNewBuilder; std::function hdfsBuilderSetNameNode; + std::function hdfsConfGetStr; std::function hdfsBuilderSetKerbTicketCachePath; std::function hdfsCloseFile; @@ -85,6 +86,7 @@ class LibHDFS { BIND_HDFS_FUNC(hdfsBuilderConnect); BIND_HDFS_FUNC(hdfsNewBuilder); BIND_HDFS_FUNC(hdfsBuilderSetNameNode); + BIND_HDFS_FUNC(hdfsConfGetStr); BIND_HDFS_FUNC(hdfsBuilderSetKerbTicketCachePath); BIND_HDFS_FUNC(hdfsCloseFile); BIND_HDFS_FUNC(hdfsPread); @@ -147,6 +149,18 @@ Status HadoopFileSystem::Connect(StringPiece fname, hdfsFS* fs) { hdfsBuilder* builder = hdfs_->hdfsNewBuilder(); if (scheme == "file") { hdfs_->hdfsBuilderSetNameNode(builder, nullptr); + } else if (scheme == "viewfs") { + char *defaultFS = NULL; + hdfs_->hdfsConfGetStr("fs.defaultFS", &defaultFS); + StringPiece defaultScheme, defaultCluster, defaultPath; + io::ParseURI(defaultFS, &defaultScheme, &defaultCluster, &defaultPath); + + if (scheme != defaultScheme || namenode != defaultCluster) 
{ + return errors::Unimplemented("viewfs is only supported as a fs.defaultFS."); + } + // The default NameNode configuration will be used (from the XML configuration files). See: + // https://github.com/tensorflow/tensorflow/blob/v1.0.0/third_party/hadoop/hdfs.h#L259 + hdfs_->hdfsBuilderSetNameNode(builder, "default"); } else { hdfs_->hdfsBuilderSetNameNode(builder, nn.c_str()); } @@ -478,5 +492,6 @@ Status HadoopFileSystem::Stat(const string& fname, FileStatistics* stats) { } REGISTER_FILE_SYSTEM("hdfs", HadoopFileSystem); +REGISTER_FILE_SYSTEM("viewfs", HadoopFileSystem); } // namespace tensorflow diff --git a/tensorflow/core/platform/macros.h b/tensorflow/core/platform/macros.h index aad35890af67b6..b6fb18bd9942db 100644 --- a/tensorflow/core/platform/macros.h +++ b/tensorflow/core/platform/macros.h @@ -53,6 +53,17 @@ limitations under the License. #define TF_SCANF_ATTRIBUTE(string_index, first_to_check) #endif +// Control visibility outside .so +#if defined(COMPILER_MSVC) +# ifdef TF_COMPILE_LIBRARY +# define TF_EXPORT __declspec(dllexport) +# else +# define TF_EXPORT __declspec(dllimport) +# endif // TF_COMPILE_LIBRARY +#else +# define TF_EXPORT __attribute__((visibility("default"))) +#endif // COMPILER_MSVC + // GCC can be told that a certain branch is not likely to be taken (for // instance, a CHECK failure), and use that information in static analysis. // Giving it this information can help it optimize for the common case in diff --git a/tensorflow/core/platform/windows/cpu_info.h b/tensorflow/core/platform/windows/cpu_info.h index 77a1946e61b9f2..d6e78dbc8f9f25 100644 --- a/tensorflow/core/platform/windows/cpu_info.h +++ b/tensorflow/core/platform/windows/cpu_info.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ #define TENSORFLOW_PLATFORM_WINDOWS_CPU_INFO_H_ +// included so __cpuidex function is available for GETCPUID on Windows +#include <intrin.h> + // Byte order defines provided by gcc.
MSVC doesn't define those so // we define them here. // We assume that all windows platform out there are little endian. diff --git a/tensorflow/core/platform/windows/intrinsics_port.h b/tensorflow/core/platform/windows/intrinsics_port.h index a4fa1e997109de..e52f5b16464e96 100644 --- a/tensorflow/core/platform/windows/intrinsics_port.h +++ b/tensorflow/core/platform/windows/intrinsics_port.h @@ -24,6 +24,9 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #define _mm_load_pd1 _mm_load1_pd + +// only define these intrinsics if immintrin.h doesn't have them (VS2015 and earlier) +#if _MSC_VER < 1910 static inline int _mm256_extract_epi32(__m256i a, const int i) { @@ -39,3 +42,4 @@ _mm256_insert_epi32(__m256i a, int b, const int i) } #endif #endif +#endif diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc index facadc7f57f2c8..72e7e06e65cea6 100644 --- a/tensorflow/core/platform/windows/windows_file_system.cc +++ b/tensorflow/core/platform/windows/windows_file_system.cc @@ -230,11 +230,9 @@ Status WindowsFileSystem::NewRandomAccessFile( result->reset(); // Open the file for read-only random access - // Random access is to disable read-ahead as the system reads too much data // Open in async mode which makes Windows allow more parallelism even // if we need to do sync I/O on top of it. - DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS | - FILE_FLAG_OVERLAPPED; + DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_OVERLAPPED; // Shared access is necessary for tests to pass // almost all tests would work with a possible exception of fault_injection. 
DWORD share_mode = FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE; @@ -306,8 +304,8 @@ Status WindowsFileSystem::NewReadOnlyMemoryRegionFromFile( result->reset(); Status s = Status::OK(); - // Open the file for read-only random access - DWORD file_flags = FILE_ATTRIBUTE_READONLY | FILE_FLAG_RANDOM_ACCESS; + // Open the file for read-only + DWORD file_flags = FILE_ATTRIBUTE_READONLY; // Open in async mode which makes Windows allow more parallelism even // if we need to do sync I/O on top of it. diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h new file mode 100644 index 00000000000000..6d09995b51bb5b --- /dev/null +++ b/tensorflow/core/util/mkl_util.h @@ -0,0 +1,296 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ +#define TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ +#ifdef INTEL_MKL +#include "third_party/mkl/include/mkl_dnn.h" +#include "third_party/mkl/include/mkl_dnn_types.h" +#include "third_party/mkl/include/mkl_service.h" + +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/util/tensor_format.h" + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +// The file contains a number of utility classes and functions used by MKL +// enabled kernels + +namespace tensorflow { + +// This class encapsulates all the meta data that is associated with an MKL +// tensor. A tensor is an MKL tensor if it was created as the result of an +// MKL operation, and did not go through a conversion to a standard +// Tensorflow tensor. 
+ +class MklShape { + public: + MklShape() {} + TF_DISALLOW_COPY_AND_ASSIGN(MklShape); // Cannot copy + + ~MklShape() { + if (sizes_) delete[] sizes_; + if (strides_) delete[] strides_; + if (mklLayout_) CHECK_EQ(dnnLayoutDelete_F32(mklLayout_), E_SUCCESS); + if (tfLayout_) CHECK_EQ(dnnLayoutDelete_F32(tfLayout_), E_SUCCESS); + } + + const bool IsMklTensor() const { return isMklTensor_; } + + void SetMklTensor(const bool isMklTensor) { isMklTensor_ = isMklTensor; } + + void SetMklLayout(const void* primitive, size_t resourceType) { + CHECK_EQ( + dnnLayoutCreateFromPrimitive_F32(&mklLayout_, (dnnPrimitive_t)primitive, + (dnnResourceType_t)resourceType), + E_SUCCESS); + } + + void SetTfLayout(const size_t dimension, const size_t* sizes, + const size_t* strides) { + dimension_ = dimension; + if (dimension > 0) { // MKl doesn't support dimension 0 + sizes_ = new size_t[dimension]; + strides_ = new size_t[dimension]; + + for (int ii = 0; ii < dimension; ii++) { + sizes_[ii] = sizes[ii]; + strides_[ii] = strides[ii]; + } + CHECK_EQ(dnnLayoutCreate_F32(&tfLayout_, dimension, sizes, strides), + E_SUCCESS); + } + } + + const dnnLayout_t GetMklLayout() const { return mklLayout_; } + const dnnLayout_t GetTfLayout() const { return tfLayout_; } + const dnnLayout_t GetCurLayout() const { + return isMklTensor_ ? 
mklLayout_ : tfLayout_; + } + size_t GetDimension() const { return dimension_; } + const size_t* GetSizes() const { return sizes_; } + const size_t* GetStrides() const { return strides_; } + + void GetConvertedFlatData(dnnLayout_t targetLayout, void* input, + void* output) const { + dnnLayout_t curLayout; + if (isMklTensor_) + curLayout = mklLayout_; + else + curLayout = tfLayout_; + dnnPrimitive_t convert; + CHECK_EQ(dnnConversionCreate_F32(&convert, curLayout, targetLayout), + E_SUCCESS); + CHECK_EQ(dnnConversionExecute_F32(convert, input, output), E_SUCCESS); + CHECK_EQ(dnnDelete_F32(convert), E_SUCCESS); + } + +// The following methods are used for serializing and de-serializing the +// contents of the mklshape object. +// The data is serialized in this order +// isMklTensor_ +// dimension_ +// sizes +// strides +// mklLayout_ +// tfLayout_ + +#define SIZE_OF_MKL_DNN_BUF \ + (dnnLayoutSerializationBufferSize_F32()) // Size of buffer needed to + // serialize dnn_layout pointer + +// Size of buffer to hold the serialized object, the size is computed as follows +// sizeof(isMklTensor_) + sizeof(dimension_) + sizeof(sizes) + sizeof(strides) +// + sizeof(mklLayout_ buffer) + sizeof(tfLayout_ buffer) + +#define SIZE_OF_MKL_SERIAL_DATA(dims) \ + (2 * sizeof(size_t) + 2 * dims * sizeof(size_t) + 2 * SIZE_OF_MKL_DNN_BUF) + +// First we need to define some macro for offsets into the serial buffer where +// different elements of Mklshape is written/read from + +#define IS_MKL_TENSOR_OFFSET 0 +// Location from start of buffer where isMklTensor_ is serialized +#define DIMS_OFFSET \ + (IS_MKL_TENSOR_OFFSET + sizeof(size_t)) // Location of dimension_ +#define SIZES_OFFSET(dims) \ + (DIMS_OFFSET + \ + sizeof(size_t)) // Location of sizes. Note dim is not used here, left here + // to make macros consistent. 
+#define STRIDES_OFFSET(dims) \ + (SIZES_OFFSET(dims) + dims * sizeof(size_t)) // Location of strides +#define MKL_LAYOUT_OFFSET(dims) \ + (STRIDES_OFFSET(dims) + dims * sizeof(size_t)) // Location of mklLayout_ +#define TF_LAYOUT_OFFSET(dims) \ + (MKL_LAYOUT_OFFSET(dims) + SIZE_OF_MKL_DNN_BUF) // Location of tfLayout_ + + // TODO(agramesh1) make sure to create a const to share with rewrite pass + // for min size of MKL metadata tensor. + + void DeSerializeMklShape(const unsigned char* buf, size_t buf_size) { + CHECK(buf_size >= sizeof(size_t)) << "Bufsize too small in DeSerialize"; + // Make sure buffer holds at least isMklTensor_ + isMklTensor_ = + *reinterpret_cast(buf + IS_MKL_TENSOR_OFFSET) != 0; + + if (isMklTensor_) { // If it is an MKL Tensor then read the rest + dimension_ = *(reinterpret_cast(buf + DIMS_OFFSET)); + CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_)) + << "Bufsize too small in DeSerialize"; + sizes_ = new size_t[dimension_]; + strides_ = new size_t[dimension_]; + for (int i = 0; i < dimension_; i++) { + sizes_[i] = + reinterpret_cast(buf + SIZES_OFFSET(dimension_))[i]; + strides_[i] = reinterpret_cast( + buf + STRIDES_OFFSET(dimension_))[i]; + } + CHECK_EQ(dnnLayoutDeserialize_F32(&mklLayout_, + buf + MKL_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + CHECK_EQ(dnnLayoutDeserialize_F32(&tfLayout_, + buf + TF_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + } + } + + void SerializeMklShape(unsigned char* buf, size_t buf_size) const { + CHECK(buf_size >= SIZE_OF_MKL_SERIAL_DATA(dimension_)) + << "Bufsize too small to Serialize"; + *reinterpret_cast(buf + IS_MKL_TENSOR_OFFSET) = + isMklTensor_ ? 
1 : 0; + if (isMklTensor_) { + *(reinterpret_cast(buf + DIMS_OFFSET)) = dimension_; + for (int i = 0; i < dimension_; i++) { + reinterpret_cast(buf + SIZES_OFFSET(dimension_))[i] = + sizes_[i]; + reinterpret_cast(buf + STRIDES_OFFSET(dimension_))[i] = + strides_[i]; + } + CHECK_EQ(dnnLayoutSerialize_F32(mklLayout_, + buf + MKL_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + CHECK_EQ( + dnnLayoutSerialize_F32(tfLayout_, buf + TF_LAYOUT_OFFSET(dimension_)), + E_SUCCESS); + } + } + + private: + bool isMklTensor_ = + false; // Flag to indicate if the tensor is an MKL tensor or not + dnnLayout_t mklLayout_ = nullptr; // Pointer to the MKL layout + dnnLayout_t tfLayout_ = nullptr; // Pointer to layout of corresponding + // Tensorflow tensor, used when conversion from MKL to standard tensor + size_t dimension_ = 0; + size_t* sizes_ = nullptr; // Required by MKL for conversions + size_t* strides_ = nullptr; // Required by MKL for conversions +}; + +int inline GetTensorDataIndex(int n) { + return 2 * n; // index corresponding to nth input/output tensor +} + +int inline GetTensorMetaDataIndex(int n) { + // index corresponding to meta data of nth input/output tensor + return 2 * n + 1; +} +// Get the MKL shape from the second string tensor +inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) { + mklshape->DeSerializeMklShape( + ctext->input(GetTensorMetaDataIndex(n)).flat().data(), + ctext->input(GetTensorMetaDataIndex(n)).flat().size() * + sizeof(uint8)); +} + +// Gets the actual input +inline const Tensor& MklGetInput(OpKernelContext* ctext, int n) { + return ctext->input(GetTensorDataIndex(n)); +} + +// Allocate the output tensor, create a second output tensor that will contain +// the MKL shape serialized +inline void AllocateOutputSetMklshape(OpKernelContext* ctext, int n, + Tensor** output, + const TensorShape& tfshape, + const MklShape& mklshape) { + Tensor* second_tensor = nullptr; + TensorShape second_shape; + 
second_shape.AddDim(SIZE_OF_MKL_SERIAL_DATA(mklshape.GetDimension())); + OP_REQUIRES_OK( + ctext, ctext->allocate_output(GetTensorDataIndex(n), tfshape, output)); + OP_REQUIRES_OK(ctext, ctext->allocate_output(GetTensorMetaDataIndex(n), + second_shape, &second_tensor)); + mklshape.SerializeMklShape( + second_tensor->flat().data(), + second_tensor->flat().size() * sizeof(uint8)); +} + +// Allocates a temp tensor and returns the data buffer for temporary storage. +// Currently +// we only support F32, will need to templatize if other types are added +inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out, + dnnLayout_t lt_buff, void** buf_out) { + TensorShape tf_shape; + + tf_shape.AddDim( + dnnLayoutGetMemorySize_F32(static_cast(lt_buff)) / + sizeof(float) + + 1); + OP_REQUIRES_OK(context, context->allocate_temp(DataTypeToEnum::v(), + tf_shape, tensor_out)); + *buf_out = static_cast(tensor_out->flat().data()); +} + +inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides, + const size_t* sizes) { + // MKL requires strides in NCHW + if (data_format == FORMAT_NHWC) { + strides[0] = sizes[2]; + strides[1] = sizes[0] * sizes[2]; + strides[2] = 1; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + } else { + strides[0] = 1; + strides[1] = sizes[0]; + strides[2] = sizes[0] * sizes[1]; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + } +} + +namespace mkl_layer_registry { + +static const char* kMklLayerLabel = "MklLayer"; +static const string kMklLayerLabelPattern = "label='MklLayer'"; + +// Check whether opname is registered as MKL-compliant in the registry. 
+// +// @input: name of the op +// @return: true if opname is registered as Mkl layer op +static inline bool IsMklLayer(const std::string& op_name) { + string kernel = KernelsRegisteredForOp(op_name); + return kernel.find(kMklLayerLabelPattern) != string::npos; +} + +} // namespace mkl_layer_registry + +} // namespace tensorflow +#endif // INTEL_MKL +#endif // TENSORFLOW_CORE_UTIL_MKL_UTIL_H_ diff --git a/tensorflow/docs_src/extend/adding_an_op.md b/tensorflow/docs_src/extend/adding_an_op.md index f95298d377c25e..4fc4c2faa2a8af 100644 --- a/tensorflow/docs_src/extend/adding_an_op.md +++ b/tensorflow/docs_src/extend/adding_an_op.md @@ -1056,7 +1056,7 @@ cuda_op_kernel.cu.o -I $TF_INC -fPIC -lcudart Note that if your CUDA libraries are not installed in `/usr/local/lib64`, you'll need to specify the path explicitly in the second (g++) command above. -For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in +For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in `/usr/local/cuda-8.0`. ### Implement the gradient in Python {#implement-gradient} @@ -1160,7 +1160,9 @@ for ZeroOut: ``` `c->set_output(0, c->input(0));` declares that the first output's shape should -be set to the first input's shape. There are a number of common shape functions +be set to the first input's shape. If the output is selected by its index as in the above example, the second parameter of `set_output` should be a `ShapeHandle` object. You can create an empty `ShapeHandle` object by its default constructor. The `ShapeHandle` object for an input with index `idx` can be obtained by `c->input(idx)`. 
+ +There are a number of common shape functions that apply to many ops, such as `shape_inference::UnchangedShape` which can be found in [common_shape_fns.h](https://www.tensorflow.org/code/tensorflow/core/framework/common_shape_fns.h) and used as follows: @@ -1220,7 +1222,15 @@ particular dimension has a very specific value using `InferenceContext::Dim` and `InferenceContext::WithValue`; you can specify that an output dimension is the sum / product of two input dimensions using `InferenceContext::Add` and `InferenceContext::Multiply`. See the `InferenceContext` class for -all of the various shape manipulations you can specify. +all of the various shape manipulations you can specify. The following example sets the +shape of the first output to (n, 3), where the first input has shape (n, ...): + +```c++ +.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) { + c->set_output(0, c->Matrix(c->Dim(c->input(0), 0), 3)); + return Status::OK(); +}); +``` If you have a complicated shape function, you should consider adding a test for validating that various input shape combinations produce the expected output diff --git a/tensorflow/docs_src/get_started/get_started.md b/tensorflow/docs_src/get_started/get_started.md index ae0007359d6b62..b71249de0aa658 100644 --- a/tensorflow/docs_src/get_started/get_started.md +++ b/tensorflow/docs_src/get_started/get_started.md @@ -374,7 +374,7 @@ estimator.fit(input_fn=input_fn, steps=1000) # Here we evaluate how well our model did. In a real example, we would want # to use a separate validation and testing data set to avoid overfitting.
-estimator.evaluate(input_fn=input_fn) +print(estimator.evaluate(input_fn=input_fn)) ``` When run, it produces ``` diff --git a/tensorflow/docs_src/get_started/mnist/mechanics.md b/tensorflow/docs_src/get_started/mnist/mechanics.md index afd903901781ed..b55a5c19ff9db1 100644 --- a/tensorflow/docs_src/get_started/mnist/mechanics.md +++ b/tensorflow/docs_src/get_started/mnist/mechanics.md @@ -351,7 +351,7 @@ training. ```python if step % 100 == 0: - print 'Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration) + print('Step %d: loss = %.2f (%.3f sec)' % (step, loss_value, duration)) ``` #### Visualize the Status @@ -421,19 +421,19 @@ the training and test datasets. The `do_eval()` function is called thrice, for the training, validation, and test datasets. ```python -print 'Training Data Eval:' +print('Training Data Eval:') do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.train) -print 'Validation Data Eval:' +print('Validation Data Eval:') do_eval(sess, eval_correct, images_placeholder, labels_placeholder, data_sets.validation) -print 'Test Data Eval:' +print('Test Data Eval:') do_eval(sess, eval_correct, images_placeholder, diff --git a/tensorflow/docs_src/programmers_guide/faq.md b/tensorflow/docs_src/programmers_guide/faq.md index a400d916540dc4..fa8b6fb7f15f7f 100644 --- a/tensorflow/docs_src/programmers_guide/faq.md +++ b/tensorflow/docs_src/programmers_guide/faq.md @@ -92,12 +92,12 @@ two following snippets of code are equivalent: # Using `Session.run()`. sess = tf.Session() c = tf.constant(5.0) -print sess.run(c) +print(sess.run(c)) # Using `Tensor.eval()`. 
c = tf.constant(5.0) with tf.Session(): - print c.eval() + print(c.eval()) ``` In the second example, the session acts as a diff --git a/tensorflow/docs_src/programmers_guide/variables.md b/tensorflow/docs_src/programmers_guide/variables.md index 9189618368323d..04bfca5f3bdc8d 100644 --- a/tensorflow/docs_src/programmers_guide/variables.md +++ b/tensorflow/docs_src/programmers_guide/variables.md @@ -144,6 +144,11 @@ specified list, of the variables in the graph. The saver object provides methods to run these ops, specifying paths for the checkpoint files to write to or read from. +Note that to restore a model checkpoint without a graph one must first import +the graph from the meta graph file (typical extension is `.meta`). This is +done with @{tf.train.import_meta_graph}, which in turn returns a `Saver` from +which one can then perform a `restore`. + ### Checkpoint Files Variables are saved in binary files that, roughly, contain a map from variable diff --git a/tensorflow/docs_src/tutorials/linear.md b/tensorflow/docs_src/tutorials/linear.md index 3569d47efdf6ae..30daf335bfac4c 100644 --- a/tensorflow/docs_src/tutorials/linear.md +++ b/tensorflow/docs_src/tutorials/linear.md @@ -217,7 +217,7 @@ results = e.evaluate(input_fn=input_fn_test, steps=1) # Print the stats for the evaluation. for key in sorted(results): - print "%s: %s" % (key, results[key]) + print("%s: %s" % (key, results[key])) ``` ### Wide and deep learning diff --git a/tensorflow/docs_src/tutorials/using_gpu.md b/tensorflow/docs_src/tutorials/using_gpu.md index e4e342adfe1a0c..d64cdafdefb287 100644 --- a/tensorflow/docs_src/tutorials/using_gpu.md +++ b/tensorflow/docs_src/tutorials/using_gpu.md @@ -28,7 +28,7 @@ c = tf.matmul(a, b) # Creates a session with log_device_placement set to True. sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # Runs the op.
-print sess.run(c) +print(sess.run(c)) ``` You should see the following output: @@ -61,7 +61,7 @@ with tf.device('/cpu:0'): # Creates a session with log_device_placement set to True. sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # Runs the op. -print sess.run(c) +print(sess.run(c)) ``` You will see that now `a` and `b` are assigned to `cpu:0`. @@ -131,7 +131,7 @@ with tf.device('/gpu:2'): # Creates a session with log_device_placement set to True. sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # Runs the op. -print sess.run(c) +print(sess.run(c)) ``` If the device you have specified does not exist, you will get @@ -160,7 +160,7 @@ with tf.device('/gpu:2'): sess = tf.Session(config=tf.ConfigProto( allow_soft_placement=True, log_device_placement=True)) # Runs the op. -print sess.run(c) +print(sess.run(c)) ``` ## Using multiple GPUs @@ -182,7 +182,7 @@ with tf.device('/cpu:0'): # Creates a session with log_device_placement set to True. sess = tf.Session(config=tf.ConfigProto(log_device_placement=True)) # Runs the op. -print sess.run(sum) +print(sess.run(sum)) ``` You will see the following output. diff --git a/tensorflow/docs_src/tutorials/wide.md b/tensorflow/docs_src/tutorials/wide.md index 079efb201e8720..471811ea1a4184 100644 --- a/tensorflow/docs_src/tutorials/wide.md +++ b/tensorflow/docs_src/tutorials/wide.md @@ -188,7 +188,7 @@ def input_fn(df): categorical_cols = {k: tf.SparseTensor( indices=[[i, 0] for i in range(df[k].size)], values=df[k].values, - shape=[df[k].size, 1]) + dense_shape=[df[k].size, 1]) for k in CATEGORICAL_COLUMNS} # Merges the two dictionaries into one. feature_cols = dict(continuous_cols.items() + categorical_cols.items()) @@ -261,6 +261,8 @@ learned through the model training process we'll go through later. 
We'll do the similar trick to define the other categorical features: ```python +race = tf.contrib.layers.sparse_column_with_hash_bucket("race", hash_bucket_size=100) +marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100) relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100) workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100) occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000) @@ -377,7 +379,7 @@ the labels of the holdout data: ```python results = m.evaluate(input_fn=eval_input_fn, steps=1) for key in sorted(results): - print "%s: %s" % (key, results[key]) + print("%s: %s" % (key, results[key])) ``` The first line of the output should be something like `accuracy: 0.83557522`, diff --git a/tensorflow/docs_src/tutorials/wide_and_deep.md b/tensorflow/docs_src/tutorials/wide_and_deep.md index b5e5981fe13e9c..dd830eeca91cb6 100644 --- a/tensorflow/docs_src/tutorials/wide_and_deep.md +++ b/tensorflow/docs_src/tutorials/wide_and_deep.md @@ -255,7 +255,7 @@ After reading in the data, you can train and evaluate the model: m.fit(input_fn=train_input_fn, steps=200) results = m.evaluate(input_fn=eval_input_fn, steps=1) for key in sorted(results): - print "%s: %s" % (key, results[key]) + print("%s: %s" % (key, results[key])) ``` The first line of the output should be something like `accuracy: 0.84429705`. 
We diff --git a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java index a95e93ce69dfc5..c1a893e9ee4451 100644 --- a/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java +++ b/tensorflow/examples/android/src/org/tensorflow/demo/StylizeActivity.java @@ -432,7 +432,7 @@ private void setStyle(final ImageSlider slider, final float value) { // Everything else is 0, so just pick a suitable slider to push up when the // selected one goes down. if (adapter.items[lastOtherStyle] == slider) { - lastOtherStyle = lastOtherStyle + 1 % NUM_STYLES; + lastOtherStyle = (lastOtherStyle + 1) % NUM_STYLES; } adapter.items[lastOtherStyle].setValue(1.0f - value); } diff --git a/tensorflow/examples/learn/README.md b/tensorflow/examples/learn/README.md index b36986855fcd1f..37157fc2967906 100644 --- a/tensorflow/examples/learn/README.md +++ b/tensorflow/examples/learn/README.md @@ -1,7 +1,7 @@ # TF Learn Examples Learn is a high-level API for TensorFlow that allows you to create, -train, and use deep learning models easily. See the [Quickstart tutorial](../../g3doc/tutorials/tflearn/index.md) +train, and use deep learning models easily. See the [Quickstart tutorial](https://www.tensorflow.org/get_started/tflearn) for an introduction to the API. To run most of these examples, you need to install the `scikit learn` library (`sudo pip install sklearn`). 
diff --git a/tensorflow/examples/learn/boston.py b/tensorflow/examples/learn/boston.py index 2986ff9106b90f..19cfdee5130b89 100644 --- a/tensorflow/examples/learn/boston.py +++ b/tensorflow/examples/learn/boston.py @@ -16,19 +16,22 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from sklearn import cross_validation + +from sklearn import datasets +from sklearn import model_selection from sklearn import metrics from sklearn import preprocessing + import tensorflow as tf def main(unused_argv): # Load dataset - boston = tf.contrib.learn.datasets.load_dataset('boston') + boston = datasets.load_boston() x, y = boston.data, boston.target # Split dataset into train / test - x_train, x_test, y_train, y_test = cross_validation.train_test_split( + x_train, x_test, y_train, y_test = model_selection.train_test_split( x, y, test_size=0.2, random_state=42) # Scale data (training set) to 0 mean and unit standard deviation. diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py index 7b65eb521af3c5..ec2aa9b5731dce 100644 --- a/tensorflow/examples/learn/iris.py +++ b/tensorflow/examples/learn/iris.py @@ -17,7 +17,7 @@ from __future__ import division from __future__ import print_function - +from sklearn import datasets from sklearn import metrics from sklearn import model_selection @@ -26,7 +26,7 @@ def main(unused_argv): # Load dataset. 
- iris = tf.contrib.learn.datasets.load_dataset('iris') + iris = datasets.load_iris() x_train, x_test, y_train, y_test = model_selection.train_test_split( iris.data, iris.target, test_size=0.2, random_state=42) diff --git a/tensorflow/examples/learn/text_classification.py b/tensorflow/examples/learn/text_classification.py index c3d00a11b91e42..7e10014c392b66 100644 --- a/tensorflow/examples/learn/text_classification.py +++ b/tensorflow/examples/learn/text_classification.py @@ -24,6 +24,7 @@ import pandas from sklearn import metrics import tensorflow as tf +from tensorflow.contrib.layers.python.layers import encoders learn = tf.contrib.learn @@ -37,7 +38,7 @@ def bag_of_words_model(features, target): """A bag-of-words model. Note it disregards the word order in the text.""" target = tf.one_hot(target, 15, 1, 0) - features = tf.contrib.layers.bow_encoder( + features = encoders.bow_encoder( features, vocab_size=n_words, embed_dim=EMBEDDING_SIZE) logits = tf.contrib.layers.fully_connected(features, 15, activation_fn=None) loss = tf.contrib.losses.softmax_cross_entropy(logits, target) diff --git a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb index cbcc54ce3cf5bc..016b21cd12477a 100644 --- a/tensorflow/examples/tutorials/deepdream/deepdream.ipynb +++ b/tensorflow/examples/tutorials/deepdream/deepdream.ipynb @@ -278,7 +278,7 @@ " tensor = n.attr['value'].tensor\n", " size = len(tensor.tensor_content)\n", " if size > max_const_size:\n", - " tensor.tensor_content = bytes(\"\"%size, 'utf-8')\n", + " tensor.tensor_content = bytes(\"\"%size)\n", " return strip_def\n", " \n", "def rename_nodes(graph_def, rename_func):\n", diff --git a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py index 25800c109ed9fc..f54a7c37a1eb36 100644 --- a/tensorflow/examples/tutorials/word2vec/word2vec_basic.py +++ 
b/tensorflow/examples/tutorials/word2vec/word2vec_basic.py @@ -62,7 +62,7 @@ def read_data(filename): vocabulary_size = 50000 -def build_dataset(words): +def build_dataset(words, vocabulary_size): count = [['UNK', -1]] count.extend(collections.Counter(words).most_common(vocabulary_size - 1)) dictionary = dict() @@ -81,7 +81,7 @@ def build_dataset(words): reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys())) return data, count, dictionary, reverse_dictionary -data, count, dictionary, reverse_dictionary = build_dataset(words) +data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size) del words # Hint to reduce memory. print('Most common words (+UNK)', count[:5]) print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) @@ -181,7 +181,7 @@ def generate_batch(batch_size, num_skips, skip_window): valid_embeddings, normalized_embeddings, transpose_b=True) # Add variable initializer. - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() # Step 5: Begin training. num_steps = 100001 diff --git a/tensorflow/go/genop/generate.sh b/tensorflow/go/genop/generate.sh index b961f7200a098b..d791e39c409b5e 100644 --- a/tensorflow/go/genop/generate.sh +++ b/tensorflow/go/genop/generate.sh @@ -20,11 +20,17 @@ go get github.com/golang/protobuf/proto go get github.com/golang/protobuf/protoc-gen-go cd $(dirname $0) -TF_DIR=${GOPATH}/src/github.com/tensorflow/tensorflow -PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc" +for g in $(echo $GOPATH | sed "s/:/ /g"); do + TF_DIR="${g}/src/github.com/tensorflow/tensorflow" + PROTOC="${TF_DIR}/bazel-out/host/bin/external/protobuf/protoc" + if [ -x "${PROTOC}" ]; then + break + fi +done if [ ! -x "${PROTOC}" ] then + set +e PATH_PROTOC=$(which protoc) if [ ! 
-x "${PATH_PROTOC}" ] then @@ -34,6 +40,7 @@ then exit 1 fi PROTOC=$PATH_PROTOC + set -e fi # Ensure that protoc-gen-go is available in $PATH diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md index 26377ba0d2c42a..20eb6a8265c9af 100644 --- a/tensorflow/java/README.md +++ b/tensorflow/java/README.md @@ -110,7 +110,7 @@ libraries will need to be built from source. brew install swig ``` -3. [Configure](https://www.tensorflow.org/get_started/os_setup#configure_the_installation) +3. [Configure](https://www.tensorflow.org/install/install_sources#configure_the_installation) (e.g., enable GPU support) and build: ```sh @@ -120,8 +120,8 @@ libraries will need to be built from source. //tensorflow/java:libtensorflow_jni ``` -The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so`) will -be in `bazel-bin/tensorflow/java`. +The JAR (`libtensorflow.jar`) and native library (`libtensorflow_jni.so` on Linux or `libtensorflow_jni.dylib` on OS X) will +be in `bazel-bin/tensorflow/java`. To use these artifacts, follow steps 3 and 4 in the [quickstart](#quickstart) section to get your application up and running. ### Maven diff --git a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java index c3938fe23fa841..b4591dd8692e65 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java +++ b/tensorflow/java/src/main/java/org/tensorflow/SavedModelBundle.java @@ -27,7 +27,8 @@ public class SavedModelBundle implements AutoCloseable { /** - * Load a saved model from an export directory. + * Load a saved model from an export directory. The model that is being loaded should be created using + * the Saved Model API. * * @param exportDir the directory path containing a saved model. * @param tags the tags identifying the specific metagraphdef to load. 
diff --git a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java index efd6c81b30c43e..692de2289d62ff 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/Tensor.java +++ b/tensorflow/java/src/main/java/org/tensorflow/Tensor.java @@ -172,8 +172,7 @@ public static Tensor create(long[] shape, LongBuffer data) { * *

Creates a Tensor with the provided shape of any type where the tensor's data has been * encoded into {@code data} as per the specification of the TensorFlow C - * API. + * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API. * * @param dataType the tensor datatype. * @param shape the tensor shape. diff --git a/tensorflow/java/src/main/java/org/tensorflow/package-info.java b/tensorflow/java/src/main/java/org/tensorflow/package-info.java index 3b7b8079f90812..dd4859e1b14045 100644 --- a/tensorflow/java/src/main/java/org/tensorflow/package-info.java +++ b/tensorflow/java/src/main/java/org/tensorflow/package-info.java @@ -19,8 +19,8 @@ *

WARNING: The API is currently experimental and is not covered by TensorFlow API stability * guarantees. See README.md - * for installation instructions. + * href="https://www.tensorflow.org/code/tensorflow/java/README.md">README.md for installation + * instructions. * *

The LabelImage diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py index cc315556905914..038dc4147ab49d 100644 --- a/tensorflow/python/client/session.py +++ b/tensorflow/python/client/session.py @@ -711,14 +711,14 @@ def run(self, fetches, feed_dict=None, options=None, run_metadata=None): # v is the numpy array [10, 20] # 'fetches' can be a list. v = session.run([a, b]) - # v a Python list with 2 numpy arrays: the numpy array [10, 20] and the + # v is a Python list with 2 numpy arrays: the 1-D array [10, 20] and the # 1-D array [1.0, 2.0] # 'fetches' can be arbitrary lists, tuples, namedtuple, dicts: MyData = collections.namedtuple('MyData', ['a', 'b']) v = session.run({'k1': MyData(a, b), 'k2': [b, a]}) # v is a dict with - # v['k1'] is a MyData namedtuple with 'a' the numpy array [10, 20] and - # 'b' the numpy array [1.0, 2.0] + # v['k1'] is a MyData namedtuple with 'a' (the numpy array [10, 20]) and + # 'b' (the numpy array [1.0, 2.0]) # v['k2'] is a list with the numpy array [1.0, 2.0] and the numpy array # [10, 20]. 
``` diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 84bcd8e701a80c..952c4adbfa3439 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -15,6 +15,7 @@ exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow:tensorflow.bzl", "if_not_windows") py_library( name = "debug_py", @@ -33,11 +34,12 @@ py_library( py_library( name = "debug_pip", deps = [ - ":debug_examples", ":debug_py", ":offline_analyzer", ":session_debug_testlib", - ], + ] + if_not_windows([ + ":debug_examples", + ]), ) py_library( diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py index bce7e30b68fa4c..71230ba00058e1 100644 --- a/tensorflow/python/kernel_tests/tensordot_op_test.py +++ b/tensorflow/python/kernel_tests/tensordot_op_test.py @@ -84,9 +84,7 @@ def test_invalid_axes(self): b_ph: b, axes_ph: axes_value}) - def test_no_partial_shape_inference(self): - # If one of the shapes is only partially defined, the output shape is - # unknown. 
+ def test_partial_shape_inference(self): a = array_ops.placeholder(dtypes.float32) b = array_ops.placeholder(dtypes.float32) axes = ([1], [0]) @@ -95,13 +93,21 @@ def test_no_partial_shape_inference(self): a.set_shape([None, 2]) b.set_shape([2, 3]) output = math_ops.tensordot(a, b, axes) - self.assertEqual(output.get_shape().ndims, None) + output_shape = output.get_shape() + self.assertEqual(output_shape.ndims, 2) + output_shape = output_shape.as_list() + self.assertEqual(output_shape[0], None) + self.assertEqual(output_shape[1], 3) a = array_ops.placeholder(dtypes.float32) b = array_ops.placeholder(dtypes.float32) a.set_shape([2, 2]) b.set_shape([2, None]) output = math_ops.tensordot(a, b, axes) - self.assertEqual(output.get_shape().ndims, None) + output_shape = output.get_shape() + self.assertEqual(output_shape.ndims, 2) + output_shape = output_shape.as_list() + self.assertEqual(output_shape[0], 2) + self.assertEqual(output_shape[1], None) def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): diff --git a/tensorflow/python/layers/pooling.py b/tensorflow/python/layers/pooling.py index 2601c61c47d8e2..3e40423ad638f4 100644 --- a/tensorflow/python/layers/pooling.py +++ b/tensorflow/python/layers/pooling.py @@ -294,7 +294,7 @@ class AveragePooling2D(_Pooling2D): data_format: A string. The ordering of the dimensions in the inputs. `channels_last` (default) and `channels_first` are supported. `channels_last` corresponds to inputs with shape - `(batch, height, channels, width)` while `channels_first` corresponds to + `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, height, width)`. name: A string, the name of the layer. """ @@ -329,7 +329,7 @@ def average_pooling2d(inputs, data_format: A string. The ordering of the dimensions in the inputs. `channels_last` (default) and `channels_first` are supported. 
`channels_last` corresponds to inputs with shape - `(batch, height, channels, width)` while `channels_first` corresponds to + `(batch, height, width, channels)` while `channels_first` corresponds to inputs with shape `(batch, channels, height, width)`. name: A string, the name of the layer. diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index f9fd5d77c99c90..c4a27009c3c3ce 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -275,7 +275,7 @@ def exit(data, name=None): def switch(data, pred, dtype=None, name=None): """Forwards `data` to an output determined by `pred`. - If `pred` is true, the `data` input is forwared to the first output. + If `pred` is false, the `data` input is forwarded to the first output. Otherwise, the data goes to the second output. This op handles `Tensor`s and `IndexedSlices`. @@ -323,7 +323,7 @@ def switch(data, pred, dtype=None, name=None): def _SwitchRefOrTensor(data, pred, name="Switch"): """Forwards `data` to an output determined by `pred`. - If `pred` is true, the `data` input is forwared to the first output. + If `pred` is false, the `data` input is forwarded to the first output. Otherwise, the data goes to the second output. This op handles `Tensor`s and `IndexedSlices`. diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 62072e127916b2..0a2d4e4792c385 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -90,22 +90,23 @@ def _is_tensor(x): return isinstance(x, (ops.Tensor, variables.Variable)) -def _ImageDimensions(image): +def _ImageDimensions(image, rank): """Returns the dimensions of an image tensor. Args: - image: A 3-D Tensor of shape `[height, width, channels]`. + image: A rank-D Tensor. For 3-D of shape: `[height, width, channels]`. 
+ rank: The expected rank of the image Returns: - A list of `[height, width, channels]` corresponding to the dimensions of the + A list corresponding to the dimensions of the input image. Dimensions that are statically known are python integers, otherwise they are integer scalar tensors. """ if image.get_shape().is_fully_defined(): return image.get_shape().as_list() else: - static_shape = image.get_shape().with_rank(3).as_list() - dynamic_shape = array_ops.unstack(array_ops.shape(image), 3) + static_shape = image.get_shape().with_rank(rank).as_list() + dynamic_shape = array_ops.unstack(array_ops.shape(image), rank) return [s if s is not None else d for s, d in zip(static_shape, dynamic_shape)] @@ -144,22 +145,39 @@ def _Check3DImage(image, require_static=True): return [] -def _CheckAtLeast3DImage(image): +def _CheckAtLeast3DImage(image, require_static=True): """Assert that we are working with properly shaped image. Args: image: >= 3-D Tensor of size [*, height, width, depth] + require_static: If `True`, requires that all dimensions of `image` are + known and non-zero. Raises: ValueError: if image.shape is not a [>= 3] vector. + + Returns: + An empty list, if `image` has fully defined dimensions. Otherwise, a list + containing an assert op is returned. 
""" - if not image.get_shape().is_fully_defined(): + try: + if image.get_shape().ndims is None: + image_shape = image.get_shape().with_rank(3) + else: + image_shape = image.get_shape().with_rank_at_least(3) + except ValueError: + raise ValueError("'image' must be at least three-dimensional.") + if require_static and not image_shape.is_fully_defined(): raise ValueError('\'image\' must be fully defined.') - if image.get_shape().ndims < 3: - raise ValueError('\'image\' must be at least three-dimensional.') - if not all(x > 0 for x in image.get_shape()): + if any(x == 0 for x in image_shape): raise ValueError('all dims of \'image.shape\' must be > 0: %s' % - image.get_shape()) + image_shape) + if not image_shape.is_fully_defined(): + return [check_ops.assert_positive(array_ops.shape(image), + ["all dims of 'image.shape' " + "must be > 0."])] + else: + return [] def fix_image_flip_shape(image, result): @@ -397,14 +415,18 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, `target_height` by `target_width`. Args: - image: 3-D tensor with shape `[height, width, channels]` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. offset_height: Number of rows of zeros to add on top. offset_width: Number of columns of zeros to add on the left. target_height: Height of output image. target_width: Width of output image. 
Returns: - 3-D tensor of shape `[target_height, target_width, channels]` + If `image` was 4-D, a 4-D float Tensor of shape + `[batch, target_height, target_width, channels]` + If `image` was 3-D, a 3-D float Tensor of shape + `[target_height, target_width, channels]` Raises: ValueError: If the shape of `image` is incompatible with the `offset_*` or @@ -414,9 +436,22 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, image = ops.convert_to_tensor(image, name='image') assert_ops = [] - assert_ops += _Check3DImage(image, require_static=False) + assert_ops += _CheckAtLeast3DImage(image, require_static=False) + + is_batch = True + image_shape = image.get_shape() + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') + + batch, height, width, depth = _ImageDimensions(image, rank=4) - height, width, depth = _ImageDimensions(image) after_padding_width = target_width - offset_width - width after_padding_height = target_height - offset_height - height @@ -433,15 +468,18 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, # Do not pad on the depth dimensions. 
paddings = array_ops.reshape( array_ops.stack([ - offset_height, after_padding_height, offset_width, + 0, 0, offset_height, after_padding_height, offset_width, after_padding_width, 0, 0 - ]), [3, 2]) + ]), [4, 2]) padded = array_ops.pad(image, paddings) padded_shape = [None if _is_tensor(i) else i - for i in [target_height, target_width, depth]] + for i in [batch, target_height, target_width, depth]] padded.set_shape(padded_shape) + if not is_batch: + padded = array_ops.squeeze(padded, squeeze_dims=[0]) + return padded @@ -455,7 +493,8 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, `offset_height + target_height, offset_width + target_width`. Args: - image: 3-D tensor with shape `[height, width, channels]` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. offset_height: Vertical coordinate of the top-left corner of the result in the input. offset_width: Horizontal coordinate of the top-left corner of the result in @@ -464,7 +503,10 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, target_width: Width of the result. 
Returns: - 3-D tensor of image with shape `[target_height, target_width, channels]` + If `image` was 4-D, a 4-D float Tensor of shape + `[batch, target_height, target_width, channels]` + If `image` was 3-D, a 3-D float Tensor of shape + `[target_height, target_width, channels]` Raises: ValueError: If the shape of `image` is incompatible with the `offset_*` or @@ -474,9 +516,21 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, image = ops.convert_to_tensor(image, name='image') assert_ops = [] - assert_ops += _Check3DImage(image, require_static=False) + assert_ops += _CheckAtLeast3DImage(image, require_static=False) + + is_batch = True + image_shape = image.get_shape() + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') - height, width, depth = _ImageDimensions(image) + batch, height, width, depth = _ImageDimensions(image, rank=4) assert_ops += _assert(offset_width >= 0, ValueError, 'offset_width must be >= 0.') @@ -493,13 +547,16 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, image = control_flow_ops.with_dependencies(assert_ops, image) cropped = array_ops.slice(image, - array_ops.stack([offset_height, offset_width, 0]), - array_ops.stack([target_height, target_width, -1])) + array_ops.stack([0, offset_height, offset_width, 0]), + array_ops.stack([-1, target_height, target_width, -1])) cropped_shape = [None if _is_tensor(i) else i - for i in [target_height, target_width, depth]] + for i in [batch, target_height, target_width, depth]] cropped.set_shape(cropped_shape) + if not is_batch: + cropped = array_ops.squeeze(cropped, squeeze_dims=[0]) + return cropped @@ -516,7 +573,8 @@ def resize_image_with_crop_or_pad(image, target_height, target_width): 
dimension. Args: - image: 3-D tensor of shape `[height, width, channels]` + image: 4-D Tensor of shape `[batch, height, width, channels]` or + 3-D Tensor of shape `[height, width, channels]`. target_height: Target height. target_width: Target width. @@ -524,13 +582,27 @@ def resize_image_with_crop_or_pad(image, target_height, target_width): ValueError: if `target_height` or `target_width` are zero or negative. Returns: - Cropped and/or padded image of shape - `[target_height, target_width, channels]` + Cropped and/or padded image. + If `images` was 4-D, a 4-D float Tensor of shape + `[batch, new_height, new_width, channels]`. + If `images` was 3-D, a 3-D float Tensor of shape + `[new_height, new_width, channels]`. """ image = ops.convert_to_tensor(image, name='image') + image_shape = image.get_shape() + is_batch = True + if image_shape.ndims == 3: + is_batch = False + image = array_ops.expand_dims(image, 0) + elif image_shape.ndims is None: + is_batch = False + image = array_ops.expand_dims(image, 0) + image.set_shape([None] * 4) + elif image_shape.ndims != 4: + raise ValueError('\'image\' must have either 3 or 4 dimensions.') assert_ops = [] - assert_ops += _Check3DImage(image, require_static=False) + assert_ops += _CheckAtLeast3DImage(image, require_static=False) assert_ops += _assert(target_width > 0, ValueError, 'target_width must be > 0.') assert_ops += _assert(target_height > 0, ValueError, @@ -563,7 +635,7 @@ def equal_(x, y): else: return x == y - height, width, _ = _ImageDimensions(image) + _, height, width, _ = _ImageDimensions(image, rank=4) width_diff = target_width - width offset_crop_width = max_(-width_diff // 2, 0) offset_pad_width = max_(width_diff // 2, 0) @@ -585,7 +657,7 @@ def equal_(x, y): if resized.get_shape().ndims is None: raise ValueError('resized contains no shape.') - resized_height, resized_width, _ = _ImageDimensions(resized) + _, resized_height, resized_width, _ = _ImageDimensions(resized, rank=4) assert_ops = [] assert_ops += 
_assert(equal_(resized_height, target_height), ValueError, @@ -594,6 +666,10 @@ def equal_(x, y): 'resized width is not correct.') resized = control_flow_ops.with_dependencies(assert_ops, resized) + + if not is_batch: + resized = array_ops.squeeze(resized, squeeze_dims=[0]) + return resized diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index c8691f4eb89a5a..799f7e4935d789 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -299,7 +299,7 @@ def _adjustHueNp(self, x_np, delta_h): return y_v.reshape(x_np.shape) def _adjustHueTf(self, x_np, delta_h): - with self.test_session(use_gpu=False): + with self.test_session(use_gpu=True): x = constant_op.constant(x_np) y = image_ops.adjust_hue(x, delta_h) y_tf = y.eval() @@ -1185,9 +1185,13 @@ def testNon3DInput(self): offset_height, offset_width = [0, 0] target_height, target_width = [2, 2] - for x_shape in ([1, 3, 5, 1], [3, 5]): + for x_shape in ([3, 5],): + self._assertRaises(x, x_shape, offset_height, offset_width, target_height, + target_width, "'image' must be at least three-dimensional.") + + for x_shape in ([1, 3, 5, 1, 1],): self._assertRaises(x, x_shape, offset_height, offset_width, target_height, - target_width, "must be three-dimensional") + target_width, "'image' must have either 3 or 4 dimensions.") def testZeroLengthInput(self): # Input image has 0-length dimension(s). 
@@ -1430,9 +1434,13 @@ def testNon3DInput(self): offset_height, offset_width = [0, 0] target_height, target_width = [2, 2] - for x_shape in ([1, 3, 5, 1], [3, 5]): + for x_shape in ([3, 5],): self._assertRaises(x, x_shape, offset_height, offset_width, target_height, - target_width, "must be three-dimensional") + target_width, "'image' must be at least three-dimensional") + + for x_shape in ([1, 3, 5, 1, 1],): + self._assertRaises(x, x_shape, offset_height, offset_width, target_height, + target_width, "'image' must have either 3 or 4 dimensions.") def testZeroLengthInput(self): # Input image has 0-length dimension(s). @@ -2220,9 +2228,13 @@ def testNon3DInput(self): x = [0] * 15 target_height, target_width = [4, 4] - for x_shape in ([1, 3, 5, 1], [3, 5]): + for x_shape in ([3, 5],): + self._assertRaises(x, x_shape, target_height, target_width, + "'image' must have either 3 or 4 dimensions.") + + for x_shape in ([1, 3, 5, 1, 1],): self._assertRaises(x, x_shape, target_height, target_width, - "must be three-dimensional") + "'image' must have either 3 or 4 dimensions.") def testZeroLengthInput(self): # Input image has 0-length dimension(s). diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index d3d954f33d67f9..fe4a47b9ae2c22 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -2298,12 +2298,14 @@ def _tensordot_reshape(a, axes, flipped=False): assumes that `a` is the second argument in the contraction operation. Returns: - A pair `(reshaped_a, free_dims)` where `reshaped_a` is the tensor `a` - reshaped to allow contraction via `matmul` and `free_dims` is either a - list of integers or an `int32` `Tensor`, depending on if `axes` is a list - and the shape of `a` is fully defined. 
+ A tuple `(reshaped_a, free_dims, free_dims_static)` where `reshaped_a` is + the tensor `a` reshaped to allow contraction via `matmul`, `free_dims` is + either a list of integers or an `int32` `Tensor`, depending on whether + the shape of a is fully specified, and free_dims_static is either a list + of integers and None values, or None, representing the inferred + static shape of the free dimensions + """ - # TODO(b/33084409): Implement partial shape inference. if a.get_shape().is_fully_defined() and isinstance(axes, (list, tuple)): shape_a = a.get_shape().as_list() axes = [i if i >= 0 else i + len(shape_a) for i in axes] @@ -2314,8 +2316,15 @@ def _tensordot_reshape(a, axes, flipped=False): perm = list(axes) + free if flipped else free + list(axes) new_shape = [prod_axes, prod_free] if flipped else [prod_free, prod_axes] reshaped_a = array_ops.reshape(array_ops.transpose(a, perm), new_shape) - return reshaped_a, free_dims + return reshaped_a, free_dims, free_dims else: + if a.get_shape().ndims is not None and isinstance(axes, (list, tuple)): + shape_a = a.get_shape().as_list() + axes = [i if i >= 0 else i + len(shape_a) for i in axes] + free = [i for i in xrange(len(shape_a)) if i not in axes] + free_dims_static = [shape_a[i] for i in free] + else: + free_dims_static = None shape_a = array_ops.shape(a) rank_a = array_ops.rank(a) axes = ops.convert_to_tensor(axes, dtype=dtypes.int32, name="axes") @@ -2334,7 +2343,7 @@ def _tensordot_reshape(a, axes, flipped=False): perm = array_ops.concat([free, axes], 0) new_shape = array_ops.stack([prod_free_dims, prod_axes_dims]) reshaped_a = array_ops.reshape(array_ops.transpose(a, perm), new_shape) - return reshaped_a, free_dims + return reshaped_a, free_dims, free_dims_static def _tensordot_axes(a, axes): """Generates two sets of contraction axes for the two tensor arguments.""" @@ -2366,16 +2375,19 @@ def _tensordot_axes(a, axes): a = ops.convert_to_tensor(a, name="a") b = ops.convert_to_tensor(b, name="b") a_axes, b_axes = 
_tensordot_axes(a, axes) - a_reshape, a_free_dims = _tensordot_reshape(a, a_axes) - b_reshape, b_free_dims = _tensordot_reshape(b, b_axes, True) + a_reshape, a_free_dims, a_free_dims_static = _tensordot_reshape(a, a_axes) + b_reshape, b_free_dims, b_free_dims_static = _tensordot_reshape(b, b_axes, True) ab_matmul = matmul(a_reshape, b_reshape) if isinstance(a_free_dims, list) and isinstance(b_free_dims, list): return array_ops.reshape(ab_matmul, a_free_dims + b_free_dims, name=name) else: - a_free_dims = ops.convert_to_tensor(a_free_dims) - b_free_dims = ops.convert_to_tensor(b_free_dims) - return array_ops.reshape( + a_free_dims = ops.convert_to_tensor(a_free_dims, dtype=dtypes.int32) + b_free_dims = ops.convert_to_tensor(b_free_dims, dtype=dtypes.int32) + product = array_ops.reshape( ab_matmul, array_ops.concat([a_free_dims, b_free_dims], 0), name=name) + if a_free_dims_static is not None and b_free_dims_static is not None: + product.set_shape(a_free_dims_static + b_free_dims_static) + return product # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py index c267fb8ccdd974..bdb34dd78e64f5 100644 --- a/tensorflow/python/ops/metrics_impl.py +++ b/tensorflow/python/ops/metrics_impl.py @@ -1473,7 +1473,7 @@ def false_negatives(labels, predictions, weights=None, metrics_collections=None, updates_collections=None, name=None): - """Computes the total number of false positives. + """Computes the total number of false negatives. If `weights` is `None`, weights default to 1. Use weights of 0 to mask values. diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 51ec1c313b4eea..4a8ac42161c88d 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -278,7 +278,8 @@ def combined_op(converted_input, num_spatial_dims, _): For N=3, the valid values are "NDHWC" (default) and "NCDHW". 
Returns: - The output Tensor as described above. + The output Tensor as described above, dimensions will vary based on the op + provided. Raises: ValueError: if `padding` is invalid or the arguments are incompatible. @@ -529,17 +530,16 @@ def convolution(input, filter, # pylint: disable=redefined-builtin of N `strides` (defaulting [1]*N), this computes for each N-D spatial output position (x[0], ..., x[N-1]): + ``` output[b, x[0], ..., x[N-1], k] = - sum_{z[0], ..., z[N-1], q} - filter[z[0], ..., z[N-1], q, k] * padded_input[b, x[0]*strides[0] + dilation_rate[0]*z[0], ..., x[N-1]*strides[N-1] + dilation_rate[N-1]*z[N-1], q] - + ``` where `padded_input` is obtained by zero padding the input using an effective spatial filter shape of `(spatial_filter_shape-1) * dilation_rate + 1` and output striding `strides` as described in the @@ -682,6 +682,7 @@ def pool(input, # pylint: disable=redefined-builtin 0 <= x[i] < output_spatial_shape[i], 0 <= c < num_channels: + ``` output[b, x[0], ..., x[N-1], c] = REDUCE_{z[0], ..., z[N-1]} input[b, @@ -689,6 +690,7 @@ def pool(input, # pylint: disable=redefined-builtin ... 
x[N-1]*strides[N-1] - pad_before[N-1] + dilation_rate[N-1]*z[N-1], c], + ``` where the reduction function REDUCE depends on the value of `pooling_type`, and pad_before is defined based on the value of `padding` as described in the @@ -698,10 +700,12 @@ def pool(input, # pylint: disable=redefined-builtin In the case that `data_format` starts with `"NC"`, the `input` and output are simply transposed as follows: + ``` pool(input, data_format, **kwargs) = tf.transpose(pool(tf.transpose(input, [0] + range(2,N+2) + [1]), **kwargs), [0, N+1] + range(1, N+1)) + ``` Args: input: Tensor of rank N+2, of shape @@ -740,6 +744,7 @@ def pool(input, # pylint: disable=redefined-builtin If padding = "SAME": output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i]) + If padding = "VALID": output_spatial_shape[i] = ceil((input_spatial_shape[i] - (window_shape[i] - 1) * dilation_rate[i]) @@ -844,9 +849,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None): More specifically: - output[b, i, j, k] = sum_{di, dj, q} filters[di, dj, q, k] * - value[b, i + rate * di, j + rate * dj, q] - + ``` + output[batch, height, width, out_channel] = + sum_{dheight, dwidth, in_channel} ( + filters[dheight, dwidth, in_channel, out_channel] * + value[batch, height + rate * dheight, width + rate * dwidth, in_channel] + ) + ``` + Atrous convolution allows us to explicitly control how densely to compute feature responses in fully convolutional networks. Used in conjunction with bilinear interpolation, it offers an alternative to `conv2d_transpose` in @@ -932,6 +942,14 @@ def atrous_conv2d(value, filters, rate, padding, name=None): Returns: A `Tensor` with the same type as `value`. + Output shape with `'VALID'` padding is: + + [batch, height - 2 * (filter_width - 1), + width - 2 * (filter_height - 1), out_channels]. + + Output shape with `'SAME'` padding is: + + [batch, height, width, out_channels]. 
Raises: ValueError: If input/output depth does not match `filters`' shape, or if diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py index 076c6d41d94b72..c3dddf85f3d719 100644 --- a/tensorflow/python/ops/rnn_cell_impl.py +++ b/tensorflow/python/ops/rnn_cell_impl.py @@ -13,7 +13,12 @@ # limitations under the License. # ============================================================================== -"""Module implementing RNN Cells.""" +"""Module implementing RNN Cells. + +This module contains the abstract definition of an RNN cell: `_RNNCell`. +Actual implementations of various types of RNN cells are located in +`tensorflow.contrib`. +""" from __future__ import absolute_import from __future__ import division @@ -72,10 +77,12 @@ def _zero_state_tensors(state_size, batch_size, dtype): class _RNNCell(object): """Abstract object representing an RNN cell. - The definition of cell in this package differs from the definition used in the - literature. In the literature, cell refers to an object with a single scalar - output. The definition in this package refers to a horizontal array of such - units. + Every `RNNCell` must have the properties below and implement `__call__` with + the following signature. + + This definition of cell differs from the definition used in the literature. + In the literature, 'cell' refers to an object with a single scalar output. + This definition refers to a horizontal array of such units. An RNN cell, in the most abstract setting, is anything that has a state and performs some operation that takes a matrix of inputs. @@ -84,13 +91,6 @@ class _RNNCell(object): state matrix with `self.state_size` columns. If `self.state_size` is a tuple of integers, then it results in a tuple of `len(state_size)` state matrices, each with a column size corresponding to values in `state_size`.
- - This module provides a number of basic commonly used RNN cells, such as - LSTM (Long Short Term Memory) or GRU (Gated Recurrent Unit), and a number - of operators that allow add dropouts, projections, or embeddings for inputs. - Constructing multi-layer cells is supported by the class `MultiRNNCell`, - or by calling the `rnn` ops several times. Every `RNNCell` must have the - properties below and implement `__call__` with the following signature. """ def __call__(self, inputs, state, scope=None): @@ -140,7 +140,7 @@ def zero_state(self, batch_size, dtype): If `state_size` is a nested list or tuple, then the return value is a nested list or tuple (of the same structure) of `2-D` tensors with - the shapes `[batch_size x s]` for each s in `state_size`. + the shapes `[batch_size x s]` for each s in `state_size`. """ with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): state_size = self.state_size diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py index 70ecda1dda564e..335fd110e7a45d 100644 --- a/tensorflow/python/platform/tf_logging.py +++ b/tensorflow/python/platform/tf_logging.py @@ -37,6 +37,7 @@ # Determine whether we are in an interactive environment +_interactive = False try: # This is only defined in interactive shells if _sys.ps1: _interactive = True diff --git a/tensorflow/tensorboard/README.md b/tensorflow/tensorboard/README.md index b9addd4b68b80f..0c21bb508ff5ce 100644 --- a/tensorflow/tensorboard/README.md +++ b/tensorflow/tensorboard/README.md @@ -91,7 +91,7 @@ produce a consistent history of what happened. ### Runs: Comparing different executions of your model You may want to visually compare multiple executions of your model; for example, -suppose you've changed the hyperparameters and want to see if its converging +suppose you've changed the hyperparameters and want to see if it's converging faster. TensorBoard enables this through different "runs". 
When TensorBoard is passed a `logdir` at startup, it recursively walks the directory tree rooted at `logdir` looking for subdirectories that contain tfevents data. Every time it diff --git a/tensorflow/tensorboard/defs.bzl b/tensorflow/tensorboard/defs.bzl index 7ad97f91f863e5..bae7078c5b5ba7 100644 --- a/tensorflow/tensorboard/defs.bzl +++ b/tensorflow/tensorboard/defs.bzl @@ -36,7 +36,7 @@ def tensorboard_typescript_genrule(name, srcs, typings=[], **kwargs): # data attribute won't be considered when --genrule_strategy=sandboxed. See # https://github.com/bazelbuild/bazel/issues/1147 and its linked issues. data = [ - "@org_nodejs//:bin/node", + "@org_nodejs", "@com_microsoft_typescript", ] native.genrule( diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 471a2173aa6e9e..aebdfed837bac8 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -124,6 +124,7 @@ def tf_copts(): "/DLANG_CXX11", "/D__VERSION__=\\\"MSVC\\\"", "/DPLATFORM_WINDOWS", + "/DTF_COMPILE_LIBRARY", "/DEIGEN_HAS_C99_MATH", "/DTENSORFLOW_USE_EIGEN_THREADPOOL", ], @@ -392,7 +393,7 @@ def tf_cc_tests(srcs, deps, name='', linkstatic=0, tags=[], size="medium", def tf_cc_test_mkl(srcs, deps, name='', linkstatic=0, tags=[], size="medium", args=None): - tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args) + if_mkl(tf_cc_tests(srcs, deps, linkstatic, tags=tags, size=size, args=args)) def tf_cc_tests_gpu(srcs, deps, name='', linkstatic=0, tags=[], size="medium", args=None): diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD index 3b1901fd567921..a2ffca97ecbbb8 100644 --- a/tensorflow/tools/benchmark/BUILD +++ b/tensorflow/tools/benchmark/BUILD @@ -34,6 +34,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:framework", "//tensorflow/core:framework_internal", + "//tensorflow/core:framework_lite", "//tensorflow/core:protos_all_cc", "//tensorflow/core:tensorflow", "//tensorflow/core:test", diff --git 
a/tensorflow/tools/benchmark/benchmark_model.cc b/tensorflow/tools/benchmark/benchmark_model.cc index 0fcfaf747b4532..db2ac31bafe8ca 100644 --- a/tensorflow/tools/benchmark/benchmark_model.cc +++ b/tensorflow/tools/benchmark/benchmark_model.cc @@ -36,6 +36,7 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/session.h" #include "tensorflow/core/util/command_line_flags.h" @@ -272,7 +273,11 @@ Status TimeMultipleRuns(double sleep_seconds, int num_runs, // This can be helpful to determine the effect of mobile processor // scaling and thermal throttling. if (sleep_seconds > 0.0) { +#ifdef PLATFORM_WINDOWS + Sleep(sleep_seconds * 1000); +#else nanosleep(&req, nullptr); +#endif } } std::stringstream stream; diff --git a/tensorflow/tools/ci_build/Dockerfile.android b/tensorflow/tools/ci_build/Dockerfile.android index 887589bc93d9b8..4d46c672ab5cc7 100644 --- a/tensorflow/tools/ci_build/Dockerfile.android +++ b/tensorflow/tools/ci_build/Dockerfile.android @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:14.04 MAINTAINER Jan Prach @@ -10,9 +10,8 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \ RUN /install/install_deb_packages.sh RUN /install/install_bazel.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc # Install extra libraries for android sdk. 
RUN apt-get update && apt-get install -y \ diff --git a/tensorflow/tools/ci_build/Dockerfile.cmake b/tensorflow/tools/ci_build/Dockerfile.cmake index 8a28fe6cdf9595..22eaf11b91869b 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cmake +++ b/tensorflow/tools/ci_build/Dockerfile.cmake @@ -7,9 +7,10 @@ COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh RUN /install/install_deb_packages.sh +RUN apt-get update +RUN apt-get install -y --no-install-recommends python-pip RUN pip install --upgrade numpy # Install golang RUN add-apt-repository -y ppa:ubuntu-lxc/lxd-stable -RUN apt-get update RUN apt-get install -y golang diff --git a/tensorflow/tools/ci_build/Dockerfile.cpu b/tensorflow/tools/ci_build/Dockerfile.cpu index 8e0be14ca6409a..206108930a170d 100644 --- a/tensorflow/tools/ci_build/Dockerfile.cpu +++ b/tensorflow/tools/ci_build/Dockerfile.cpu @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:14.04 MAINTAINER Jan Prach @@ -15,6 +15,5 @@ RUN /install/install_buildifier.sh RUN /install/install_auditwheel.sh RUN /install/install_golang.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu index 79cf1844f277a2..b914f51918c898 100644 --- a/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu +++ b/tensorflow/tools/ci_build/Dockerfile.debian.jessie.cpu @@ -22,6 +22,5 @@ RUN /install/install_golang.sh # Fix a virtualenv install issue specific to Debian Jessie. RUN pip install --upgrade virtualenv -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. 
+COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.gpu b/tensorflow/tools/ci_build/Dockerfile.gpu index 1cf1e40404be02..68493965fa0ba5 100644 --- a/tensorflow/tools/ci_build/Dockerfile.gpu +++ b/tensorflow/tools/ci_build/Dockerfile.gpu @@ -1,7 +1,12 @@ -FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04 +FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04 MAINTAINER Jan Prach +# In the Ubuntu 14.04 images, cudnn is placed in system paths. Move them to +# /usr/local/cuda +RUN cp /usr/include/cudnn.h /usr/local/cuda/include +RUN cp /usr/lib/x86_64-linux-gnu/libcudnn* /usr/local/cuda/lib64 + # Copy and run the install scripts. COPY install/*.sh /install/ RUN /install/install_bootstrap_deb_packages.sh @@ -12,9 +17,8 @@ RUN /install/install_pip_packages.sh RUN /install/install_bazel.sh RUN /install/install_golang.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. +COPY install/.bazelrc /etc/bazel.bazelrc ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH # Configure the build for our CUDA configuration. diff --git a/tensorflow/tools/ci_build/Dockerfile.hadoop b/tensorflow/tools/ci_build/Dockerfile.hadoop index 7af9f38708f4f9..489493c26e4b4f 100644 --- a/tensorflow/tools/ci_build/Dockerfile.hadoop +++ b/tensorflow/tools/ci_build/Dockerfile.hadoop @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:14.04 MAINTAINER Jonathan Hseu @@ -14,6 +14,5 @@ RUN /install/install_proto3.sh RUN /install/install_buildifier.sh RUN /install/install_hadoop.sh -# Set up bazelrc. -COPY install/.bazelrc /root/.bazelrc -ENV BAZELRC /root/.bazelrc +# Set up the master bazelrc configuration file. 
+COPY install/.bazelrc /etc/bazel.bazelrc diff --git a/tensorflow/tools/ci_build/Dockerfile.tensorboard b/tensorflow/tools/ci_build/Dockerfile.tensorboard index 12b8aa18dae583..9795872e2c4907 100644 --- a/tensorflow/tools/ci_build/Dockerfile.tensorboard +++ b/tensorflow/tools/ci_build/Dockerfile.tensorboard @@ -1,4 +1,4 @@ -FROM ubuntu:16.04 +FROM ubuntu:14.04 MAINTAINER Jan Prach diff --git a/tensorflow/tools/ci_build/README.md b/tensorflow/tools/ci_build/README.md index 4b7858ca894137..1fa618e698fa22 100644 --- a/tensorflow/tools/ci_build/README.md +++ b/tensorflow/tools/ci_build/README.md @@ -20,20 +20,20 @@ run continuous integration [ci.tensorflow.org](https://ci.tensorflow.org). 2. Clone tensorflow repository. ```bash -git clone https://github.com/tensorflow/tensorflow.git -``` + git clone https://github.com/tensorflow/tensorflow.git + ``` 3. Go to tensorflow directory ```bash -cd tensorflow -``` + cd tensorflow + ``` 4. Build what you want, for example ```bash -tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... -``` + tensorflow/tools/ci_build/ci_build.sh CPU bazel test //tensorflow/... + ``` diff --git a/tensorflow/tools/ci_build/builds/run_pip_tests.sh b/tensorflow/tools/ci_build/builds/run_pip_tests.sh index be076cd4c034bc..10bed0b786b122 100755 --- a/tensorflow/tools/ci_build/builds/run_pip_tests.sh +++ b/tensorflow/tools/ci_build/builds/run_pip_tests.sh @@ -104,28 +104,26 @@ export TF_NEED_CUDA=$IS_GPU yes "" | ./configure # Figure out how many concurrent tests we can run and do run the tests. +BAZEL_PARALLEL_TEST_FLAGS="" if [[ $IS_GPU == 1 ]]; then # Number of test threads is the number of GPU cards available. if [[ $IS_MAC == 1 ]]; then - PAR_TEST_JOBS=1 + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=1" else PAR_TEST_JOBS=$TF_GPU_COUNT + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=${TF_GPU_COUNT} \ + --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute" fi - - # Actually run the tests. 
- bazel test ${BAZEL_FLAGS} --local_test_jobs=${PAR_TEST_JOBS} \ - --run_under=//tensorflow/tools/ci_build/gpu_build:parallel_gpu_execute \ - -- ${BAZEL_TEST_TARGETS} - else # Number of test threads is the number of physical CPUs. if [[ $IS_MAC == 1 ]]; then - PAR_TEST_JOBS=$(sysctl -n hw.ncpu) + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(sysctl -n hw.ncpu)" else - PAR_TEST_JOBS=$(grep -c ^processor /proc/cpuinfo) + BAZEL_PARALLEL_TEST_FLAGS="--local_test_jobs=$(grep -c ^processor /proc/cpuinfo)" fi - - # Actually run the tests. - bazel test ${BAZEL_FLAGS} --local_test_jobs=${PAR_TEST_JOBS} \ - -- ${BAZEL_TEST_TARGETS} fi + +# Actually run the tests. +bazel test ${BAZEL_FLAGS} ${BAZEL_PARALLEL_TEST_FLAGS} -- \ + ${BAZEL_TEST_TARGETS} + diff --git a/tensorflow/tools/ci_build/ci_parameterized_build.sh b/tensorflow/tools/ci_build/ci_parameterized_build.sh index e1a312b858e924..cb204bc25f3d6f 100755 --- a/tensorflow/tools/ci_build/ci_parameterized_build.sh +++ b/tensorflow/tools/ci_build/ci_parameterized_build.sh @@ -19,7 +19,7 @@ # # The script obeys the following required environment variables: # TF_BUILD_CONTAINER_TYPE: (CPU | GPU | ANDROID | ANDROID_FULL) -# TF_BUILD_PYTHON_VERSION: (PYTHON2 | PYTHON3) +# TF_BUILD_PYTHON_VERSION: (PYTHON2 | PYTHON3 | PYTHON3.5) # TF_BUILD_IS_PIP: (NO_PIP | PIP | BOTH) # # The below environment variable is required, but will be deprecated together @@ -33,7 +33,8 @@ # ANDROID & PIP (Android and PIP builds are mutually exclusive) # # 2) TF_BUILD_PYTHON_VERSION is set to PYTHON3, the build will use the version -# pointed to by "which python3" on the system. +# pointed to by "which python3" on the system, which is typically python3.4. 
To +# build for python3.5, set the environment variable to PYTHON3.5 # # # Additionally, the script follows the directions of optional environment @@ -426,7 +427,9 @@ fi # Process Python version if [[ ${TF_BUILD_PYTHON_VERSION} == "python2" ]]; then : -elif [[ ${TF_BUILD_PYTHON_VERSION} == "python3" ]]; then +elif [[ ${TF_BUILD_PYTHON_VERSION} == "python3" || \ + ${TF_BUILD_PYTHON_VERSION} == "python3.4" || \ + ${TF_BUILD_PYTHON_VERSION} == "python3.5" ]]; then # Supply proper environment variable to select Python 3 if [[ "${DO_DOCKER}" == "1" ]]; then EXTRA_PARAMS="${EXTRA_PARAMS} -e CI_BUILD_PYTHON=${TF_BUILD_PYTHON_VERSION}" @@ -493,6 +496,30 @@ echo "" TMP_DIR="" DOCKERFILE_FLAG="" +if [[ "${TF_BUILD_PYTHON_VERSION}" == "python3.5" ]]; then + # Modify Dockerfile for Python3.5 build + TMP_DIR=$(mktemp -d) + echo "Docker build will occur in temporary directory: ${TMP_DIR}" + + # Copy the files required for the docker build + SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + cp -r "${SCRIPT_DIR}/install" "${TMP_DIR}/install" || \ + die "ERROR: Failed to copy directory ${SCRIPT_DIR}/install" + + DOCKERFILE="${SCRIPT_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}" + cp "${DOCKERFILE}" "${TMP_DIR}/" || \ + die "ERROR: Failed to copy Dockerfile at ${DOCKERFILE}" + DOCKERFILE="${TMP_DIR}/Dockerfile.${TF_BUILD_CONTAINER_TYPE}" + + # Replace a line in the Dockerfile + sed -i \ + 's/RUN \/install\/install_pip_packages.sh/RUN \/install\/install_python3.5_pip_packages.sh/g' \ + "${DOCKERFILE}" && \ + echo "Copied and modified Dockerfile for Python 3.5 build: ${DOCKERFILE}" || \ + die "ERROR: Faild to copy and modify Dockerfile: ${DOCKERFILE}" + + DOCKERFILE_FLAG="--dockerfile ${DOCKERFILE}" +fi chmod +x ${TMP_SCRIPT} diff --git a/tensorflow/tools/ci_build/ci_sanity.sh b/tensorflow/tools/ci_build/ci_sanity.sh index 44aaed8ae95398..9ecf16c46f12f0 100755 --- a/tensorflow/tools/ci_build/ci_sanity.sh +++ b/tensorflow/tools/ci_build/ci_sanity.sh @@ -105,7 +105,7 @@ 
do_pylint() { if [[ $1 == "PYTHON2" ]]; then PYLINT_BIN="python /usr/local/lib/python2.7/dist-packages/pylint/lint.py" elif [[ $1 == "PYTHON3" ]]; then - PYLINT_BIN="python3 /usr/local/lib/python3.5/dist-packages/pylint/lint.py" + PYLINT_BIN="python3 /usr/local/lib/python3.4/dist-packages/pylint/lint.py" else echo "Unrecognized python version (PYTHON2 | PYTHON3): $1" return 1 diff --git a/tensorflow/tools/ci_build/install/install_deb_packages.sh b/tensorflow/tools/ci_build/install/install_deb_packages.sh index 23dc6d42c49b41..a62a6f8a3c1dee 100755 --- a/tensorflow/tools/ci_build/install/install_deb_packages.sh +++ b/tensorflow/tools/ci_build/install/install_deb_packages.sh @@ -42,14 +42,14 @@ apt-get install -y --no-install-recommends \ openjdk-8-jre-headless \ pkg-config \ python-dev \ - python-pip \ + python-setuptools \ + python-virtualenv \ python3-dev \ - python3-pip \ + python3-setuptools \ rsync \ sudo \ swig \ unzip \ - virtualenv \ wget \ zip \ zlib1g-dev diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index 19c46bbcd4689d..8011f8de243fb8 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -16,56 +16,64 @@ set -e +# We don't apt-get install so that we can install a newer version of pip. Not +# needed after we upgrade to Ubuntu 16.04 +easy_install -U pip +easy_install3 -U pip + # Install pip packages from whl files to avoid the time-consuming process of # building from source. -pip install wheel +pip2 install wheel pip3 install wheel # Install six. -pip install --upgrade six==1.10.0 +pip2 install --upgrade six==1.10.0 pip3 install --upgrade six==1.10.0 # Install werkzeug. -pip install --upgrade werkzeug==0.11.10 +pip2 install --upgrade werkzeug==0.11.10 pip3 install --upgrade werkzeug==0.11.10 # Install protobuf. 
-pip install --upgrade protobuf==3.2.0 +pip2 install --upgrade protobuf==3.2.0 pip3 install --upgrade protobuf==3.2.0 # Remove obsolete version of six, which can sometimes confuse virtualenv. rm -rf /usr/lib/python3/dist-packages/six* -pip install --upgrade numpy==1.12.0 -pip3 install --upgrade numpy==1.12.0 +# numpy needs to be installed from source to fix segfaults. See: +# https://github.com/tensorflow/tensorflow/issues/6968 +# This workaround isn't needed for Ubuntu 16.04 or later. +pip2 install --no-binary=:all: --upgrade numpy==1.12.0 +pip3 install --no-binary=:all: --upgrade numpy==1.12.0 -pip install scipy==0.18.1 +pip2 install scipy==0.18.1 pip3 install scipy==0.18.1 -pip install scikit-learn==0.18.1 +pip2 install scikit-learn==0.18.1 pip3 install scikit-learn==0.18.1 # pandas required by tf.learn/inflow -pip install pandas==0.19.2 +pip2 install pandas==0.19.2 pip3 install pandas==0.19.2 # Benchmark tests require the following: -pip install psutil +pip2 install psutil pip3 install psutil -pip install py-cpuinfo +pip2 install py-cpuinfo pip3 install py-cpuinfo # pylint tests require the following: -pip install pylint +pip2 install pylint pip3 install pylint # pep8 tests require the following: -pip install pep8 +pip2 install pep8 pip3 install pep8 # tf.mock require the following for python2: -pip install mock +pip2 install mock -pip install portpicker +pip2 install portpicker pip3 install portpicker diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh new file mode 100755 index 00000000000000..e7e2d256cd98f1 --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh @@ -0,0 +1,91 @@ +#!/usr/bin/env bash +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Install packages required by Python3.5 build + +# TODO(cais): Remove this file once we upgrade to ubuntu:16.04 docker images for +# Python 3.5 builds. + +# fkrull/deadsnakes is for Python3.5 +add-apt-repository -y ppa:fkrull/deadsnakes +apt-get update + +set +e +# Upgrade swig to 3.0.8 +SWIG_VERSION="3.0.8" +swig_ver_flat=$(echo $SWIG_VERSION | sed 's/\.//g' | sed 's/^0*//g') +local_swig_ver=$(swig -version | grep -i version | awk '{print $3}') +local_swig_ver_flat=$(echo $local_swig_ver | sed 's/\.//g' | sed 's/^0*//g') +if [[ -z $local_swig_ver_flat ]]; then + local_swig_ver_flat=0 +fi +if (( $local_swig_ver_flat < $swig_ver_flat )); then + set -e + wget -q http://downloads.sourceforge.net/swig/swig-3.0.8.tar.gz + tar xzf swig-3.0.8.tar.gz + pushd swig-3.0.8 + apt-get install -y --no-install-recommends libpcre3-dev + ./configure + make + make install + rm -f /usr/bin/swig + ln -s /usr/local/bin/swig /usr/bin/swig + popd + rm -rf swig-3.0.8 swig-3.0.8.tar.gz +fi +set -e +# Install Python 3.5 and dev library +apt-get install -y --no-install-recommends python3.5 libpython3.5-dev + +# Install pip3.5 +set +e +pip35_version=$(pip3.5 --version | grep "python 3.5") +if [[ -z $pip35_version ]]; then + set -e + wget -q https://bootstrap.pypa.io/get-pip.py + python3.5 get-pip.py + rm -f get-pip.py +fi + +set -e +# Install six. +pip3.5 install --upgrade six==1.10.0 + +# Install protobuf. 
+pip3.5 install --upgrade protobuf==3.2.0 + +# Remove obsolete version of six, which can sometimes confuse virtualenv. +rm -rf /usr/lib/python3/dist-packages/six* + +# Install numpy, scipy and scikit-learn required by the builds + +# numpy needs to be installed from source to fix segfaults. See: +# https://github.com/tensorflow/tensorflow/issues/6968 +# This workaround isn't needed for Ubuntu 16.04 or later. +pip3.5 install --no-binary=:all: --upgrade numpy==1.12.0 + +pip3.5 install scipy==0.18.1 + +pip3.5 install scikit-learn==0.18.1 + +# pandas required by tf.learn/inflow +pip3 install pandas==0.19.2 + +# Install recent-enough version of wheel for Python 3.5 wheel builds +pip3.5 install wheel==0.29.0 + +pip3.5 install portpicker + +pip3.5 install werkzeug diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh index 73c08e5d0b0ac3..1488e8d78c8505 100644 --- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh +++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh @@ -101,11 +101,8 @@ exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}" function get_failing_cpu_py_tests() { echo " //$1/tensorflow/python:basic_session_run_hooks_test + \ - //$1/tensorflow/python:bigquery_reader_ops_test + \ //$1/tensorflow/python:contrib_test + \ //$1/tensorflow/python:dequantize_op_test + \ - //$1/tensorflow/python:directory_watcher_test + \ - //$1/tensorflow/python:event_multiplexer_test + \ //$1/tensorflow/python:file_io_test + \ //$1/tensorflow/python:file_system_test + \ //$1/tensorflow/python:framework_meta_graph_test + \ diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md index 9dba070a4f86d0..aabc7b253d68eb 100644 --- a/tensorflow/tools/compatibility/README.md +++ b/tensorflow/tools/compatibility/README.md @@ -11,7 +11,10 @@ It will print a list of errors it finds that it can't fix. 
You can also run it on a directory tree: ``` +# just upgrade the .py files tf_upgrade.py --intree coolcode --outtree coolcode-upgraded +# upgrade the .py files, then copy all the other files to the outtree +tf_upgrade.py --intree coolcode --outtree coolcode-upgraded --copyotherfiles True ``` In either case, it will also dump out a report e.g. which will detail changes diff --git a/tensorflow/tools/compatibility/tf_upgrade.py b/tensorflow/tools/compatibility/tf_upgrade.py index 26bf11725609ea..80439f835a6783 100644 --- a/tensorflow/tools/compatibility/tf_upgrade.py +++ b/tensorflow/tools/compatibility/tf_upgrade.py @@ -140,6 +140,7 @@ def __init__(self): "tf.batch_svd": "tf.svd", "tf.batch_fft": "tf.fft", "tf.batch_ifft": "tf.ifft", + "tf.batch_fft2d": "tf.fft2d", "tf.batch_ifft2d": "tf.ifft2d", "tf.batch_fft3d": "tf.fft3d", "tf.batch_ifft3d": "tf.ifft3d", @@ -566,7 +567,7 @@ def process_opened_file(self, in_filename, in_file, out_filename, out_file): return 1, text, process_errors # pylint: enable=broad-except - def process_tree(self, root_directory, output_root_directory): + def process_tree(self, root_directory, output_root_directory, copy_other_files): """Processes upgrades on an entire tree of python files in place. Note that only Python files.
If you have custom code in other languages, @@ -596,13 +597,21 @@ def process_tree(self, root_directory, output_root_directory): # Collect list of files to process (we do this to correctly handle if the # user puts the output directory in some sub directory of the input dir) files_to_process = [] + files_to_copy = [] for dir_name, _, file_list in os.walk(root_directory): py_files = [f for f in file_list if f.endswith(".py")] + copy_files = [f for f in file_list if not f.endswith(".py")] for filename in py_files: fullpath = os.path.join(dir_name, filename) fullpath_output = os.path.join( output_root_directory, os.path.relpath(fullpath, root_directory)) files_to_process.append((fullpath, fullpath_output)) + if copy_other_files: + for filename in copy_files: + fullpath = os.path.join(dir_name, filename) + fullpath_output = os.path.join( + output_root_directory, os.path.relpath(fullpath, root_directory)) + files_to_copy.append((fullpath, fullpath_output)) file_count = 0 tree_errors = [] @@ -619,6 +628,11 @@ def process_tree(self, root_directory, output_root_directory): _, l_report, l_errors = self.process_file(input_path, output_path) tree_errors += l_errors report += l_report + for input_path, output_path in files_to_copy: + output_directory = os.path.dirname(output_path) + if not os.path.isdir(output_directory): + os.makedirs(output_directory) + shutil.copy(input_path, output_path) return file_count, report, tree_errors @@ -650,6 +664,13 @@ def process_tree(self, root_directory, output_root_directory): dest="output_tree", help="If converting a whole tree of files, the output " "directory (relative or absolute).") + parser.add_argument( + "--copyotherfiles", + dest="copy_other_files", + help=("If converting a whole tree of files, whether to " + "copy the other files."), + type=bool, + default=False) parser.add_argument( "--reportfile", dest="report_filename", @@ -669,7 +690,7 @@ def process_tree(self, root_directory, output_root_directory): files_processed = 1 elif 
args.input_tree: files_processed, report_text, errors = upgrade.process_tree( - args.input_tree, args.output_tree) + args.input_tree, args.output_tree, args.copy_other_files) else: parser.print_help() if report_text: diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel index a67f1af2bdafa4..dd18b61017855d 100644 --- a/tensorflow/tools/docker/Dockerfile.devel +++ b/tensorflow/tools/docker/Dockerfile.devel @@ -61,12 +61,11 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \ # Running bazel inside a `docker build` command causes trouble, cf: # https://github.com/bazelbuild/bazel/issues/134 # The easiest solution is to set up a bazelrc file forcing --batch. -RUN echo "startup --batch" >>/root/.bazelrc +RUN echo "startup --batch" >>/etc/bazel.bazelrc # Similarly, we need to workaround sandboxing issues: # https://github.com/bazelbuild/bazel/issues/418 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ - >>/root/.bazelrc -ENV BAZELRC /root/.bazelrc + >>/etc/bazel.bazelrc # Install the most recent bazel release. ENV BAZEL_VERSION 0.4.5 WORKDIR / diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu index d1a733458dacdf..8ead2f15ae3b59 100644 --- a/tensorflow/tools/docker/Dockerfile.devel-gpu +++ b/tensorflow/tools/docker/Dockerfile.devel-gpu @@ -61,12 +61,11 @@ RUN add-apt-repository -y ppa:openjdk-r/ppa && \ # Running bazel inside a `docker build` command causes trouble, cf: # https://github.com/bazelbuild/bazel/issues/134 # The easiest solution is to set up a bazelrc file forcing --batch. 
-RUN echo "startup --batch" >>/root/.bazelrc +RUN echo "startup --batch" >>/etc/bazel.bazelrc # Similarly, we need to workaround sandboxing issues: # https://github.com/bazelbuild/bazel/issues/418 RUN echo "build --spawn_strategy=standalone --genrule_strategy=standalone" \ - >>/root/.bazelrc -ENV BAZELRC /root/.bazelrc + >>/etc/bazel.bazelrc # Install the most recent bazel release. ENV BAZEL_VERSION 0.4.5 WORKDIR / diff --git a/tensorflow/tools/git/gen_git_source.py b/tensorflow/tools/git/gen_git_source.py index c97ce7561fd256..299d50c35919a4 100755 --- a/tensorflow/tools/git/gen_git_source.py +++ b/tensorflow/tools/git/gen_git_source.py @@ -76,7 +76,11 @@ def configure(src_base_path, debug=False): # Remove and recreate the path if os.path.exists(gen_path): if os.path.isdir(gen_path): - shutil.rmtree(gen_path) + try: + shutil.rmtree(gen_path) + except PermissionError: + raise RuntimeError("Cannot delete directory %s due to permission " + "error, inspect and remove manually" % gen_path) else: raise RuntimeError("Cannot delete non-directory %s, inspect ", "and remove manually" % gen_path) diff --git a/tensorflow/tools/graph_transforms/summarize_graph_main.cc b/tensorflow/tools/graph_transforms/summarize_graph_main.cc index 8c23ae7a74901d..f45dfbba0ced35 100644 --- a/tensorflow/tools/graph_transforms/summarize_graph_main.cc +++ b/tensorflow/tools/graph_transforms/summarize_graph_main.cc @@ -109,7 +109,7 @@ Status SummarizeGraph(const GraphDef& graph, const string& graph_path) { if (node.op() == "Placeholder") { placeholders.push_back(&node); } - if (node.op() == "Variable") { + if (node.op() == "Variable" || node.op() == "VariableV2") { variables.push_back(&node); } } @@ -168,7 +168,8 @@ Status SummarizeGraph(const GraphDef& graph, const string& graph_path) { if (node.device() != "") { ++device_counts[node.device()]; } - if ((node.op() == "Const") || (node.op() == "Variable")) { + if ((node.op() == "Const") || (node.op() == "Variable") || + (node.op() == 
"VariableV2")) { Tensor tensor; if (node.attr().count("value") && tensor.FromProto(node.attr().at("value").tensor())) { diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 74a78189677299..d9c67862e74995 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -66,23 +66,21 @@ py_binary( "README", "setup.py", ":included_headers", - "//tensorflow/contrib/ndlstm", "//tensorflow/contrib/nn:nn_py", "//tensorflow/contrib/session_bundle:session_bundle_pip", - "//tensorflow/contrib/slim", "//tensorflow/contrib/slim/python/slim/data:data_pip", - "//tensorflow/contrib/slim/python/slim/nets:nets_pip", - "//tensorflow/contrib/specs", - "//tensorflow/contrib/tensor_forest:init_py", - "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip", "//tensorflow/python:util_example_parser_configuration", "//tensorflow/python/debug:debug_pip", "//tensorflow/python/saved_model", "//tensorflow/python/tools:tools_pip", - # The following target has an issue when archiving them into the python - # zip, exclude them for now. - # "//tensorflow/tensorboard", - # This package does not build. Exclude it in windows for now. + "//tensorflow/tensorboard", + # These targets don't build on Windows yet. Exclude them for now. 
+ # "//tensorflow/contrib/ndlstm", + # "//tensorflow/contrib/slim", + # "//tensorflow/contrib/slim/python/slim/nets:nets_pip", + # "//tensorflow/contrib/specs", + # "//tensorflow/contrib/tensor_forest:init_py", + # "//tensorflow/contrib/tensor_forest/hybrid:hybrid_pip", # "//tensorflow/examples/tutorials/mnist:package", ], srcs_version = "PY2AND3", diff --git a/tensorflow/tools/pip_package/MANIFEST.in b/tensorflow/tools/pip_package/MANIFEST.in index 22b00c428432c6..fe21f221b16a69 100644 --- a/tensorflow/tools/pip_package/MANIFEST.in +++ b/tensorflow/tools/pip_package/MANIFEST.in @@ -1,4 +1,6 @@ include README recursive-include * *.py recursive-include * *.so +recursive-include * *.dll +recursive-include * *.lib recursive-include * *.csv diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 25aecb5707584c..4c4973080f45ed 100644 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -64,6 +64,10 @@ temp_workaround_http_archive = repository_rule( # If TensorFlow is linked as a submodule. # path_prefix and tf_repo_name are no longer used. def tf_workspace(path_prefix = "", tf_repo_name = ""): + # We must check the bazel version before trying to parse any other BUILD + # files, in case the parsing of those build files depends on the bazel + # version we require here. 
+ check_version("0.4.5") cuda_configure(name = "local_config_cuda") sycl_configure(name = "local_config_sycl") if path_prefix: diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 01e070f2be3adf..a2b3e7d79e93bb 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -826,8 +826,17 @@ def _cuda_autoconf_impl(repository_ctx): cuda_configure = repository_rule( implementation = _cuda_autoconf_impl, - local = True, + environ = [ + _GCC_HOST_COMPILER_PATH, + "TF_NEED_CUDA", + _CUDA_TOOLKIT_PATH, + _CUDNN_INSTALL_PATH, + _TF_CUDA_VERSION, + _TF_CUDNN_VERSION, + _TF_CUDA_COMPUTE_CAPABILITIES, + ], ) + """Detects and configures the local CUDA toolchain. Add the following to your WORKSPACE FILE: diff --git a/third_party/sycl/crosstool/computecpp.tpl b/third_party/sycl/crosstool/computecpp.tpl index 66dd9aea7bef75..595e7136a61ba2 100755 --- a/third_party/sycl/crosstool/computecpp.tpl +++ b/third_party/sycl/crosstool/computecpp.tpl @@ -65,7 +65,7 @@ def main(): # strip asan for the device computecpp_device_compiler_flags = ['-sycl-compress-name', '-DTENSORFLOW_USE_SYCL', '-Wno-unused-variable', '-I', COMPUTECPP_INCLUDE, '-isystem', COMPUTECPP_INCLUDE, '-std=c++11', '-sycl', '-emit-llvm', '-no-serial-memop', '-Xclang', '-cl-denorms-are-zero', '-Xclang', '-cl-fp32-correctly-rounded-divide-sqrt'] - computecpp_device_compiler_flags += [flag for flag in compiler_flags if not flag.startswith(('-fsanitize'))] + computecpp_device_compiler_flags += [flag for flag in compiler_flags if not flag.startswith(('-fsanitize', '-march=native', '-mavx'))] x = subprocess.call([COMPUTECPP_DRIVER] + computecpp_device_compiler_flags ) if(x == 0): diff --git a/util/python/python_config.sh b/util/python/python_config.sh index 789c4b35b35004..4b18bf3578d77e 100755 --- a/util/python/python_config.sh +++ b/util/python/python_config.sh @@ -181,7 +181,7 @@ function setup_python { # Write tools/bazel.rc echo "# Autogenerated by 
configure: DO NOT EDIT" > tools/bazel.rc sed -e "s/\$PYTHON_MAJOR_VERSION/$python_major_version/g" \ - -e "s[\$PYTHON_BINARY[\"$PYTHON_BIN_PATH\"[g" \ + -e "s|\$PYTHON_BINARY|\"$PYTHON_BIN_PATH\"|g" \ tools/bazel.rc.template >> tools/bazel.rc # Write tools/python_bin_path.sh echo "export PYTHON_BIN_PATH=\"$PYTHON_BIN_PATH\"" > tools/python_bin_path.sh