From 6e7db6fb37f99bc701f15594b4b95ad9124b371b Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Thu, 5 Nov 2020 23:44:45 +0300 Subject: [PATCH] [IE CLDNN] Plugin-side kernels caching (#2871) --- inference-engine/include/ie_plugin_config.hpp | 13 + .../src/cldnn_engine/cldnn_config.cpp | 36 ++- .../src/cldnn_engine/cldnn_config.h | 4 +- .../src/cldnn_engine/cldnn_engine.cpp | 1 + .../src/cldnn_engine/cldnn_remote_context.cpp | 3 +- .../functional/plugin/gpu/behavior/cache.cpp | 86 +++++++ .../behavior/core_threading_tests.cpp | 1 + .../common_test_utils/file_utils.hpp | 53 ++++ .../common_test_utils/unicode_utils.hpp | 86 ++++++- .../common_test_utils/w_dirent.h | 227 ++++++++++++++++++ .../thirdparty/clDNN/api/engine.hpp | 13 +- .../thirdparty/clDNN/api/program.hpp | 29 +++ .../thirdparty/clDNN/src/engine.cpp | 1 + .../clDNN/src/gpu/configuration.cpp | 3 +- .../thirdparty/clDNN/src/gpu/configuration.h | 1 + .../clDNN/src/gpu/kernels_cache.cpp | 213 +++++++++++++--- .../thirdparty/clDNN/src/gpu/kernels_cache.h | 6 +- 17 files changed, 730 insertions(+), 46 deletions(-) create mode 100644 inference-engine/tests/functional/plugin/gpu/behavior/cache.cpp create mode 100644 inference-engine/tests/ie_test_utils/common_test_utils/w_dirent.h diff --git a/inference-engine/include/ie_plugin_config.hpp b/inference-engine/include/ie_plugin_config.hpp index a59320420fc0b2..e6175eb53567eb 100644 --- a/inference-engine/include/ie_plugin_config.hpp +++ b/inference-engine/include/ie_plugin_config.hpp @@ -362,5 +362,18 @@ DECLARE_CONFIG_KEY(DUMP_EXEC_GRAPH_AS_DOT); */ DECLARE_CONFIG_KEY(ENFORCE_BF16); +/** +* @brief This key defines the directory which will be used to store any data cached by plugins. +* +* This key supports unicode symbols in path +* The underlying cache structure is not defined and might differ between OpenVINO releases +* Cached data might be platform/device specific and might be invalid after OpenVINO version change +* If this key is not specified or value is empty string, then caching is disabled. +* The key might enable caching for all plugin or some specific ones, e.g.: +* ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "cache/"}}) - enables cache for all plugins that might want to use it +* ie.SetConfig({{CONFIG_KEY(CACHE_DIR), "cache/"}}, {"GPU"}) - enables cache only for GPU plugin +*/ +DECLARE_CONFIG_KEY(CACHE_DIR); + } // namespace PluginConfigParams } // namespace InferenceEngine diff --git a/inference-engine/src/cldnn_engine/cldnn_config.cpp b/inference-engine/src/cldnn_engine/cldnn_config.cpp index cd685ed682c64d..230939922ac87f 100644 --- a/inference-engine/src/cldnn_engine/cldnn_config.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_config.cpp @@ -8,16 +8,36 @@ #include "cldnn_config.h" #include "cpp_interfaces/exception2status.hpp" #include "cpp_interfaces/interface/ie_internal_plugin_config.hpp" +#include "ie_api.h" +#include "file_utils.h" #ifdef _WIN32 # include +#ifdef ENABLE_UNICODE_PATH_SUPPORT +# define mkdir(dir, mode) _wmkdir(dir) +#else # define mkdir(dir, mode) _mkdir(dir) -#endif +#endif // ENABLE_UNICODE_PATH_SUPPORT +#endif // _WIN32 using namespace InferenceEngine; namespace CLDNNPlugin { +static void createDirectory(std::string _path) { +#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) + std::wstring widepath = FileUtils::multiByteCharToWString(_path.c_str()); + const wchar_t* path = widepath.c_str(); +#else + const char* path = _path.c_str(); +#endif + + auto err = mkdir(path, 0755); + if (err != 0 && errno != EEXIST) { + THROW_IE_EXCEPTION << "Couldn't create directory! (err=" << err << "; errno=" << errno << ")"; + } +} + void Config::UpdateFromMap(const std::map& configMap) { for (auto& kvp : configMap) { std::string key = kvp.first; @@ -129,16 +149,17 @@ void Config::UpdateFromMap(const std::map& configMap) } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_GRAPH_DUMPS_DIR) == 0) { if (!val.empty()) { graph_dumps_dir = val; - if (mkdir(graph_dumps_dir.c_str(), 0755) != 0) { - THROW_IE_EXCEPTION << "Couldn't create clDNN graph dump directory!"; - } + createDirectory(graph_dumps_dir); + } + } else if (key.compare(PluginConfigParams::KEY_CACHE_DIR) == 0) { + if (!val.empty()) { + kernels_cache_dir = val; + createDirectory(kernels_cache_dir); } } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_SOURCES_DUMPS_DIR) == 0) { if (!val.empty()) { sources_dumps_dir = val; - if (mkdir(sources_dumps_dir.c_str(), 0755) != 0) { - THROW_IE_EXCEPTION << "Couldn't create clDNN source dump directory!"; - } + createDirectory(sources_dumps_dir); } } else if (key.compare(PluginConfigParams::KEY_EXCLUSIVE_ASYNC_REQUESTS) == 0) { if (val.compare(PluginConfigParams::YES) == 0) { @@ -276,6 +297,7 @@ void Config::adjustKeyMapValues() { key_config_map[CLDNNConfigParams::KEY_CLDNN_GRAPH_DUMPS_DIR] = graph_dumps_dir; key_config_map[CLDNNConfigParams::KEY_CLDNN_SOURCES_DUMPS_DIR] = sources_dumps_dir; + key_config_map[PluginConfigParams::KEY_CACHE_DIR] = kernels_cache_dir; key_config_map[PluginConfigParams::KEY_GPU_THROUGHPUT_STREAMS] = std::to_string(throughput_streams); key_config_map[PluginConfigParams::KEY_DEVICE_ID] = device_id; diff --git a/inference-engine/src/cldnn_engine/cldnn_config.h b/inference-engine/src/cldnn_engine/cldnn_config.h index 8bc782eeb05573..9abf6396adb4a1 100644 --- a/inference-engine/src/cldnn_engine/cldnn_config.h +++ b/inference-engine/src/cldnn_engine/cldnn_config.h @@ -35,7 +35,8 @@ struct Config { tuningConfig(), graph_dumps_dir(""), sources_dumps_dir(""), - device_id("") { + device_id(""), + kernels_cache_dir("") { adjustKeyMapValues(); } @@ -59,6 +60,7 @@ struct Config { std::string graph_dumps_dir; std::string sources_dumps_dir; std::string device_id; + std::string kernels_cache_dir; std::map key_config_map; }; diff --git a/inference-engine/src/cldnn_engine/cldnn_engine.cpp b/inference-engine/src/cldnn_engine/cldnn_engine.cpp index 222f78b484ccd5..c1c5215432a797 100644 --- a/inference-engine/src/cldnn_engine/cldnn_engine.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_engine.cpp @@ -312,6 +312,7 @@ ExecutableNetworkInternal::Ptr clDNNEngine::LoadExeNetworkImpl(const InferenceEn context_config.sources_dumps_dir == current_config.sources_dumps_dir && context_config.tuningConfig.mode == current_config.tuningConfig.mode && context_config.tuningConfig.cache_file_path == current_config.tuningConfig.cache_file_path && + context_config.kernels_cache_dir == current_config.kernels_cache_dir && context_config.device_id == current_config.device_id; }; diff --git a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp index 3ff528904c1168..c8d4ceb4bb614f 100644 --- a/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp +++ b/inference-engine/src/cldnn_engine/cldnn_remote_context.cpp @@ -262,7 +262,8 @@ CLDNNExecutionContextImpl::CLDNNExecutionContextImpl(const std::shared_ptr +#include + +class CompiledKernelsCacheTest : public CommonTestUtils::TestsCommon { +protected: + std::string test_name = ::testing::UnitTest::GetInstance()->current_test_info()->name(); + std::shared_ptr function; + std::string cache_path; + + void SetUp() override { + function = ngraph::builder::subgraph::makeConvPoolRelu(); + cache_path = test_name + "_cache"; + } +}; + +TEST_F(CompiledKernelsCacheTest, CanCreateCacheDirAndDumpBinaries) { + std::shared_ptr ie = PluginCache::get().ie(); + // Create CNNNetwork from ngraph::Function + InferenceEngine::CNNNetwork cnnNet(function); + std::map config = {{ CONFIG_KEY(CACHE_DIR), cache_path }}; + try { + // Load CNNNetwork to target plugins + auto execNet = ie->LoadNetwork(cnnNet, "GPU", config); + + // Check that directory with cached kernels exists after loading network + ASSERT_TRUE(CommonTestUtils::directoryExists(cache_path)) << "Directory with cached kernels doesn't exist"; + // Check that folder contains cache files and remove them + ASSERT_GT(CommonTestUtils::removeFilesWithExt(cache_path, "cl_cache"), 0); + // Remove directory and check that it doesn't exist anymore + ASSERT_EQ(CommonTestUtils::removeDir(cache_path), 0); + ASSERT_FALSE(CommonTestUtils::directoryExists(cache_path)); + } catch (std::exception& ex) { + // Cleanup in case of any exception + if (CommonTestUtils::directoryExists(cache_path)) { + ASSERT_GE(CommonTestUtils::removeFilesWithExt(cache_path, "cl_cache"), 0); + ASSERT_EQ(CommonTestUtils::removeDir(cache_path), 0); + } + FAIL() << ex.what() << std::endl; + } +} + +#ifdef ENABLE_UNICODE_PATH_SUPPORT + +TEST_F(CompiledKernelsCacheTest, CanCreateCacheDirAndDumpBinariesUnicodePath) { + std::shared_ptr ie = PluginCache::get().ie(); + // Create CNNNetwork from ngraph::Function + InferenceEngine::CNNNetwork cnnNet(function); + for (std::size_t testIndex = 0; testIndex < CommonTestUtils::test_unicode_postfix_vector.size(); testIndex++) { + std::wstring postfix = L"_" + CommonTestUtils::test_unicode_postfix_vector[testIndex]; + std::wstring cache_path_w = CommonTestUtils::addUnicodePostfixToPath(cache_path, postfix); + + try { + auto cache_path_mb = FileUtils::wStringtoMBCSstringChar(cache_path_w); + std::map config = {{ CONFIG_KEY(CACHE_DIR), cache_path_mb }}; + // Load CNNNetwork to target plugins + auto execNet = ie->LoadNetwork(cnnNet, "GPU", config); + + // Check that directory with cached kernels exists after loading network + ASSERT_TRUE(CommonTestUtils::directoryExists(cache_path_w)) << "Directory with cached kernels doesn't exist"; + // Check that folder contains cache files and remove them + ASSERT_GT(CommonTestUtils::removeFilesWithExt(cache_path_w, L"cl_cache"), 0); + // Remove directory and check that it doesn't exist anymore + ASSERT_EQ(CommonTestUtils::removeDir(cache_path_w), 0); + ASSERT_FALSE(CommonTestUtils::directoryExists(cache_path_w)); + } catch (std::exception& ex) { + // Cleanup in case of any exception + if (CommonTestUtils::directoryExists(cache_path_w)) { + ASSERT_GE(CommonTestUtils::removeFilesWithExt(cache_path_w, L"cl_cache"), 0); + ASSERT_EQ(CommonTestUtils::removeDir(cache_path_w), 0); + } + FAIL() << ex.what() << std::endl; + } + } +} + +#endif // ENABLE_UNICODE_PATH_SUPPORT diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp index 8e6a2dc859fb73..5886a2998a0536 100644 --- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp +++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp @@ -13,6 +13,7 @@ namespace { Params params[] = { std::tuple{ CommonTestUtils::DEVICE_GPU, { { CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(YES) }}}, std::tuple{ CommonTestUtils::DEVICE_GPU, { { CONFIG_KEY(PERF_COUNT), CONFIG_VALUE(NO) }}}, + std::tuple{ CommonTestUtils::DEVICE_GPU, { { CONFIG_KEY(CACHE_DIR), "cache" }}}, }; } // namespace diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/file_utils.hpp b/inference-engine/tests/ie_test_utils/common_test_utils/file_utils.hpp index c6e8e6a8496c34..0452c24a016c3f 100644 --- a/inference-engine/tests/ie_test_utils/common_test_utils/file_utils.hpp +++ b/inference-engine/tests/ie_test_utils/common_test_utils/file_utils.hpp @@ -6,8 +6,18 @@ #include #include #include +#include #include "test_constants.hpp" +#include "w_dirent.h" +#include "common_utils.hpp" + +#ifdef _WIN32 +#include +#define rmdir(dir) _rmdir(dir) +#else // _WIN32 +#include +#endif // _WIN32 namespace CommonTestUtils { @@ -62,4 +72,47 @@ inline void removeIRFiles(const std::string &xmlFilePath, const std::string &bin std::remove(binFileName.c_str()); } } + +// Removes all files with extension=ext from the given directory +// Return value: +// < 0 - error +// >= 0 - count of removed files +inline int removeFilesWithExt(std::string path, std::string ext) { + struct dirent *ent; + DIR *dir = opendir(path.c_str()); + int ret = 0; + if (dir != nullptr) { + while ((ent = readdir(dir)) != NULL) { + auto file = makePath(path, std::string(ent->d_name)); + struct stat stat_path; + stat(file.c_str(), &stat_path); + if (!S_ISDIR(stat_path.st_mode) && endsWith(file, "." + ext)) { + auto err = std::remove(file.c_str()); + if (err != 0) { + closedir(dir); + return err; + } + ret++; + } + } + closedir(dir); + } + + return ret; +} + +inline int removeDir(const std::string &path) { + return rmdir(path.c_str()); +} + +inline bool directoryExists(const std::string &path) { + struct stat sb; + + if (stat(path.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) { + return true; + } + + return false; +} + } // namespace CommonTestUtils diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/unicode_utils.hpp b/inference-engine/tests/ie_test_utils/common_test_utils/unicode_utils.hpp index 3a13d715eb4ba7..44a4523a3a63cc 100644 --- a/inference-engine/tests/ie_test_utils/common_test_utils/unicode_utils.hpp +++ b/inference-engine/tests/ie_test_utils/common_test_utils/unicode_utils.hpp @@ -10,6 +10,8 @@ #include #include +#include "common_utils.hpp" +#include "w_dirent.h" #ifdef ENABLE_UNICODE_PATH_SUPPORT namespace CommonTestUtils { @@ -71,6 +73,88 @@ static void removeFile(std::wstring path) { } } +inline bool endsWith(const std::wstring& source, const std::wstring& expectedSuffix) { + return expectedSuffix.size() <= source.size() && source.compare(source.size() - expectedSuffix.size(), expectedSuffix.size(), expectedSuffix) == 0; +} + +// Removes all files with extension=ext from the given directory +// Return value: +// < 0 - error +// >= 0 - count of removed files +inline int removeFilesWithExt(std::wstring path, std::wstring ext) { + int ret = 0; +#ifdef _WIN32 + struct _wdirent *ent; + _WDIR *dir = _wopendir(path.c_str()); + if (dir != nullptr) { + while ((ent = _wreaddir(dir)) != NULL) { + auto file = ::FileUtils::makePath(path, std::wstring(ent->wd_name)); + struct _stat64i32 stat_path; + _wstat(file.c_str(), &stat_path); + if (!S_ISDIR(stat_path.st_mode) && endsWith(file, L"." + ext)) { + auto err = _wremove(file.c_str()); + if (err != 0) { + _wclosedir(dir); + return err; + } + ret++; + } + } + _wclosedir(dir); + } +#else + struct dirent *ent; + auto path_mb = FileUtils::wStringtoMBCSstringChar(path); + auto ext_mb = FileUtils::wStringtoMBCSstringChar(ext); + DIR *dir = opendir(path_mb.c_str()); + if (dir != nullptr) { + while ((ent = readdir(dir)) != NULL) { + std::string file = ::FileUtils::makePath(path_mb, std::string(ent->d_name)); + struct stat stat_path; + stat(file.c_str(), &stat_path); + if (!S_ISDIR(stat_path.st_mode) && ::CommonTestUtils::endsWith(file, "." + ext_mb)) { + auto err = std::remove(file.c_str()); + if (err != 0) { + closedir(dir); + return err; + } + ret++; + } + } + closedir(dir); + } +#endif + return ret; +} + +static int removeDir(std::wstring path) { + int result = 0; + if (!path.empty()) { +#ifdef _WIN32 + result = _wrmdir(path.c_str()); +#else + result = rmdir(FileUtils::wStringtoMBCSstringChar(path).c_str()); +#endif + } + return result; +} + +inline bool directoryExists(const std::wstring &path) { +#ifdef _WIN32 + struct _stat64i32 sb; + if (_wstat(path.c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) { + return true; + } +#else + struct stat sb; + if (stat(FileUtils::wStringtoMBCSstringChar(path).c_str(), &sb) == 0 && S_ISDIR(sb.st_mode)) { + return true; + } +#endif + + return false; +} + static const std::vector test_unicode_postfix_vector = { L"unicode_Яㅎあ", L"ひらがな日本語", @@ -83,4 +167,4 @@ static const std::vector test_unicode_postfix_vector = { }; } // namespace CommonTestUtils -#endif // ENABLE_UNICODE_PATH_SUPPORT \ No newline at end of file +#endif // ENABLE_UNICODE_PATH_SUPPORT diff --git a/inference-engine/tests/ie_test_utils/common_test_utils/w_dirent.h b/inference-engine/tests/ie_test_utils/common_test_utils/w_dirent.h new file mode 100644 index 00000000000000..933892623b0f5a --- /dev/null +++ b/inference-engine/tests/ie_test_utils/common_test_utils/w_dirent.h @@ -0,0 +1,227 @@ +// Copyright (C) 2020 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#if defined(_WIN32) + +#ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN_UNDEF +#endif + +#ifndef NOMINMAX +# define NOMINMAX +# define NOMINMAX_UNDEF +#endif + +#if defined(_M_IX86) && !defined(_X86_) && !defined(_AMD64_) +# define _X86_ +#endif + +#if defined(_M_X64) && !defined(_X86_) && !defined(_AMD64_) +# define _AMD64_ +#endif + +#if defined(_M_ARM) && !defined(_ARM_) && !defined(_ARM64_) +# define _ARM_ +#endif + +#if defined(_M_ARM64) && !defined(_ARM_) && !defined(_ARM64_) +# define _ARM64_ +#endif + +#include +#include +#include +#include +#include + +// Copied from linux libc sys/stat.h: +#define S_ISREG(m) (((m) & S_IFMT) == S_IFREG) +#define S_ISDIR(m) (((m) & S_IFMT) == S_IFDIR) + +struct dirent { + char *d_name; + + explicit dirent(const wchar_t *wsFilePath) { + size_t i; + auto slen = wcslen(wsFilePath); + d_name = static_cast(malloc(slen + 1)); + wcstombs_s(&i, d_name, slen + 1, wsFilePath, slen); + } + ~dirent() { + free(d_name); + } +}; + +class DIR { + WIN32_FIND_DATAA FindFileData; + HANDLE hFind; + dirent *next; + + static inline bool endsWith(const std::string &src, const char *with) { + int wl = static_cast(strlen(with)); + int so = static_cast(src.length()) - wl; + if (so < 0) return false; + return 0 == strncmp(with, &src[so], wl); + } + +public: + DIR(const DIR &other) = delete; + DIR(DIR &&other) = delete; + DIR& operator=(const DIR &other) = delete; + DIR& operator=(DIR &&other) = delete; + + explicit DIR(const char *dirPath) : next(nullptr) { + std::string ws = dirPath; + if (endsWith(ws, "\\")) + ws += "*"; + else + ws += "\\*"; + hFind = FindFirstFileA(ws.c_str(), &FindFileData); + FindFileData.dwReserved0 = hFind != INVALID_HANDLE_VALUE; + } + + ~DIR() { + if (!next) delete next; + next = nullptr; + FindClose(hFind); + } + + bool isValid() const { + return (hFind != INVALID_HANDLE_VALUE && FindFileData.dwReserved0); + } + + dirent* nextEnt() { + if (next != nullptr) delete next; + next = nullptr; + + if (!FindFileData.dwReserved0) return nullptr; + + wchar_t wbuf[4096]; + + size_t outSize; + mbstowcs_s(&outSize, wbuf, 4094, FindFileData.cFileName, 4094); + next = new dirent(wbuf); + FindFileData.dwReserved0 = FindNextFileA(hFind, &FindFileData); + return next; + } +}; + +struct _wdirent { + wchar_t *wd_name; + + explicit _wdirent(const wchar_t *wsFilePath) { + auto slen = wcslen(wsFilePath); + wd_name = static_cast(malloc(sizeof(wchar_t) * (slen + 1))); + wcscpy_s(wd_name, slen + 1, wsFilePath); + } + ~_wdirent() { + free(wd_name); + } +}; + +class _WDIR { + WIN32_FIND_DATAW FindFileData; + HANDLE hFind; + _wdirent *next; + + static inline bool endsWith(const std::wstring &src, const wchar_t *with) { + int wl = static_cast(wcslen(with)); + int so = static_cast(src.length()) - wl; + if (so < 0) return false; + return 0 == wcsncmp(with, &src[so], wl); + } + +public: + _WDIR(const _WDIR &other) = delete; + _WDIR(_WDIR &&other) = delete; + _WDIR& operator=(const _WDIR &other) = delete; + _WDIR& operator=(_WDIR &&other) = delete; + + explicit _WDIR(const wchar_t *dirPath) : next(nullptr) { + std::wstring ws = dirPath; + if (endsWith(ws, L"\\")) + ws += L"*"; + else + ws += L"\\*"; + hFind = FindFirstFileW(ws.c_str(), &FindFileData); + FindFileData.dwReserved0 = hFind != INVALID_HANDLE_VALUE; + } + + ~_WDIR() { + if (!next) delete next; + next = nullptr; + FindClose(hFind); + } + + bool isValid() const { + return (hFind != INVALID_HANDLE_VALUE && FindFileData.dwReserved0); + } + + _wdirent* nextEnt() { + if (next != nullptr) delete next; + next = nullptr; + + if (!FindFileData.dwReserved0) return nullptr; + + std::wstring buf(FindFileData.cFileName); + next = new _wdirent(buf.c_str()); + FindFileData.dwReserved0 = FindNextFileW(hFind, &FindFileData); + return next; + } +}; + + +static DIR* opendir(const char *dirPath) { + auto dp = new DIR(dirPath); + if (!dp->isValid()) { + delete dp; + return nullptr; + } + return dp; +} + +static _WDIR* _wopendir(const wchar_t *dirPath) { + auto dp = new _WDIR(dirPath); + if (!dp->isValid()) { + delete dp; + return nullptr; + } + return dp; +} + +static struct dirent* readdir(DIR *dp) { + return dp->nextEnt(); +} + +static struct _wdirent* _wreaddir(_WDIR *dp) { + return dp->nextEnt(); +} + +static void closedir(DIR *dp) { + delete dp; +} + +static void _wclosedir(_WDIR *dp) { + delete dp; +} + +#ifdef WIN32_LEAN_AND_MEAN_UNDEF +# undef WIN32_LEAN_AND_MEAN +# undef WIN32_LEAN_AND_MEAN_UNDEF +#endif + +#ifdef NOMINMAX_UNDEF +# undef NOMINMAX_UNDEF +# undef NOMINMAX +#endif + +#else + +#include +#include + +#endif diff --git a/inference-engine/thirdparty/clDNN/api/engine.hpp b/inference-engine/thirdparty/clDNN/api/engine.hpp index 2f160093511cda..9942aa930b4764 100644 --- a/inference-engine/thirdparty/clDNN/api/engine.hpp +++ b/inference-engine/thirdparty/clDNN/api/engine.hpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -69,10 +69,11 @@ struct engine_configuration { const throttle_mode_types throttle_mode; ///< Throttle mode (support of throttle hints in command queue). If cl_khr_throttle_hints extension ///< is not supported by current OpenCL implementation, the value must be set to cldnn_throttle_disabled. - bool enable_memory_pool; ///< Enables memory usage optimization. memory objects will be reused when possible - ///< (switched off for older drivers then NEO). - uint16_t n_streams; ///< Number of queues executed in parallel - const std::string tuning_cache_path; ///< Path to tuning kernel cache + bool enable_memory_pool; ///< Enables memory usage optimization. memory objects will be reused when possible + ///< (switched off for older drivers then NEO). + uint16_t n_streams; ///< Number of queues executed in parallel + const std::string kernels_cache_path; ///< Path to compiled kernels cache + const std::string tuning_cache_path; ///< Path to tuning kernel cache /// @brief Constructs engine configuration with specified options. /// @param profiling Enable per-primitive profiling. @@ -93,6 +94,7 @@ struct engine_configuration { throttle_mode_types throttle_mode = throttle_mode_types::disabled, bool memory_pool = true, uint16_t n_streams = 1, + const std::string& kernels_cache_path = "", const std::string& tuning_cache_path = "cache.json") : enable_profiling(profiling) , meaningful_kernels_names(decorate_kernel_names) @@ -106,6 +108,7 @@ struct engine_configuration { , throttle_mode(throttle_mode) , enable_memory_pool(memory_pool) , n_streams(n_streams) + , kernels_cache_path(kernels_cache_path) , tuning_cache_path(tuning_cache_path) { if (n_streams == 0) { throw std::invalid_argument("Invalid streams count set in engine config"); diff --git a/inference-engine/thirdparty/clDNN/api/program.hpp b/inference-engine/thirdparty/clDNN/api/program.hpp index 087aa4754590e5..8252271cb866e8 100644 --- a/inference-engine/thirdparty/clDNN/api/program.hpp +++ b/inference-engine/thirdparty/clDNN/api/program.hpp @@ -66,6 +66,8 @@ enum class build_option_type { /// @brief Specifies a directory to which stages of network compilation should be dumped. (default: empty, i.e. no dumping) graph_dumps_dir, + /// @brief Specifies a directory to which compiled kernels should be cached or can be loaded from. (default: empty, i.e. no caching) + kernels_cache_dir, /// @brief Name for serialization process serialize_network, load_program, @@ -146,6 +148,9 @@ struct build_option { /// @brief Specifies a directory to which stages of network compilation should be dumped (default: empty, i.e. no dumping) static std::shared_ptr graph_dumps_dir(const std::string& dir_path); + /// @brief Specifies a directory to which compiled kernels should be cached or can be loaded from. (default: empty, i.e. no caching) + static std::shared_ptr kernels_cache_dir(const std::string& dir_path); + /// @brief Specifies a name for serialization process. static std::shared_ptr serialize_network(const std::string& network_name); /// @brief Specifies a name of load_program process. @@ -251,6 +256,21 @@ struct build_option_directory : build_option { build_option_directory& operator=(const build_option_directory& other) = delete; }; +/// @brief @ref build_option specialization for selecting a directory. +template +struct build_option_kernels_cache_dir : build_option { + const std::string directory_path; + + explicit build_option_kernels_cache_dir(const std::string& dir_path) : directory_path(dir_path) {} + +private: + /// @brief Returns build_option_type::kernels_cache_dir. + build_option_type get_type() const override { return build_option_type::kernels_cache_dir; } + + build_option_kernels_cache_dir(const build_option_kernels_cache_dir& other) = delete; + build_option_kernels_cache_dir& operator=(const build_option_kernels_cache_dir& other) = delete; +}; + /// @brief @ref build_option specialization for serialization process. template struct build_option_serialization : build_option { @@ -342,6 +362,11 @@ struct build_option_traits { static std::shared_ptr make_default() { return build_option::graph_dumps_dir({}); } }; template <> +struct build_option_traits { + typedef build_option_directory object_type; + static std::shared_ptr make_default() { return build_option::kernels_cache_dir({}); } +}; +template <> struct build_option_traits { typedef build_option_serialization object_type; static std::shared_ptr make_default() { return build_option::serialize_network({}); } @@ -392,6 +417,10 @@ inline std::shared_ptr build_option::tuning_config(const tun inline std::shared_ptr build_option::graph_dumps_dir(const std::string& dir_path) { return std::make_shared>(dir_path); } + +inline std::shared_ptr build_option::kernels_cache_dir(const std::string& dir_path) { + return std::make_shared>(dir_path); +} inline std::shared_ptr build_option::serialize_network(const std::string& name) { return std::make_shared>(name); } diff --git a/inference-engine/thirdparty/clDNN/src/engine.cpp b/inference-engine/thirdparty/clDNN/src/engine.cpp index ac89a731e2cc7b..379e8dba0c3ba5 100644 --- a/inference-engine/thirdparty/clDNN/src/engine.cpp +++ b/inference-engine/thirdparty/clDNN/src/engine.cpp @@ -93,6 +93,7 @@ gpu_toolkit_config convert_configuration(const engine_configuration conf) { result.priority_mode = conf.priority_mode; result.throttle_mode = conf.throttle_mode; result.queues_num = conf.n_streams; + result.kernels_cache_path = conf.kernels_cache_path; result.tuning_cache_path = conf.tuning_cache_path; return result; } diff --git a/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp b/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp index d174da0862f02f..052a133a128eb1 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/configuration.cpp @@ -33,6 +33,7 @@ configuration::configuration() priority_mode(priority_mode_types::disabled), throttle_mode(throttle_mode_types::disabled), queues_num(0), - tuning_cache_path("cache.json") {} + tuning_cache_path("cache.json"), + kernels_cache_path("") {} } // namespace gpu } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/configuration.h b/inference-engine/thirdparty/clDNN/src/gpu/configuration.h index ecb0d0e8cda5e5..cf402baa0fabb3 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/configuration.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/configuration.h @@ -42,6 +42,7 @@ struct configuration { throttle_mode_types throttle_mode; uint16_t queues_num; std::string tuning_cache_path; + std::string kernels_cache_path; }; } // namespace gpu } // namespace cldnn diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp index c0a5a10330448d..5e210e29563fc5 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -28,13 +28,89 @@ #include "kernel_selector_helper.h" -#define MAX_KERNELS_PER_PROGRAM 10 - -namespace cldnn { -namespace gpu { +#ifndef ENABLE_UNICODE_PATH_SUPPORT +# ifdef _WIN32 +# if defined __INTEL_COMPILER || defined _MSC_VER +# define ENABLE_UNICODE_PATH_SUPPORT +# endif +# elif defined(__GNUC__) && (__GNUC__ > 5 || (__GNUC__ == 5 && __GNUC_MINOR__ > 2)) || defined(__clang__) +# define ENABLE_UNICODE_PATH_SUPPORT +# endif +#endif + +#ifndef _WIN32 +#ifdef ENABLE_UNICODE_PATH_SUPPORT +#include +#include +#endif +#else +#include +#endif namespace { -std::string get_undef_jit(kernels_cache::source_code org_source_code) { + +std::mutex cacheAccessMutex; + +#ifdef ENABLE_UNICODE_PATH_SUPPORT +std::wstring multiByteCharToWString(const char* str) { +#ifdef _WIN32 + int strSize = static_cast(std::strlen(str)); + int size_needed = MultiByteToWideChar(CP_UTF8, 0, str, strSize, NULL, 0); + std::wstring wstrTo(size_needed, 0); + MultiByteToWideChar(CP_UTF8, 0, str, strSize, &wstrTo[0], size_needed); + return wstrTo; +#else + std::wstring_convert> wstring_encoder; + std::wstring result = wstring_encoder.from_bytes(str); + return result; +#endif // _WIN32 +} +#endif // ENABLE_UNICODE_PATH_SUPPORT + +static std::vector loadBinaryFromFile(std::string path) { + std::lock_guard lock(cacheAccessMutex); + +#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) + std::wstring widefilename = multiByteCharToWString(path.c_str()); + const wchar_t* filename = widefilename.c_str(); + FILE *fp = _wfopen(filename, L"rb"); +#else + const char* filename = path.c_str(); + FILE *fp = fopen(filename, "rb"); +#endif + + if (fp) { + fseek(fp, 0, SEEK_END); + size_t nsize = (size_t)ftell(fp); + + fseek(fp, 0, SEEK_SET); + + std::vector ret(nsize); + + auto res = fread(ret.data(), sizeof(unsigned char), nsize, fp); + (void)res; + fclose(fp); + return ret; + } + + return {}; +} + +static void saveBinaryToFile(std::string path, const std::vector buffer) { + std::lock_guard lock(cacheAccessMutex); +#if defined(ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32) + std::wstring widefilename = multiByteCharToWString(path.c_str()); + const wchar_t* filename = widefilename.c_str(); +#else + const char* filename = path.c_str(); +#endif + std::ofstream out_file(filename, std::ios::out | std::ios::binary); + if (out_file.is_open()) { + out_file.write((char*)&buffer[0], buffer.size()); + } +} + +std::string get_undef_jit(cldnn::gpu::kernels_cache::source_code org_source_code) { const std::string white_space_with_new_lines = " \t\r\n"; const std::string white_space = " \t"; @@ -99,13 +175,39 @@ std::string reorder_options(const std::string& org_options) { inline bool does_options_support_batch_compilation(const std::string& options) { return options.find("-D") == std::string::npos && options.find("-I") == std::string::npos; } + } // namespace +namespace cldnn { +namespace gpu { + +std::string kernels_cache::get_cache_path() const { + auto path = _context.get_configuration().kernels_cache_path; + if (path.empty()) { + return {}; + } + + if (path.back() != '/' && path.back() != '\\') { + path += "/"; + } + return path; +} + +bool kernels_cache::is_cache_enabled() const { + return !_context.get_configuration().kernels_cache_path.empty(); +} + +size_t kernels_cache::get_max_kernels_per_batch() const { + return 10; +} + kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& kernels_source_code) const { sorted_code scode; for (const auto& code : kernels_source_code) { - const source_code org_source_code = {code.kernel_strings->jit, code.kernel_strings->str}; + std::string full_code = code.kernel_strings->jit + code.kernel_strings->str; + full_code += get_undef_jit({full_code}); + const source_code org_source_code = { full_code }; std::string entry_point = code.kernel_strings->entry_point; std::string options = code.kernel_strings->options; bool batch_compilation = code.kernel_strings->batch_compilation; @@ -140,23 +242,33 @@ kernels_cache::sorted_code kernels_cache::get_program_source(const kernels_code& current_bucket.options = options; } - if ((current_bucket.kernels_counter % MAX_KERNELS_PER_PROGRAM) == 0) { + // Create new kernels bucket when the limit is reached + if ((current_bucket.kernels_counter % get_max_kernels_per_batch()) == 0) { current_bucket.source.push_back({}); } current_bucket.entry_point_to_id[entry_point] = code.id; + assert(org_source_code.size() == 1); - source_code new_source_code = org_source_code; + current_bucket.source.back().push_back(std::move(org_source_code.front())); - if (batch_compilation) { - new_source_code.push_back(get_undef_jit(org_source_code)); - } + current_bucket.kernels_counter++; + } - for (auto& s : new_source_code) { - current_bucket.source.back().push_back(std::move(s)); + // Compute hash value for each bucket + // Hash calculation might require additional optimizations, but currently execution time of this part is much smaller than loading + // of the precompiled binaries or get_undef_jit calls + // Hash is computed for string that contains compilation options + driver version + + // full source code (jit + template + undef sections) of all kernels in the bucket + for (auto& c : scode) { + program_code& code = c.second; + auto options = c.first; + for (size_t i = 0; i < code.source.size(); i++) { + std::string full_code = options + " " + _context.get_device_info().driver_version; + for (auto& ss : code.source[i]) + full_code += ss; + code.hash_values.push_back(std::hash()(full_code)); } - - current_bucket.kernels_counter++; } return scode; @@ -183,11 +295,26 @@ kernels_cache::kernel_id kernels_cache::set_kernel_source( return id; } +static std::vector getProgramBinaries(cl::Program program) { + // Get the size of the program binary in bytes. + std::vector binary_sizes = program.getInfo(); + + if (binary_sizes.size() != 1) + throw std::runtime_error("Invalid binaries count"); + + size_t binary_size = binary_sizes.front(); + // Binary is not available for the device. + if (binary_size == 0) + throw std::runtime_error("Binary is not avaliable after program build"); + + // Get program binary. + return program.getInfo().front(); +} + kernels_cache::kernels_map kernels_cache::build_program(const program_code& program_source) const { static uint32_t current_file_index = 0; - bool dump_sources = - !_context.get_configuration().ocl_sources_dumps_dir.empty() || program_source.dump_custom_program; + bool dump_sources = !_context.get_configuration().ocl_sources_dumps_dir.empty() || program_source.dump_custom_program; std::string dump_file_name = ""; if (dump_sources) { @@ -204,7 +331,19 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog // failed to compile) uint32_t part_idx = 0; - for (const auto& sources : program_source.source) { + for (size_t i = 0; i < program_source.source.size(); i++) { + auto sources_bucket_to_compile = program_source.source[i]; + const auto& hash_value = program_source.hash_values[i]; + std::string cached_bin_name = get_cache_path() + std::to_string(hash_value) + ".cl_cache"; + cl::Program::Binaries precompiled_kernels = {}; + if (is_cache_enabled()) { + // Try to load file with name ${hash_value}.cl_cache which contains precompiled kernels for current bucket + // If read is successful, then remove kernels from compilation bucket + auto bin = loadBinaryFromFile(cached_bin_name); + if (!bin.empty()) { + precompiled_kernels.push_back(bin); + } + } auto current_dump_file_name = dump_file_name + std::to_string(part_idx++) + ".cl"; std::ofstream dump_file; @@ -212,23 +351,39 @@ kernels_cache::kernels_map kernels_cache::build_program(const program_code& prog dump_file.open(current_dump_file_name); if (dump_file.good()) { - for (auto& s : sources) dump_file << s; + for (auto& s : sources_bucket_to_compile) + dump_file << s; } } try { - cl::Program program(_context.context(), sources); - program.build({_context.device()}, program_source.options.c_str()); + cl::vector kernels; + // Run compilation + if (precompiled_kernels.empty()) { + cl::Program program(_context.context(), sources_bucket_to_compile); + program.build({_context.device()}, program_source.options.c_str()); - if (dump_sources && dump_file.good()) { - dump_file << "\n/* Build Log:\n"; - for (auto& p : program.getBuildInfo()) dump_file << p.second << "\n"; + if (dump_sources && dump_file.good()) { + dump_file << "\n/* Build Log:\n"; + for (auto& p : program.getBuildInfo()) + dump_file << p.second << "\n"; - dump_file << "*/\n"; - } + dump_file << "*/\n"; + } - cl::vector kernels; - program.createKernels(&kernels); + program.createKernels(&kernels); + if (is_cache_enabled()) { + // If kernels caching is enabled, then we save compiled bucket to binary file with name ${code_hash_value}.cl_cache + // Note: Bin file contains full bucket, not separate kernels, so kernels reuse across different models is quite limited + // Bucket size can be changed in get_max_kernels_per_batch() method, but forcing it to 1 will lead to much longer + // compile time. + saveBinaryToFile(cached_bin_name, getProgramBinaries(program)); + } + } else { + cl::Program program(_context.context(), {_context.device()}, precompiled_kernels); + program.build({_context.device()}, program_source.options.c_str()); + program.createKernels(&kernels); + } for (auto& k : kernels) { auto kernel_name = k.getInfo(); diff --git a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h index e9667ac1d27a1d..234efee9ff145b 100644 --- a/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h +++ b/inference-engine/thirdparty/clDNN/src/gpu/kernels_cache.h @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -45,6 +45,7 @@ class kernels_cache { struct program_code { std::vector source; + std::vector hash_values; uint32_t kernels_counter = 0; std::string options; bool dump_custom_program = false; @@ -96,6 +97,9 @@ class kernels_cache { sorted_code get_program_source(const kernels_code& kernels_source_code) const; kernels_map build_program(const program_code& pcode) const; + std::string get_cache_path() const; + bool is_cache_enabled() const; + size_t get_max_kernels_per_batch() const; public: explicit kernels_cache(gpu_toolkit& context, uint32_t prog_id); kernel_id set_kernel_source(const std::shared_ptr& kernel_string,