From ab8686258874c26e653a288a0163b477e8456bbb Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Thu, 8 Feb 2024 15:26:55 +0400
Subject: [PATCH 1/7] [GPU] Disable subtract folding for int8 compressed models for dGPU

---
 .../convert_fc_to_compressed.cpp              |  7 +++--
 .../src/plugin/transformations_pipeline.cpp   | 11 +++++++-
 .../convert_fc_to_compressed_test.cpp         | 26 +++++++++++++++++++
 3 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
index f1561c52cb495e..e8ea3c6af3df32 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations/convert_fc_to_compressed.cpp
@@ -44,7 +44,10 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
     auto convert_m = wrap_type({weights_m});
 
     auto sub_const_m = wrap_type(consumers_count(1));
-    auto subtract_m = wrap_type({convert_m, sub_const_m});
+    auto sub_convert_const_m = wrap_type({sub_const_m});
+    auto sub_with_convert_m = wrap_type({convert_m, sub_convert_const_m});
+    auto sub_no_convert_m = wrap_type({convert_m, sub_const_m});
+    auto subtract_m = std::make_shared(OutputVector{sub_with_convert_m, sub_no_convert_m});
 
     auto mul_const_m = wrap_type(consumers_count(1));
     auto mul_with_sub_m = wrap_type({subtract_m, mul_const_m});
@@ -97,7 +100,7 @@ ConvertFullyConnectedToFullyConnectedCompressed::ConvertFullyConnectedToFullyCon
         const auto& scale = reshape_const_to_2d(pattern_map.at(mul_const_m).get_node_shared_ptr());
         std::shared_ptr optional_zero_point = nullptr;
 
-        const bool with_zero_point = pattern_map.count(subtract_m) > 0;
+        const bool with_zero_point = pattern_map.count(sub_no_convert_m) > 0 || pattern_map.count(sub_with_convert_m) > 0;
         if (with_zero_point) {
             optional_zero_point = reshape_const_to_2d(pattern_map.at(sub_const_m).get_node_shared_ptr());
         }
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index ee94acfbd95345..03c5118a7a8861 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -276,7 +276,16 @@ void TransformationsPipeline::apply(std::shared_ptr func) {
             return !is_type(next_node);
         });
 
-        manager.register_pass(ov::element::TypeVector{ov::element::u8, ov::element::u4, ov::element::i4}, true);
+        // Disable subtract folding only for the dGPUs to meet the requirements of oneDNN:
+        // it expects to have the same data type for weights and zero points (apply it only for u8 data type, since other compression
+        // types are not supported by oneDNN)
+        if (device_info.supports_immad) {
+            manager.register_pass(ov::element::TypeVector{ov::element::u8}, false);
+            manager.register_pass(ov::element::TypeVector{ov::element::u4, ov::element::i4}, true);
+        } else {
+            manager.register_pass(ov::element::TypeVector{ov::element::u8, ov::element::u4, ov::element::i4}, true);
+        }
+        // Need to check if transformations work correctly for mixed models with both compression and quantization at the same time.
         if (!is_model_quantized)
             pass_config->set_callback(is_non_supported_decompression_op);
 
diff --git a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
index 1c7ebe72990ae4..70f5a45b25aef4 100644
--- a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp
@@ -401,6 +401,32 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed8) {
     }
 }
 
+TEST_F(TransformationTestsF, ConvertFCToCompressed9) {
+    {
+        auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 });
+        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
+        auto convert = std::make_shared(weights_const, ov::element::f16);
+        auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
+        auto zp_convert = std::make_shared(zp_const, ov::element::f16);
+        auto sub = std::make_shared(convert, zp_convert);
+        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
+        auto scale = std::make_shared(sub, scale_const);
+        auto fc = std::make_shared(input1, scale);
+
+        model = std::make_shared(ov::NodeVector{ fc }, ov::ParameterVector{ input1 });
+        manager.register_pass();
+    }
+    {
+        auto input1 = std::make_shared(ov::element::f16, ov::PartialShape{ -1, 16 });
+        auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 });
+        auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 });
+        auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 });
+        auto fc_compressed = std::make_shared(input1, weights_const, scale_const, zp_const);
+
+        model_ref = std::make_shared(ov::NodeVector{ fc_compressed }, ov::ParameterVector{ input1 });
+    }
+}
+
 }  // namespace intel_gpu
 }  // namespace test
 }  // namespace ov

From b56b402e36337af2960304d397ec802717095150 Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Thu, 8 Feb 2024 18:09:20 +0400
Subject: [PATCH 2/7] [GPU] Enable oneDNN primitive cache and apply cache cleanup at plugin destruction

---
 .../intel_gpu/include/intel_gpu/plugin/plugin.hpp |  1 +
 src/plugins/intel_gpu/src/plugin/plugin.cpp       |  9 +++++++++
 src/plugins/intel_gpu/thirdparty/CMakeLists.txt   |  2 +-
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
index a5ea28b95f78ba..12b640305c10b6 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
@@ -45,6 +45,7 @@ class Plugin : public ov::IPlugin {
 public:
     Plugin();
+    ~Plugin();
 
     std::shared_ptr compile_model(const std::shared_ptr& model,
                                   const ov::AnyMap& properties) const override;
 
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 8592feaea0bb94..c9cc59729a36bb 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -158,6 +158,15 @@ Plugin::Plugin() {
     m_compiled_model_runtime_properties["OV_VERSION"] = ov_version.buildNumber;
 }
 
+Plugin::~Plugin() {
+#ifdef ENABLE_ONEDNN_FOR_GPU
+    // To prevent hanging during oneDNN's primitive cache destruction,
+    // trigger earlier cache cleanup by setting its capacity to 0.
+    // Related ticket: 106154.
+ dnnl::set_primitive_cache_capacity(0); +#endif +} + std::shared_ptr Plugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& orig_config) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::compile_model"); std::string device_id = get_device_id(orig_config); diff --git a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt index b7929efacd57b6..5b08fe0c476704 100644 --- a/src/plugins/intel_gpu/thirdparty/CMakeLists.txt +++ b/src/plugins/intel_gpu/thirdparty/CMakeLists.txt @@ -112,7 +112,7 @@ if(ENABLE_ONEDNN_FOR_GPU) "-DDNNL_LIBRARY_NAME=${DNNL_GPU_LIBRARY_NAME}" "-DCMAKE_INSTALL_PREFIX=${ONEDNN_INSTALL_DIR}" "-DDNNL_ENABLE_CONCURRENT_EXEC=ON" - "-DDNNL_ENABLE_PRIMITIVE_CACHE=OFF" + "-DDNNL_ENABLE_PRIMITIVE_CACHE=ON" "-DDNNL_ENABLE_WORKLOAD=INFERENCE" "-DDNNL_ENABLE_JIT_PROFILING=${BUILD_SHARED_LIBS}" "-DDNNL_ENABLE_ITT_TASKS=${BUILD_SHARED_LIBS}" From be4adb24eb727f771f6b5f04ffda38772c94191e Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Thu, 8 Feb 2024 20:11:25 +0400 Subject: [PATCH 3/7] [GPU] Enable 8bit compression support on dGPU via oneDNN --- .../graph/graph_optimizer/compile_graph.cpp | 12 +- .../impls/onednn/fully_connected_onednn.cpp | 122 +++++++++++++++-- .../src/graph/impls/onednn/utils.cpp | 3 + .../intel_gpu/src/graph/layout_optimizer.cpp | 35 ++++- .../intel_gpu/src/graph/primitive_inst.cpp | 2 + .../test_cases/fully_connected_gpu_test.cpp | 127 ++++++++++++++++++ src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 7 files changed, 281 insertions(+), 22 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp index f5b6ec2221addb..3f0a69916f9ef2 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/compile_graph.cpp @@ -42,7 +42,17 @@ void compile_graph::run(program& p) { auto& node = *(std::next(proc_order.begin(), idx)); const bool use_shape_agnostic_impl = !p.get_config().get_property(ov::intel_gpu::use_only_static_kernels_for_dynamic_shape); const impl_types original_impl_type = node->get_preferred_impl_type(); - const bool change_initial_impl = node->is_dynamic() && original_impl_type == impl_types::onednn; + bool change_initial_impl = node->is_dynamic() && original_impl_type == impl_types::onednn; + + if (node->is_type() && change_initial_impl) { + const auto fc_prim = node->as().get_primitive(); + const auto weights_dt = node->get_input_layout(1).data_type; + + // Do not change impl (i.e. do not use ocl shape-agnostic kernels) in case of FC and 8bit compressed weights, + // since oneDNN primitives/kernels caching mechanism will be used instead. 
+ if (fc_prim->compressed_weights && ov::element::Type(weights_dt).bitwidth() == 8) + change_initial_impl = false; + } if (change_initial_impl) node->set_preferred_impl_type(impl_types::ocl); diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp index 82d785ab943029..e9a18c00253059 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/fully_connected_onednn.cpp @@ -50,6 +50,26 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { args.insert({DNNL_ARG_BIAS, bias->get_onednn_memory(_pd.weights_desc(1), offset)}); } + const auto& prim = instance.get_impl_params()->typed_desc(); + if (prim->compressed_weights) { + const auto weights_dt = instance.get_input_layout(1).data_type; + OPENVINO_ASSERT(ov::element::Type(weights_dt).bitwidth() == 8, "[GPU] oneDNN supports only 8bit compressed weights"); + + if (!prim->decompression_scale.empty()) { + auto decompression_scale_idx = prim->bias.empty() ? 2 : 3; + auto scale_mem = instance.dep_memory_ptr(decompression_scale_idx); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(scale_mem->get_layout(), dnnl::memory::format_tag::a, true); + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, scale_mem->get_onednn_memory(desc)}); + } + + if (!prim->decompression_zero_point.empty()) { + auto decompression_zp_idx = prim->bias.empty() ? 3 : 4; + auto zp_mem = instance.dep_memory_ptr(decompression_zp_idx); + dnnl::memory::desc desc = onednn::layout_to_memory_desc(zp_mem->get_layout(), dnnl::memory::format_tag::a, true); + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_WEIGHTS, zp_mem->get_onednn_memory(desc)}); + } + } + return args; } @@ -91,13 +111,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { false); } - static std::shared_ptr get_fully_connected_primitive_descriptor(const kernel_impl_params& impl_params, - cldnn::engine& engine, size_t prim_input_size, bool has_bias, - const dnnl::primitive_attr& attr = dnnl::primitive_attr()) { - auto input_layout = impl_params.get_input_layout(0); - auto weights_layout = impl_params.get_input_layout(1); - auto output_layout = impl_params.get_output_layout(); - + static void transform_layouts(layout& input_layout, layout& weights_layout, layout& output_layout, size_t prim_input_size) { auto input_pshape = input_layout.get_partial_shape(); auto weights_pshape = weights_layout.get_partial_shape(); @@ -108,7 +122,7 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } if (input_size > 3) { - input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature)); + input_layout.set_partial_shape(reshape_to_2d(input_pshape, feature)); } if (weights_pshape.size() != 2) { weights_layout.set_partial_shape(reshape_to_2d(weights_pshape, feature)); @@ -123,6 +137,19 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { combine_bf_with_first_spatial_dim(input_layout); combine_bf_with_first_spatial_dim(output_layout); } + } + + static std::shared_ptr + get_inner_product_primitive_descriptor(const kernel_impl_params& impl_params, + cldnn::engine& engine, + size_t prim_input_size, + bool has_bias, + const dnnl::primitive_attr& attr = dnnl::primitive_attr()) { + auto input_layout = impl_params.get_input_layout(0); + auto weights_layout = impl_params.get_input_layout(1); + auto output_layout = impl_params.get_output_layout(); + + transform_layouts(input_layout, weights_layout, 
output_layout, prim_input_size); auto input_md = onednn::layout_to_memory_desc(input_layout, dnnl::memory::format_tag::undef, false); auto weights_md = onednn::layout_to_memory_desc(weights_layout, dnnl::memory::format_tag::any); @@ -149,6 +176,41 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { } } + static std::shared_ptr + get_matmul_primitive_descriptor(const kernel_impl_params& impl_params, + cldnn::engine& engine, + size_t prim_input_size, + bool has_bias, + const dnnl::primitive_attr& attr = dnnl::primitive_attr()) { + auto input_layout = impl_params.get_input_layout(0); + auto weights_layout = impl_params.get_input_layout(1); + auto output_layout = impl_params.get_output_layout(); + + transform_layouts(input_layout, weights_layout, output_layout, prim_input_size); + + auto input_md = onednn::layout_to_memory_desc(input_layout, dnnl::memory::format_tag::ab, false); + auto weights_md = onednn::layout_to_memory_desc(weights_layout, dnnl::memory::format_tag::ba); + auto output_md = onednn::layout_to_memory_desc(output_layout, dnnl::memory::format_tag::ab, false); + + if (has_bias) { + auto bias_md = onednn::layout_to_memory_desc(impl_params.get_input_layout(2), dnnl::memory::format_tag::ab, false); + return std::make_shared( + engine.get_onednn_engine(), + input_md, + weights_md, + bias_md, + output_md, + attr); + } else { + return std::make_shared( + engine.get_onednn_engine(), + input_md, + weights_md, + output_md, + attr); + } + } + public: void save(BinaryOutputBuffer& ob) const override { #ifdef ONEDNN_PRIMITIVE_SERIALIZATION @@ -158,8 +220,10 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto prim = impl_params->typed_desc(); size_t input_size = prim->input_size; bool has_bias = !prim->bias.empty(); + bool is_compressed = prim->compressed_weights; ob << input_size; ob << has_bias; + ob << is_compressed; std::vector prim_cache; prim_cache = _prim.get_cache_blob(); @@ -173,12 +237,19 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { size_t input_size = 2; bool has_bias = false; + bool is_compressed = false; ib >> input_size; ib >> has_bias; + ib >> is_compressed; const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); - auto prim_desc = get_fully_connected_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs); - _pd = *prim_desc; + if (is_compressed) { + auto prim_desc = get_matmul_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs); + _pd = *prim_desc; + } else { + auto prim_desc = get_inner_product_primitive_descriptor(*impl_params, ib.get_engine(), input_size, has_bias, *_attrs); + _pd = *prim_desc; + } std::vector prim_cache; ib >> prim_cache; @@ -194,10 +265,35 @@ struct fully_connected_onednn : typed_primitive_onednn_impl { auto& config = impl_params.prog->get_config(); auto attr = arg.get_onednn_primitive_attributes(); auto prim = impl_params.typed_desc(); - auto prim_desc = get_fully_connected_primitive_descriptor(impl_params, impl_params.prog->get_engine(), - prim->input_size, !prim->bias.empty(), *attr); - return cldnn::make_unique(engine, config, attr, *prim_desc, get_weights_reorder(impl_params, *prim_desc)); + // There may be a performance difference between InnerProduct and MatMul primitives in oneDNN, + // so use MatMul only for weights compression and IP for all other cases. 
+ if (prim->compressed_weights) { + attr->set_fpmath_mode(dnnl::fpmath_mode::f16, true); + if (!prim->decompression_scale.empty()) { + auto decompression_scale_idx = !arg.bias_term() ? 2 : 3; + auto data_type = convert_data_type(arg.get_dependency(decompression_scale_idx).get_output_layout().data_type); + attr->set_scales(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, data_type); + } + + if (prim->decompression_zero_point_scalar.has_value()) { + OPENVINO_ASSERT(!prim->decompression_zero_point_scalar.has_value(), "[GPU] OneDNN can't use scalar as a zero point value\n"); + } else if (!prim->decompression_zero_point.empty()) { + auto decompression_zp_idx = !arg.bias_term() ? 3 : 4; + auto data_type = convert_data_type(arg.get_dependency(decompression_zp_idx).get_output_layout().data_type); + attr->set_zero_points(DNNL_ARG_WEIGHTS, 1 << 1, dnnl::memory::dims{}, data_type); + } + + auto prim_desc = get_matmul_primitive_descriptor(impl_params, impl_params.prog->get_engine(), + prim->input_size, !prim->bias.empty(), *attr); + + return cldnn::make_unique(engine, config, attr, *prim_desc); + } else { + auto prim_desc = get_inner_product_primitive_descriptor(impl_params, impl_params.prog->get_engine(), + prim->input_size, !prim->bias.empty(), *attr); + + return cldnn::make_unique(engine, config, attr, *prim_desc, get_weights_reorder(impl_params, *prim_desc)); + } } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp index 38e8a5097bb666..0d6c00636c8fb6 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/utils.cpp @@ -242,6 +242,9 @@ dnnl::memory::desc layout_to_memory_desc(cldnn::layout l, dnnl::memory::format_t } else if (target_fmt == dnnl::memory::format_tag::ab) { dims.push_back(l.batch()); dims.push_back(l.get_tensor().count() / l.batch()); + } else if (target_fmt == dnnl::memory::format_tag::ba) { + dims.push_back(l.feature()); + dims.push_back(l.get_tensor().count() / l.feature()); } else if (flatten) { dims = flatten_tensor(l.get_tensor()); } else { diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index 6054281a9b3658..0ce52b7e1a3d36 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -895,12 +895,25 @@ static bool is_node_for_onednn(deconvolution_node const& node) { static bool is_node_for_onednn(fully_connected_node const& node) { - if (!layout_optimizer::are_data_types_suitable_for_onednn((program_node&)node)) - return false; - auto fc_prim = node.get_primitive(); - // onednn impl doesn't support compressed weights for now - if (fc_prim->compressed_weights) + + if (fc_prim->compressed_weights) { + auto weights_dt = node.weights().get_output_layout().data_type; + if (ov::element::Type(weights_dt).bitwidth() != 8) + return false; + + if (fc_prim->decompression_zero_point_scalar.has_value()) + return false; + + if (!fc_prim->decompression_zero_point.empty()) { + auto decompression_zp_idx = fc_prim->bias.empty() ? 
3 : 4; + auto decompression_zp_dt = node.get_input_layout(decompression_zp_idx).data_type; + if (weights_dt != decompression_zp_dt) + return false; + } + } + + if (!layout_optimizer::are_data_types_suitable_for_onednn((program_node&)node)) return false; auto output_layout = node.get_output_layout(); @@ -1332,8 +1345,16 @@ bool layout_optimizer::are_data_types_suitable_for_onednn(program_node& node) { return onednn_check_data_types_for_deconvolution(in_dt, wei_dt, out_dt); } else if (node.is_type() || node.is_type()) { bool is_fc = node.is_type(); - auto wei_dt = is_fc ? node.as().weights().get_output_layout(false).data_type : - node.as().get_input_layout(1).data_type; + data_types wei_dt; + if (is_fc) { + const auto& fc_node = node.as(); + const auto fc_prim = fc_node.get_primitive(); + wei_dt = fc_node.weights().get_output_layout(false).data_type; + if (fc_prim->compressed_weights && ov::element::Type(wei_dt).bitwidth() == 8) + return true; + } else { + wei_dt = node.as().get_input_layout(1).data_type; + } return onednn_check_data_types_for_fc_gemm(in_dt, wei_dt, out_dt); } else if (node.is_type()) { auto input_fmt = node.get_input_layout(0).format; diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index 8445d91c271bf8..7460a9599ab0d9 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -1676,6 +1676,8 @@ event::ptr primitive_inst::update_weights() { // incorrect memory buffer may be assigned, so reset cached weights for such case _reordered_weights_cache.add(original_layout, original_weights_memory); _impl_params->weights_layout = optional_layout(original_layout); + GPU_DEBUG_TRACE_DETAIL << id() << ": add original weights memory " << original_layout.to_short_string() << " to weights cache; " + << "cache_size=" << _reordered_weights_cache.size() << "/" << _reordered_weights_cache.capacity() << std::endl; } else { auto expected_layout = reorder_kernel_params->get_output_layout(); // Set original partial shape, because it may be lost during kernel_selector::weights_tensor -> layout conversion diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp index a1d517554d9c28..80026e38372600 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp @@ -1200,6 +1200,109 @@ class fully_connected_gpu_tests: public ::testing::Test { } } + void test_compressed_int8_scale(bool is_caching_test, bool is_dynamic, int64_t batch_num, bool use_bias = false, bool use_zp = false, bool is_3d = false) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + int64_t ifm_num = 33; + int64_t ofm_num = 65; + + auto in_shape = is_3d ? ov::PartialShape({batch_num, 1, ifm_num}) : ov::PartialShape({batch_num, ifm_num}); + auto bias_shape = is_3d ? 
ov::PartialShape({1, 1, ofm_num}) : ov::PartialShape({1, ofm_num}); + auto input_mem = engine.allocate_memory({ in_shape, data_types::f16, format::bfyx }); + auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u8, format::bfyx }); + auto bias_mem = engine.allocate_memory({ bias_shape, data_types::f16, format::bfyx }); + auto scale_mem = engine.allocate_memory({ {ofm_num, 1}, data_types::f16, format::bfyx }); + auto zp_mem = engine.allocate_memory({ {ofm_num, 1}, data_types::u8, format::bfyx }); + + auto input_data = rg.generate_random_1d(batch_num * ifm_num, -1.0f, 1.0f); + set_values(input_mem, input_data); + + auto weigths_data = rg.generate_random_1d(ofm_num * ifm_num, 0, 10); + set_values(weights_mem, weigths_data); + + auto bias_data = rg.generate_random_1d(ofm_num, -2.0f, 2.0f);; + set_values(bias_mem, bias_data); + + auto scale_data = rg.generate_random_1d(ofm_num, -1.0f, 1.0f); + set_values(scale_mem, scale_data); + + auto zp_data = rg.generate_random_1d(ofm_num, 0, 4); + set_values(zp_mem, zp_data); + + auto in_partial_shape = is_3d ? ov::PartialShape({-1, -1, ifm_num}) : ov::PartialShape({-1, ifm_num}); + auto in_layout = is_dynamic ? layout{ in_partial_shape, data_types::f16, format::bfyx } + : layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx }; + + auto bias_id = use_bias ? "bias" : ""; + auto zp_id = use_zp ? "zp" : ""; + + auto fc_prim = fully_connected("fc_prim", input_info("input"), + "weights", bias_id, + "scale", zp_id, + data_types::f16, + padding(), + in_shape.size(), 2); + + auto get_ref_results = [&]() { + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + + topology topology( + input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("zp", zp_mem), + data("bias", bias_mem), + fc_prim + ); + + network network(engine, topology, config); + network.set_input_data("input", input_mem); + + auto outputs = network.execute(); + OPENVINO_ASSERT(outputs.size() == 1); + OPENVINO_ASSERT(outputs.begin()->first == "fc_prim"); + + auto output_layout = outputs.begin()->second.get_layout(); + auto output_mem = outputs.begin()->second.get_memory(); + + return engine.reinterpret_buffer(*output_mem, output_layout); + }; + + topology topology( + input_layout("input", in_layout), + data("weights", weights_mem), + data("scale", scale_mem), + data("zp", zp_mem), + data("bias", bias_mem), + fc_prim + ); + + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network::ptr network = get_network(engine, topology, config, get_test_stream_ptr(), is_caching_test); + + network->set_input_data("input", input_mem); + + auto outputs = network->execute(); + ASSERT_EQ(outputs.size(), size_t(1)); + ASSERT_EQ(outputs.begin()->first, "fc_prim"); + + auto output_mem = outputs.begin()->second.get_memory(); + cldnn::mem_lock output_ptr (output_mem, get_test_stream()); + + auto ref_output_mem = get_ref_results(); + cldnn::mem_lock output_ptr_ref (ref_output_mem, get_test_stream()); + + const float threshold_fp16 = 1e-1; + for (size_t i = 0; i < output_ptr_ref.size(); i++) { + ASSERT_NEAR(output_ptr_ref[i], output_ptr[i], threshold_fp16) << "i = " << i; + } + } + void test_compressed_scale_bias(bool is_caching_test) { auto& engine = get_test_engine(); @@ -2764,6 +2867,30 @@ TEST_F(fully_connected_gpu_tests, compressed_scale_fp16_cached) { 
this->test_compressed_scale_fp16(false); } +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_b1) { + this->test_compressed_int8_scale(false, true, 1, false, false); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_b1_bias) { + this->test_compressed_int8_scale(false, true, 1, true, false); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_b1_bias_zp_3d) { + this->test_compressed_int8_scale(false, true, 1, true, true, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_zp_b1) { + this->test_compressed_int8_scale(false, true, 1, false, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_zp_b13) { + this->test_compressed_int8_scale(false, true, 13, false, true); +} + +TEST_F(fully_connected_gpu_tests, compressed_int8_scale_zp_b12_3d) { + this->test_compressed_int8_scale(false, true, 12, false, true, true); +} + TEST_F(fully_connected_gpu_tests, dynamic) { this->test_dynamic(false); } diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 494af5f9921bda..74ef417285129a 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 494af5f9921bdae98f1a0e2955fa7d76ff386c4f +Subproject commit 74ef417285129aa8ed0d2fd6fa4745ec3f22e96c From d8b3d3a6de3f6025873e36d0f70ccae0d64edcd3 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Fri, 9 Feb 2024 11:02:58 +0400 Subject: [PATCH 4/7] Update oneDNN version --- src/plugins/intel_gpu/thirdparty/onednn_gpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 74ef417285129a..7b21ce5885500c 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 74ef417285129aa8ed0d2fd6fa4745ec3f22e96c +Subproject commit 7b21ce5885500c3f14cdba6e7ffa57561a9b8a57 From 451817341c61d4ca36685006747c96b6178c4dd4 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Fri, 9 Feb 2024 14:20:02 +0400 Subject: [PATCH 5/7] Restore cache capacity after cleanup --- src/plugins/intel_gpu/src/plugin/plugin.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp index c9cc59729a36bb..0ee5f7ee02d6d2 100644 --- a/src/plugins/intel_gpu/src/plugin/plugin.cpp +++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp @@ -164,6 +164,12 @@ Plugin::~Plugin() { // trigger earlier cache cleanup by setting its capacity to 0. // Related ticket: 106154. dnnl::set_primitive_cache_capacity(0); + + // In case of multiple ov::Core instances (and multiple GPU plugins) we need to restore original + // cache capacity to prevent working with zero-capacity cache in other GPU Plugin instances, since + // cache is shared between all of GPU Plugin instances and cache clean up affects all of them. 
+    const int default_cache_capacity = 1024;
+    dnnl::set_primitive_cache_capacity(default_cache_capacity);
 #endif
 }
 
From 1b0fc3396942b5a52583f008518ef196f789ee8f Mon Sep 17 00:00:00 2001
From: Sergey Shlyapnikov
Date: Wed, 20 Mar 2024 09:54:06 +0400
Subject: [PATCH 6/7] Update oneDNN submodule and remove cache cleanup WA

---
 .../intel_gpu/include/intel_gpu/plugin/plugin.hpp |  1 -
 src/plugins/intel_gpu/src/plugin/plugin.cpp       | 15 ---------------
 src/plugins/intel_gpu/thirdparty/onednn_gpu       |  2 +-
 3 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
index 12b640305c10b6..a5ea28b95f78ba 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
@@ -45,7 +45,6 @@ class Plugin : public ov::IPlugin {
 public:
     Plugin();
-    ~Plugin();
 
     std::shared_ptr compile_model(const std::shared_ptr& model,
                                   const ov::AnyMap& properties) const override;
 
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 0ee5f7ee02d6d2..8592feaea0bb94 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -158,21 +158,6 @@ Plugin::Plugin() {
     m_compiled_model_runtime_properties["OV_VERSION"] = ov_version.buildNumber;
 }
 
-Plugin::~Plugin() {
-#ifdef ENABLE_ONEDNN_FOR_GPU
-    // To prevent hanging during oneDNN's primitive cache destruction,
-    // trigger earlier cache cleanup by setting its capacity to 0.
-    // Related ticket: 106154.
-    dnnl::set_primitive_cache_capacity(0);
-
-    // In case of multiple ov::Core instances (and multiple GPU plugins) we need to restore original
-    // cache capacity to prevent working with zero-capacity cache in other GPU Plugin instances, since
-    // cache is shared between all of GPU Plugin instances and cache clean up affects all of them.
- const int default_cache_capacity = 1024; - dnnl::set_primitive_cache_capacity(default_cache_capacity); -#endif -} - std::shared_ptr Plugin::compile_model(const std::shared_ptr& model, const ov::AnyMap& orig_config) const { OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "Plugin::compile_model"); std::string device_id = get_device_id(orig_config); diff --git a/src/plugins/intel_gpu/thirdparty/onednn_gpu b/src/plugins/intel_gpu/thirdparty/onednn_gpu index 7b21ce5885500c..26c5598cccbc14 160000 --- a/src/plugins/intel_gpu/thirdparty/onednn_gpu +++ b/src/plugins/intel_gpu/thirdparty/onednn_gpu @@ -1 +1 @@ -Subproject commit 7b21ce5885500c3f14cdba6e7ffa57561a9b8a57 +Subproject commit 26c5598cccbc144ff49255a0b44f00cb9b19e6f3 From 9913c487c0448474f826582e4dd193a61f1e80c0 Mon Sep 17 00:00:00 2001 From: Sergey Shlyapnikov Date: Wed, 20 Mar 2024 10:49:47 +0400 Subject: [PATCH 7/7] Fix tests compilation issue --- .../unit/transformations/convert_fc_to_compressed_test.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp index 70f5a45b25aef4..12398c8221f4b7 100644 --- a/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/transformations/convert_fc_to_compressed_test.cpp @@ -411,7 +411,8 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) { auto sub = std::make_shared(convert, zp_convert); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); auto scale = std::make_shared(sub, scale_const); - auto fc = std::make_shared(input1, scale); + auto no_bias = std::make_shared(); + auto fc = std::make_shared(input1, scale, no_bias); model = std::make_shared(ov::NodeVector{ fc }, ov::ParameterVector{ input1 }); manager.register_pass(); @@ -421,7 +422,8 @@ TEST_F(TransformationTestsF, ConvertFCToCompressed9) { auto weights_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 16 }, { 1 }); auto scale_const = ov::op::v0::Constant::create(ov::element::f16, ov::Shape{ 32, 1 }, { 1 }); auto zp_const = ov::op::v0::Constant::create(ov::element::u8, ov::Shape{ 32, 1 }, { 1 }); - auto fc_compressed = std::make_shared(input1, weights_const, scale_const, zp_const); + auto no_bias = std::make_shared(); + auto fc_compressed = std::make_shared(input1, weights_const, no_bias, scale_const, zp_const); model_ref = std::make_shared(ov::NodeVector{ fc_compressed }, ov::ParameterVector{ input1 }); }