From d6088e9f9832bf9e272994a939d1bf966944ed14 Mon Sep 17 00:00:00 2001
From: Taylor Yeonbok Lee
Date: Wed, 18 May 2022 11:19:13 +0900
Subject: [PATCH] Fix func test (#68)

---
 .../intel_gpu/plugin/infer_request_legacy.hpp |  2 +
 .../src/plugin/infer_request_legacy.cpp       | 61 +++++++++++++------
 2 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp
index e9b31bda761016..50c282ea91f1f1 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/infer_request_legacy.hpp
@@ -103,6 +103,8 @@ class InferRequestLegacy : public InferenceEngine::IInferRequestInternal {
     void allocate_inputs_dynamic();
     void allocate_outputs_dynamic();
 
+    InferenceEngine::Blob::Ptr reinterpret_device_blob(InferenceEngine::Blob::Ptr data, const InferenceEngine::TensorDesc& new_desc);
+
     std::map<cldnn::primitive_id, cldnn::network_output> internal_outputs;
     std::vector<std::map<cldnn::primitive_id, cldnn::network_output>> internal_outputs_dynamic;
 };
diff --git a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp
index 8c42ba663ad3c3..249589ec644ec8 100644
--- a/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp
+++ b/src/plugins/intel_gpu/src/plugin/infer_request_legacy.cpp
@@ -47,7 +47,7 @@ void copyToFloat(float* dst, const InferenceEngine::Blob* src) {
         dst[i] = srcPtr[i];
 }
 
-template <typename T>
+template <typename src_dt, typename dst_dt>
 void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::intel_gpu::buf_info* bi, cldnn::stream& stream) {
     size_t n = (bi == nullptr) ? dst->size() : bi->buf_size;
     size_t offset = (bi == nullptr) ? 0 : bi->buf_offset;
@@ -56,12 +56,12 @@ void copyResultToOutputBlob(cldnn::memory::ptr src, Blob::Ptr dst, ov::runtime::
     auto size = layout.get_tensor();
 
     auto locked_dst = dst->buffer();
-    auto dst_ptr = locked_dst.as<T*>();
+    auto dst_ptr = locked_dst.as<dst_dt*>();
     if (dst_ptr == nullptr) {
         IE_THROW() << "Invalid output blob";
     }
-    cldnn::mem_lock<T> src_lock{ src, stream };
-    T* src_ptr = src_lock.data();
+    cldnn::mem_lock<src_dt> src_lock{ src, stream };
+    src_dt* src_ptr = src_lock.data();
     dst_ptr += offset;
 
     if (layout.data_padding) {
@@ -898,12 +898,17 @@ void InferRequestLegacy::copy_output_data(cldnn::memory::ptr src, Blob::Ptr dst,
     OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "InferRequestLegacy::copy_output_data");
     auto& stream = m_graph->GetNetwork()->get_stream();
     switch (dst->getTensorDesc().getPrecision()) {
-    case Precision::FP32: copyResultToOutputBlob<float>(src, dst, bi, stream); break;
-    case Precision::FP16: copyResultToOutputBlob<uint16_t>(src, dst, bi, stream); break;
-    case Precision::I32: copyResultToOutputBlob<int32_t>(src, dst, bi, stream); break;
-    case Precision::I64: copyResultToOutputBlob<int64_t>(src, dst, bi, stream); break;
-    case Precision::U8: copyResultToOutputBlob<uint8_t>(src, dst, bi, stream); break;
-    case Precision::I8: copyResultToOutputBlob<int8_t>(src, dst, bi, stream); break;
+    case Precision::FP64: copyResultToOutputBlob<float, double>(src, dst, bi, stream); break;
+    case Precision::FP32: copyResultToOutputBlob<float, float>(src, dst, bi, stream); break;
+    case Precision::FP16: copyResultToOutputBlob<uint16_t, uint16_t>(src, dst, bi, stream); break;
+    case Precision::I64: copyResultToOutputBlob<int64_t, int64_t>(src, dst, bi, stream); break;
+    case Precision::I32: copyResultToOutputBlob<int32_t, int32_t>(src, dst, bi, stream); break;
+    case Precision::I16: copyResultToOutputBlob<float, int16_t>(src, dst, bi, stream); break;
+    case Precision::I8: copyResultToOutputBlob<int8_t, int8_t>(src, dst, bi, stream); break;
+    case Precision::U16: copyResultToOutputBlob<float, uint16_t>(src, dst, bi, stream); break;
+    case Precision::U32: copyResultToOutputBlob<int32_t, uint32_t>(src, dst, bi, stream); break;
+    case Precision::U64: copyResultToOutputBlob<int32_t, uint64_t>(src, dst, bi, stream); break;
+    case Precision::U8: copyResultToOutputBlob<uint8_t, uint8_t>(src, dst, bi, stream); break;
     default: IE_THROW(NotImplemented) << "The plugin does not support output " << dst->getTensorDesc().getPrecision() << " precision";
     }
 }
@@ -1047,22 +1052,39 @@ void InferRequestLegacy::allocate_outputs() {
         // while ExecutableNetwork contains proper ones. Thus replace dims with once from exec network
         // Can be removed once 76176 is resolved.
         desc.setDims(m_graph->GetOutputSize(no.first));
-
         GPU_DEBUG_GET_INSTANCE(debug_config);
         GPU_DEBUG_IF(debug_config->verbose >= 2) {
             GPU_DEBUG_COUT << "[" << no.first << ": output blob]" << std::endl;
         }
 
         outputsMap[no.first] = outputID;
-        if (m_graph->GetEngine()->use_unified_shared_memory()) {
-            // For USM case we create host blob using custom USM host allocator
-            // and then create shared device blob on top of this buffer
-            auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
+
+
+        if (desc.getPrecision() == Precision::I16 || desc.getPrecision() == Precision::U16 ||
+            desc.getPrecision() == Precision::U32 || desc.getPrecision() == Precision::U64 ||
+            desc.getPrecision() == Precision::FP64) {
+            TensorDesc device_blob_desc = desc;
+
+            if (desc.getPrecision() == Precision::U32 || desc.getPrecision() == Precision::U64)
+                device_blob_desc.setPrecision(Precision::I32);
+            else
+                device_blob_desc.setPrecision(Precision::FP32);
+
+            auto host_blob = create_host_blob(desc);
             _outputs[no.first] = host_blob;
-            _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
+            auto device_blob = create_device_blob(device_blob_desc, output_layout);
+            _deviceOutputs[no.first] = device_blob;
         } else {
-            _outputs[no.first] = create_host_blob(desc);
-            _deviceOutputs[no.first] = create_device_blob(desc, output_layout);
+            if (m_graph->GetEngine()->use_unified_shared_memory()) {
+                // For USM case we create host blob using custom USM host allocator
+                // and then create shared device blob on top of this buffer
+                auto host_blob = create_host_blob(desc, std::make_shared<USMHostAllocator>(m_graph->GetContext().get()));
+                _outputs[no.first] = host_blob;
+                _deviceOutputs[no.first] = create_shared_device_blob(desc, output_layout, host_blob->buffer().as<void*>());
+            } else {
+                _outputs[no.first] = create_host_blob(desc);
+                _deviceOutputs[no.first] = create_device_blob(desc, output_layout);
+            }
         }
     }
 }
@@ -1115,6 +1137,7 @@ void InferRequestLegacy::prepare_input(const cldnn::primitive_id& inputName, Blo
     bool is_dev_input = remote_ptr != nullptr;
 
     switch (prec) {
+    case Precision::FP64:
     case Precision::FP32:
     case Precision::FP16:
     case Precision::I8:
@@ -1123,6 +1146,8 @@ void InferRequestLegacy::prepare_input(const cldnn::primitive_id& inputName, Blo
     case Precision::I16:
     case Precision::U16:
     case Precision::I32:
+    case Precision::U32:
+    case Precision::U64:
     case Precision::I64: {
         auto impl = getBlobImpl(is_dev_input ? remote_ptr :
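
A note on the conversion pattern (separate from the diff above): the copy helper
gains two type parameters because, for output precisions the GPU plugin does not
allocate natively (FP64, I16, U16, U32, U64), the device blob is created as FP32
or I32 and every element is converted while being copied into the host blob of
the requested precision. Below is a minimal standalone sketch of that
element-wise converting copy; convert_copy and the std::vector buffers are
illustrative stand-ins for copyResultToOutputBlob and the locked cldnn::memory
and Blob pointers, not plugin API.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for copyResultToOutputBlob<src_dt, dst_dt>: the device
// result arrives as src_dt (e.g. int32_t for a U32 output) and each element is
// cast into dst_dt, the precision the user actually requested.
template <typename src_dt, typename dst_dt>
void convert_copy(const std::vector<src_dt>& src, std::vector<dst_dt>& dst) {
    dst.resize(src.size());
    for (std::size_t i = 0; i < src.size(); ++i)
        dst[i] = static_cast<dst_dt>(src[i]);
}

int main() {
    std::vector<int32_t> device_result{1, 2, 3};  // device-side I32 storage
    std::vector<uint32_t> host_output;            // user-visible U32 output blob
    convert_copy(device_result, host_output);     // mirrors the <int32_t, uint32_t> case
    for (auto v : host_output)
        std::cout << v << ' ';                    // prints: 1 2 3
    return 0;
}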