From 6664654699548ed382758045adbc541b0d98e923 Mon Sep 17 00:00:00 2001 From: Alexey Suhov Date: Mon, 21 Jan 2019 21:31:31 +0300 Subject: [PATCH] Publishing R5 content (#72) * Publishing R5 content * Updated ade revision * updated readme * add possibility to build CPU plugin with Intel MKL package --- .../src/gna_plugin/CMakeLists.txt | 86 +- inference-engine/src/gna_plugin/dnn.cpp | 2528 ++++++++++++++++ inference-engine/src/gna_plugin/dnn.h | 823 ++++++ .../src/gna_plugin/dnn_memory.cpp | 30 + .../src/gna_plugin/dnn_memory.hpp | 13 + .../src/gna_plugin/dnn_traits.hpp | 90 + inference-engine/src/gna_plugin/floatmath.cpp | 423 +++ inference-engine/src/gna_plugin/floatmath.h | 71 + .../src/gna_plugin/gna_allocator.hpp | 33 + .../src/gna_plugin/gna_api_wrapper.hpp | 79 +- .../src/gna_plugin/gna_device.cpp | 342 +-- .../src/gna_plugin/gna_device.hpp | 128 +- .../src/gna_plugin/gna_executable_network.hpp | 42 +- .../src/gna_plugin/gna_helper.cpp | 185 +- .../src/gna_plugin/gna_infer_request.hpp | 20 +- .../src/gna_plugin/gna_layer_info.hpp | 206 ++ .../src/gna_plugin/gna_mem_requests.hpp | 175 ++ .../src/gna_plugin/gna_memory.hpp | 227 ++ .../src/gna_plugin/gna_memory_state.hpp | 25 + .../src/gna_plugin/gna_model_serial.cpp | 396 +-- .../src/gna_plugin/gna_model_serial.hpp | 77 +- .../src/gna_plugin/gna_plugin.cpp | 2592 ++++++++++++----- .../src/gna_plugin/gna_plugin.hpp | 503 +++- .../src/gna_plugin/gna_plugin_config.hpp | 83 +- .../gna_plugin/gna_plugin_entry_points.cpp | 16 +- .../src/gna_plugin/gna_plugin_internal.hpp | 72 +- .../src/gna_plugin/gna_plugin_log.hpp | 30 +- .../src/gna_plugin/gna_plugin_passes.cpp | 338 +++ inference-engine/src/gna_plugin/lstm.cpp | 69 + inference-engine/src/gna_plugin/lstm.hpp | 209 ++ .../src/gna_plugin/polymorh_allocator.hpp | 68 + inference-engine/src/gna_plugin/pwl.h | 70 + .../src/gna_plugin/pwl_design.cpp | 681 +++++ .../quantization/layer_quantizer.hpp | 488 ++++ .../quantization/model_quantizer.hpp | 78 + .../gna_plugin/quantization/precision_ex.hpp | 95 + .../gna_plugin/quantization/quantization.cpp | 699 +++++ .../gna_plugin/quantization/quantization.h | 100 + .../quantization/quantized_layer_params.hpp | 24 + .../quantization/scale_factor_calc.hpp | 339 +++ inference-engine/src/gna_plugin/util.cpp | 46 + inference-engine/src/gna_plugin/util.h | 9 + 42 files changed, 10552 insertions(+), 2056 deletions(-) create mode 100644 inference-engine/src/gna_plugin/dnn.cpp create mode 100644 inference-engine/src/gna_plugin/dnn.h create mode 100644 inference-engine/src/gna_plugin/dnn_memory.cpp create mode 100644 inference-engine/src/gna_plugin/dnn_memory.hpp create mode 100644 inference-engine/src/gna_plugin/dnn_traits.hpp create mode 100644 inference-engine/src/gna_plugin/floatmath.cpp create mode 100644 inference-engine/src/gna_plugin/floatmath.h create mode 100644 inference-engine/src/gna_plugin/gna_allocator.hpp create mode 100644 inference-engine/src/gna_plugin/gna_layer_info.hpp create mode 100644 inference-engine/src/gna_plugin/gna_mem_requests.hpp create mode 100644 inference-engine/src/gna_plugin/gna_memory.hpp create mode 100644 inference-engine/src/gna_plugin/gna_memory_state.hpp create mode 100644 inference-engine/src/gna_plugin/gna_plugin_passes.cpp create mode 100644 inference-engine/src/gna_plugin/lstm.cpp create mode 100644 inference-engine/src/gna_plugin/lstm.hpp create mode 100644 inference-engine/src/gna_plugin/polymorh_allocator.hpp create mode 100644 inference-engine/src/gna_plugin/pwl.h create mode 100644 
inference-engine/src/gna_plugin/pwl_design.cpp create mode 100644 inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/model_quantizer.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/precision_ex.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/quantization.cpp create mode 100644 inference-engine/src/gna_plugin/quantization/quantization.h create mode 100644 inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp create mode 100644 inference-engine/src/gna_plugin/util.cpp create mode 100644 inference-engine/src/gna_plugin/util.h diff --git a/inference-engine/src/gna_plugin/CMakeLists.txt b/inference-engine/src/gna_plugin/CMakeLists.txt index aa7045813e8f4e..f6a25b61844784 100644 --- a/inference-engine/src/gna_plugin/CMakeLists.txt +++ b/inference-engine/src/gna_plugin/CMakeLists.txt @@ -1,66 +1,60 @@ -# Copyright (C) 2018-2020 Intel Corporation +# Copyright (C) 2018 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +# set(TARGET_NAME "GNAPlugin") -if(ENABLE_LTO) - ie_enable_lto() -endif() - file(GLOB_RECURSE SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.h - ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ) -addVersionDefines(gna_plugin_entry_points.cpp CI_BUILD_NUMBER) +add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN) find_package(libGNA) +include_directories(${libGNA_INCLUDE_DIRS}) -ie_add_plugin(NAME ${TARGET_NAME} - DEVICE_NAME "GNA" - SOURCES ${SOURCES} ${HEADERS}) +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/src/inference_engine + ${CMAKE_CURRENT_SOURCE_DIR} + ${libGNA_INCLUDE_DIRS} +) -if(GNA_LIBRARY_VERSION STREQUAL "GNA2") - SET(GNA_LIBRARY_VERSION_NUMBER 2) -else() - SET(GNA_LIBRARY_VERSION_NUMBER 1) -endif() +add_definitions(-D_NO_MKL_) +add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS}) -#saving rpath to GNA shared library be used by CI -log_rpath_from_dir(GNA ${libGNA_LIBRARIES_BASE_PATH}) +if (LINUX) + find_package(Threads) +endif () -target_link_libraries(${TARGET_NAME} PRIVATE inference_engine inference_engine_lp_transformations ${INTEL_ITT_LIBS} Threads::Threads libGNA) -target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_compile_definitions(${TARGET_NAME} - PRIVATE - _NO_MKL_ - PUBLIC - GNA_LIB_VER=${GNA_LIBRARY_VERSION_NUMBER}) +set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) +#saving rpath to GNA shared library be used by CI +log_rpath_remove_top(GNA FALSE "/gna${libGNA_LIBRARY}" TRUE) + +target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} ${libGNA_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) -add_library(${TARGET_NAME}_test_static STATIC ${SOURCES} ${HEADERS}) -target_compile_definitions(${TARGET_NAME}_test_static - PRIVATE - _NO_MKL_ - IMPLEMENT_INFERENCE_ENGINE_PLUGIN - PUBLIC - GNA_LIB_VER=${GNA_LIBRARY_VERSION_NUMBER} - INTEGER_LOW_P - USE_STATIC_IE) -target_link_libraries(${TARGET_NAME}_test_static PUBLIC inference_engine_preproc_s inference_engine_lp_transformations libGNA::API) -target_include_directories(${TARGET_NAME}_test_static PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static) -if(WIN32) - # Correct 'jnl' macro/jit 
issue - target_compile_options(${TARGET_NAME} PRIVATE $<$:/bigobj> ) - target_compile_options(${TARGET_NAME}_test_static PRIVATE $<$:/bigobj> ) -endif() +set(TEST_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/gna_plugin.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gna_plugin_passes.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/quantization/quantization.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/dnn.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gna_device.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/pwl_design.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/floatmath.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/dnn_memory.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/util.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gna_model_serial.cpp") -# install +add_library(${TARGET_NAME}_test_static STATIC ${TEST_SOURCES} ${HEADERS}) +target_compile_definitions(${TARGET_NAME}_test_static + PUBLIC -DINTEGER_LOW_P + -DUSE_STATIC_IE) -install(FILES "${GNA_KERNEL_LIBRARY}" - DESTINATION ${IE_CPACK_IE_DIR}/external/gna/lib - COMPONENT gna) +set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static) diff --git a/inference-engine/src/gna_plugin/dnn.cpp b/inference-engine/src/gna_plugin/dnn.cpp new file mode 100644 index 00000000000000..8c94f720a1e5bb --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn.cpp @@ -0,0 +1,2528 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +extern bool global_debug; + +#include +#include +#include +#include +#include
+#include +#include + +#ifndef _NO_MKL_ +#include +#endif +#include "dnn.h" +#ifdef INTEGER_REF +#include "convnet.h" +#include "igemv16.h" +#include "igemv8.h" +#include "sgemm.h" +#else +#include "floatmath.h" +#endif +#include "pwl.h" +#include "util.h" +#include "gna_plugin_log.hpp" + +#ifdef WIN32 +# define rand_r(X) rand() +#endif + +/** + * whether to dump weights and biases + */ +#define DUMP_WB +/** + * in light mode only layer names are dumped + * @param filename + * @param number_type + * @return + */ +#define LIGHT_DUMP + +static int & getDumpFolderId() { + static int N = 0; + return N; +} + +static std::string getDumpFolderNameGNA() { + return std::string("./gna_layers/")+std::to_string(getDumpFolderId() - 1)+"/"; +} + +static std::string getDumpFolderName() { + return std::string("./layers/")+std::to_string(getDumpFolderId() - 1)+"/"; +} + +static std::string getRefFolderName() { + return std::string("./ref_layers/")+std::to_string(getDumpFolderId() - 1)+"/"; +} + +void AmIntelDnn::BeginNewWrite() { + getDumpFolderId()++; +} + + +void AmIntelDnn::Init(void *ptr_memory, + uint32_t num_memory_bytes, + intel_dnn_number_type_t number_type, + float scale_factor) { + ptr_dnn_memory_ = ptr_memory; + num_bytes_dnn_memory_ = num_memory_bytes; + number_type_ = number_type; + input_scale_factor_ = scale_factor; + + ptr_active_outputs_ = nullptr; + num_active_outputs_ = 0; + num_left_context = 0; + num_right_context = 0; + do_rotate_input = false; + softmax_type = kSoftmaxNone; + ptr_sumgroup_sizes = nullptr; + num_sumgroup_sizes = 0; + ptr_priors = nullptr; + + + // component.clear(); +} + +void AmIntelDnn::InitActiveList(uint32_t *ptr_active_list) { + ptr_active_outputs_ = ptr_active_list; + if (ptr_active_list == nullptr) { + if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) { + num_active_outputs_ = component[component.size() - 1].num_rows_out; + } else { + num_active_outputs_ = component[component.size() - 1].num_columns_out; + } + } else { + num_active_outputs_ = 0; + } +} + +void AmIntelDnn::AddComponents(uint32_t num_components_to_add) { + component.resize(component.size() + num_components_to_add); + for (uint32_t i = 0; i < num_components_to_add; i++) { + ClearComponent(component.size() - i - 1); + } +} + +void AmIntelDnn::ClearComponent(uint32_t component_index) { + if (component_index > component.size() - 1) { + fprintf(stderr, "Error: attempt to clear non-existent component!\n"); + throw -1; + } + component[component_index].num_rows_in = 0; + component[component_index].num_columns_in = 0; + component[component_index].num_rows_out = 0; + component[component_index].num_columns_out = 0; + component[component_index].num_bytes_per_input = 0; + component[component_index].num_bytes_per_output = 0; + component[component_index].operation = kDnnNullOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnUnknownOrientation; + component[component_index].orientation_out = kDnnUnknownOrientation; + component[component_index].ptr_inputs = nullptr; + component[component_index].ptr_outputs = nullptr; + memset(&component[component_index].op, 0, sizeof(component[component_index].op)); +} + +void AmIntelDnn::ClearState() { + // To support recurrent networks, provide mechanism to clear persistent state + // (e.g., between utterances for speech recognition). For recurrent component, + // this means clearing the feedback buffer. 
For other components, just clear the + // output buffer since any feedback will come from some component's output. + for (uint32_t i = 0; i < component.size(); i++) { + if (component[i].operation == kDnnRecurrentOp) { + memset(component[i].op.recurrent.ptr_feedbacks, + 0, + component[i].op.recurrent.num_vector_delay * component[i].num_columns_out + * component[i].num_bytes_per_input); + } else { + memset(component[i].ptr_outputs, + 0, + component[i].num_bytes_per_output * component[i].num_rows_out * component[i].num_columns_out); + } + } +} + +void AmIntelDnn::InitAffineComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + void *&ptr_weights, + void *&ptr_biases, + bool isDiag, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = isDiag ? kDnnDiagonalOp : kDnnAffineOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = kDnnInterleavedOrientation; + comp.orientation_out = kDnnInterleavedOrientation; + comp.op.affine.num_bytes_per_weight = num_bytes_per_weight; + comp.op.affine.num_bytes_per_bias = num_bytes_per_bias; + comp.op.affine.weight_scale_factor = weight_scale_factor; + comp.output_scale_factor = output_scale_factor; + if (!postInitMem) { + comp.op.affine.ptr_weights = ptr_weights; + comp.op.affine.ptr_biases = ptr_biases; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_weights = &comp.op.affine.ptr_weights; + ptr_biases = &comp.op.affine.ptr_biases; + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } +} + +void AmIntelDnn::InitDiagonalComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases) { + component[component_index].num_rows_in = num_rows_in; + component[component_index].num_columns_in = num_columns; + component[component_index].num_rows_out = num_rows_out; + component[component_index].num_columns_out = num_columns; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnDiagonalOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnInterleavedOrientation; + component[component_index].orientation_out = kDnnInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].op.affine.num_bytes_per_weight = num_bytes_per_weight; + component[component_index].op.affine.num_bytes_per_bias = num_bytes_per_bias; + component[component_index].op.affine.weight_scale_factor = weight_scale_factor; + component[component_index].output_scale_factor = output_scale_factor; + 
component[component_index].op.affine.ptr_weights = ptr_weights; + component[component_index].op.affine.ptr_biases = ptr_biases; +} + +void AmIntelDnn::InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + void *&ptr_filters, + void *&ptr_biases, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns_in; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns_out; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnConvolutional1dOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = kDnnNonInterleavedOrientation; + comp.orientation_out = kDnnNonInterleavedOrientation; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + comp.op.conv1D.num_bytes_per_weight = num_bytes_per_weight; + comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias; + comp.op.conv1D.num_filters = num_filters; + comp.op.conv1D.num_filter_rows = num_filter_rows; + comp.op.conv1D.num_filter_coefficients = num_filter_coefficients; + comp.op.conv1D.num_feature_maps = num_feature_maps; + comp.op.conv1D.num_feature_map_rows = num_feature_map_rows; + comp.op.conv1D.num_feature_map_columns = num_feature_map_columns; + comp.op.conv1D.weight_scale_factor = weight_scale_factor; + comp.output_scale_factor = output_scale_factor; + + if (!postInitMem) { + comp.op.conv1D.ptr_filters = ptr_filters; + comp.op.conv1D.ptr_biases = ptr_biases; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_filters = &comp.op.conv1D.ptr_filters; + ptr_biases = &comp.op.conv1D.ptr_biases; + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } +} + +void AmIntelDnn::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns_in; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns_out; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnMaxPoolOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = kDnnNonInterleavedOrientation; + comp.orientation_out = kDnnNonInterleavedOrientation; + comp.op.maxpool.num_inputs = num_pool_size; + comp.op.maxpool.num_inputs_step = num_pool_step; + comp.op.maxpool.num_inputs_stride = num_pool_stride; + comp.op.maxpool.do_sum_not_max = do_sum_not_max; + comp.output_scale_factor = output_scale_factor; + + if (!postInitMem) { + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } 
+} + +void AmIntelDnn::InitCopyComponentPrivate(intel_dnn_component_t &comp, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns_in; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns_out; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnCopyOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = orientation; + comp.orientation_out = orientation; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + comp.output_scale_factor = output_scale_factor; + comp.op.copy.num_copy_rows = num_copy_rows; + comp.op.copy.num_copy_columns = num_copy_columns; + + if (!postInitMem) { + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } +} + +void AmIntelDnn::InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &comp, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + intel_pwl_segment_t *ptr_segments, + bool postInitMem) { + comp.num_rows_in = num_rows; + comp.num_columns_in = num_columns; + comp.num_rows_out = num_rows; + comp.num_columns_out = num_columns; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnPiecewiselinearOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = orientation; + comp.orientation_out = orientation; + comp.op.pwl.func_id = function_id; + comp.op.pwl.num_segments = num_segments; + comp.output_scale_factor = output_scale_factor; + + if (!postInitMem) { + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + comp.op.pwl.ptr_segments = ptr_segments; + } else { + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + if (ptr_segments != nullptr) { + *reinterpret_cast(ptr_segments) = + reinterpret_cast(& comp.op.pwl.ptr_segments); + } + } +} + +void AmIntelDnn::InitRecurrentComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns_in, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_vector_delay, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_feedbacks, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases) { + component[component_index].num_rows_in = num_rows; + component[component_index].num_columns_in = num_columns_in; + component[component_index].num_rows_out = num_rows; + component[component_index].num_columns_out = num_columns_out; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnRecurrentOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = 
kDnnNonInterleavedOrientation; + component[component_index].orientation_out = kDnnNonInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].op.recurrent.num_vector_delay = num_vector_delay; + component[component_index].op.recurrent.num_bytes_per_weight = num_bytes_per_weight; + component[component_index].op.recurrent.num_bytes_per_bias = num_bytes_per_bias; + component[component_index].op.recurrent.weight_scale_factor = weight_scale_factor; + component[component_index].output_scale_factor = output_scale_factor; + component[component_index].op.recurrent.ptr_feedbacks = ptr_feedbacks; + component[component_index].op.recurrent.ptr_weights = ptr_weights; + component[component_index].op.recurrent.ptr_biases = ptr_biases; +} + +void AmIntelDnn::InitInterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns, + uint32_t num_bytes_per_input, uint32_t num_bytes_per_output, + float output_scale_factor, void *ptr_inputs, void *ptr_outputs) { + component[component_index].num_rows_in = num_rows; + component[component_index].num_columns_in = num_columns; + component[component_index].num_rows_out = num_columns; + component[component_index].num_columns_out = num_rows; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnInterleaveOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnNonInterleavedOrientation; + component[component_index].orientation_out = kDnnInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].output_scale_factor = output_scale_factor; +} + +void AmIntelDnn::InitDeinterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns, + uint32_t num_bytes_per_input, uint32_t num_bytes_per_output, + float output_scale_factor, void *ptr_inputs, void *ptr_outputs) { + component[component_index].num_rows_in = num_rows; + component[component_index].num_columns_in = num_columns; + component[component_index].num_rows_out = num_columns; + component[component_index].num_columns_out = num_rows; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnDeinterleaveOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnInterleavedOrientation; + component[component_index].orientation_out = kDnnNonInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].output_scale_factor = output_scale_factor; +} + +__inline void ApplyAffineTransform(intel_dnn_component_t *component, uint32_t *list, uint32_t listsize) { + auto transform = &component->op.affine; + int m = component->num_rows_out; + int n = component->num_columns_in; + int k = component->num_rows_in; + int lda = component->num_rows_in; + int ldb = component->num_columns_in; + int ldc = component->num_columns_out; + + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + if (component->op.affine.num_bytes_per_weight == 1) { + int8_t *A = reinterpret_cast(transform->ptr_weights); + 
int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + intel_compound_bias_t *bias = reinterpret_cast(transform->ptr_biases); + if (list == nullptr) { + // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor); + igemm8_gna(m, n, k, A, lda, B, ldb, bias, C, ldc); + } else { + // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor); + igemm8_gna_subset(m, n, k, A, lda, B, ldb, bias, C, ldc, list, listsize); + } + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else if (component->op.affine.num_bytes_per_weight == 2) { + int16_t *A = reinterpret_cast(transform->ptr_weights); + int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + int32_t *bias = reinterpret_cast(transform->ptr_biases); + if (list == nullptr) { + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i*ldc+j] = bias[i]; + } + } + // PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + cblas_igemm16(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc); + } else { + for (int l = 0; l < listsize; l++) { + int i = list[l]; + for (uint32_t j = 0; j < n; j++) { + C[l*ldc+j] = bias[i]; + } + } + // PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.scale_factor); + // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.scale_factor); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->op.affine.scale_factor * component->op.affine.scale_factor); + cblas_igemm16_subset(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc, list, listsize); + } + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else { + fprintf(stderr, "Bad weight width in ApplyAffineTransform!\n"); + throw -1; + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(transform->ptr_weights); + auto B = reinterpret_cast(component->ptr_inputs); + auto C = reinterpret_cast(component->ptr_outputs); + auto bias = reinterpret_cast(transform->ptr_biases); + if (list == nullptr) { + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i * ldc + j] = bias[i]; + } + } + // if (global_debug) PrintMatrixFloat32("A float", A, m, k, lda); + // if (global_debug) PrintMatrixFloat32("B float", B, k, n, ldb); + // if (global_debug) PrintMatrixFloat32("C float before", C, m, n, ldc); + cblas_sgemm1(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc); + // if (global_debug) PrintMatrixFloat32("C float after", C, m, n, ldc); + } else { + for (int l = 0; l < listsize; l++) { + int i = list[l]; + for (uint32_t j = 0; j < n; j++) { + C[l * ldc + j] = bias[i]; + } + } + // PrintMatrixFloat32("A float", A, k, m, lda); + // PrintMatrixFloat32("trans(B) float", B, k, n, ldb); + // PrintMatrixFloat32("C float before", C, 
listsize, n, ldc); + cblas_sgemm_subset(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + 1.0, + A, + lda, + B, + ldb, + 1.0, + C, + ldc, + list, + listsize); + // PrintMatrixFloat32("C float after", C, listsize, n, ldc); + } + } + break; + default:fprintf(stderr, "Bad data width in ApplyAffineTransform!\n"); + throw -1; + } +} + +__inline void ApplyDiagonalTransform(intel_dnn_component_t *component) { + auto transform = &component->op.affine; + int m = component->num_rows_out; + int n = component->num_columns_in; + int ldb = component->num_columns_in; + int ldc = component->num_columns_out; + + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + if (component->op.affine.num_bytes_per_weight == 1) { + int8_t *A = reinterpret_cast(transform->ptr_weights); + int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + intel_compound_bias_t *bias = reinterpret_cast(transform->ptr_biases); + // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor); + isbmm8_gna(m, n, A, lda, B, ldb, bias, C, ldc); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else if (component->op.affine.num_bytes_per_weight == 2) { + int16_t *A = reinterpret_cast(transform->ptr_weights); + int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + int32_t *bias = reinterpret_cast(transform->ptr_biases); + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i*ldc+j] = bias[i]; + } + } + // PrintMatrixInt16("A int16", A, 1, m, lda, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + cblas_isbmm16(m, n, A, lda, B, ldb, C, ldc); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else { + fprintf(stderr, "Bad weight width in ApplyDiagonalTransform!\n"); + throw -1; + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(transform->ptr_weights); + auto B = reinterpret_cast(component->ptr_inputs); + auto C = reinterpret_cast(component->ptr_outputs); + auto bias = reinterpret_cast(transform->ptr_biases); + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i * ldc + j] = bias[i]; + } + } + // PrintMatrixFloat32("A float", A, 1, m, lda); + // PrintMatrixFloat32("B float", B, k, n, ldb); + // PrintMatrixFloat32("C float before", C, m, n, ldc); + for (uint32_t j = 0; j < n; j++) { + float *Bcol = B + j * ldb; + float *Ccol = C + j * ldc; + cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1); + } + // PrintMatrixFloat32("C float after", C, m, n, ldc); + } + break; + default:fprintf(stderr, "Bad data width in ApplyDiagonalTransform!\n"); + throw -1; + } +} + +__inline void ApplyRecurrentTransform(intel_dnn_component_t *component, uint32_t row, void *ptr_feedbacks) { + intel_recurrent_t *transform = &component->op.recurrent; + int k1 = component->num_columns_in; + int k2 = component->num_columns_out; + int n = k2; + + if (component->op.recurrent.ptr_feedbacks == nullptr) { + fprintf(stderr, "nullptr feedback pointer in ApplyRecurrentTransform()!\n"); + throw -1; + } 
+ + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + if (component->op.recurrent.num_bytes_per_weight == 1) { + int16_t *A1 = reinterpret_cast(component->ptr_inputs) + row * component->num_columns_in; + int16_t *A2 = reinterpret_cast(ptr_feedbacks); + int8_t *X = reinterpret_cast(transform->ptr_weights); + intel_compound_bias_t *B = reinterpret_cast(transform->ptr_biases); + int32_t *C = reinterpret_cast(component->ptr_outputs) + row * component->num_columns_out; + // PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt16("A2 int", A2, 1, k2, k2); + // PrintMatrixInt8("X int", X, k, n, n, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt32("B int", B, 1, 2*n, 2*n, component->output_scale_factor); + igemv8_gna_split(n, k1, k2, A1, A2, X, B, C); + // PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor); + } else if (component->op.recurrent.num_bytes_per_weight == 2) { + int16_t *A1 = reinterpret_cast(component->ptr_inputs) + row * component->num_columns_in; + int16_t *A2 = reinterpret_cast(ptr_feedbacks); + int16_t *X = reinterpret_cast(transform->ptr_weights); + int32_t *B = reinterpret_cast(transform->ptr_biases); + int32_t *C = reinterpret_cast(component->ptr_outputs) + row * component->num_columns_out; + // PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt16("A2 int", A2, 1, k2, k2, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt16("X int", X, k, n, n, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt32("B int", B, 1, n, n, component->output_scale_factor); + igemv16_split(n, k1, k2, A1, A2, X, B, C); + // PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor); + } else { + fprintf(stderr, "Weight width not supported in ApplyRecurrentTransform!\n"); + throw -1; + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A1 = reinterpret_cast(component->ptr_inputs) + row * component->num_columns_in; + auto A2 = reinterpret_cast(ptr_feedbacks); + auto X = reinterpret_cast(transform->ptr_weights); + auto B = reinterpret_cast(transform->ptr_biases); + auto C = reinterpret_cast(component->ptr_outputs) + row * component->num_columns_out; + // PrintMatrixFloat32("A1 float", A1, 1, k1, k1); + // PrintMatrixFloat32("A2 float", A2, 1, k2, k2); + // PrintMatrixFloat32("X float", X, k, n, n); + // PrintMatrixFloat32("B float", B, 1, n, n); + sgemv_split(n, k1, k2, A1, A2, X, B, C); + // PrintMatrixFloat32("C float", C, 1, n, n); + } + break; + default:fprintf(stderr, "Bad data width in ApplyRecurrentTransform!\n"); + throw -1; + } +} + +__inline void ApplyConvolutional1DTransform(intel_dnn_component_t *component) { + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + CNNFilter16(component); + break; +#endif // #ifdef INTEGER_REF + case 4: + // PrintMatrixFloat32("Input float", reinterpret_cast(component->ptr_inputs), + // component->num_rows_in, component->num_columns_in, component->num_columns_in); + // PrintMatrixFloat32("Filt float", reinterpret_cast(component->op.conv1D.ptr_filters), + // component->op.conv1D.num_filters, + // component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps, + // component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps); + // PrintMatrixFloat32("Bias float", 
reinterpret_cast(component->op.conv1D.ptr_biases), 1, + // component->op.conv1D.num_filters, component->op.conv1D.num_filters); + CNNFilter32(component); + // PrintMatrixFloat32("Output float", reinterpret_cast(component->ptr_outputs, component->num_rows_out, + // component->num_columns_out, component->num_columns_out); + break; + default:fprintf(stderr, "Bad data width in ApplyConvolutionalTransform!\n"); + throw -1; + } +} + +__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component, + intel_dnn_number_type_t number_type, + uint32_t listsize) { + if (number_type == kDnnFloat) { + // PrintMatrixFloat32("PWL Input float", reinterpret_cast(component->ptr_inputs), component->num_rows_in, + // component->num_columns_in, component->num_columns_in); + PwlApply32(component, listsize); + // PrintMatrixFloat32("PWL Output float", reinterpret_cast(component->ptr_outputs), component->num_rows_out, + // component->num_columns_out, component->num_columns_out); +#ifdef INTEGER_REF + } else if (component->num_bytes_per_output == 2) { + PwlApply16(component, listsize); +#endif // #ifdef INTEGER_REF + } else { + fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n"); + throw -1; + } +} + +__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component, + intel_dnn_number_type_t number_type, + uint32_t listsize, + uint32_t num_row) { + if (number_type == kDnnFloat) { + PwlApply32(component, num_row, num_row, 0, listsize - 1); +#ifdef INTEGER_REF + } else if (component->num_bytes_per_output == 2) { + PwlApply16(component, num_row, num_row, 0, listsize-1); +#endif // #ifdef INTEGER_REF + } else { + fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n"); + throw -1; + } +} + +__inline void ApplyMaxPoolTransform(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) { + if (component->num_bytes_per_input == 4) { + // PrintMatrixFloat32("Input float", reinterpret_cast(component->ptr_inputs), component->num_rows_in, + // component->num_columns_in, component->num_columns_in); + CNNMaxPool(component, number_type); + // PrintMatrixFloat32("Output float", reinterpret_cast(component->ptr_outputs), component->num_rows_out, + // component->num_columns_out, component->num_columns_out); + } else { + fprintf(stderr, "Bad data width in ApplyMaxPoolTransform!\n"); + throw -1; + } +} + +__inline void ApplyTranspose(intel_dnn_component_t *component) { + int m = component->num_rows_in; + int n = component->num_columns_in; + int lda = component->num_columns_in; + int ldb = component->num_columns_out; + // B = Transpose(A) where A is mxn and B is nxm + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 1: + { + int8_t *A = reinterpret_cast(component->ptr_inputs); + int8_t *B = reinterpret_cast(component->ptr_outputs); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[col*ldb+row] = A[row*lda+col]; + } + } + } + break; + case 2: + { + int16_t *A = reinterpret_cast(component->ptr_inputs); + int16_t *B = reinterpret_cast(component->ptr_outputs); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[col*ldb+row] = A[row*lda+col]; + } + } + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(component->ptr_inputs); + auto B = reinterpret_cast(component->ptr_outputs); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[col * ldb + row] = A[row * lda + col]; + } + } + } + break; + 
default:fprintf(stderr, "Bad data width in ApplyInterleave!\n"); + throw -1; + } +} + +__inline void ApplyCopy(intel_dnn_component_t *component) { + auto src = reinterpret_cast(component->ptr_inputs); + auto dst = reinterpret_cast(component->ptr_outputs); + int32_t m = component->op.copy.num_copy_rows; + int32_t n = component->op.copy.num_copy_columns; + int32_t lda = component->num_columns_in; + int32_t ldb = component->num_columns_out; + if (m > component->num_rows_in) { + fprintf(stderr, "Error: attempt to copy more columns than matrix has!\n"); + throw -1; + } else { + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + { + int16_t *A = reinterpret_cast(src); + int16_t *B = reinterpret_cast(dst); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[row*ldb + col] = A[row*lda + col]; + } + } + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(src); + auto B = reinterpret_cast(dst); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[row * ldb + col] = A[row * lda + col]; + } + } + } + break; + default:fprintf(stderr, "Bad data width in ApplyCopy!\n"); + throw -1; + } + } +} + +uint32_t AmIntelDnn::CopyActiveList(std::vector > &active_list, uint32_t list_index) { + if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) { + num_active_outputs_ = component[component.size() - 1].num_rows_out; + } else { + num_active_outputs_ = component[component.size() - 1].num_columns_out; + } + + if (!active_list.empty()) { + if (list_index >= active_list.size()) { + fprintf(stderr, "Index %d beyond end of active list in CopyActiveList()\n", list_index); + throw -1; + } + if (active_list[list_index].size() > component[component.size() - 1].num_rows_out) { + fprintf(stderr, "Active list too large in CopyActiveList()\n"); + throw -1; + } + + if (ptr_active_outputs_ != nullptr) { + num_active_outputs_ = active_list[list_index].size(); + memcpy(ptr_active_outputs_, active_list[list_index].data(), num_active_outputs_ * sizeof(uint32_t)); + } + } + + return (num_active_outputs_); +} + +void AmIntelDnn::Propagate() { + for (uint32_t i = 0; i < component.size(); i++) { + intel_dnn_component_t *comp = &component[i]; + uint32_t *ptr_active_outputs = nullptr; + uint32_t num_active_outputs = (comp->orientation_out == kDnnInterleavedOrientation) + ? 
comp->num_rows_out : comp->num_columns_out; + + if (i == component.size() - 1) { // active list applies to last component + ptr_active_outputs = ptr_active_outputs_; + num_active_outputs = num_active_outputs_; + } else if (i == component.size() - 2) { // also applies to last two components when last is PWL + if ((component[i].operation == kDnnAffineOp) && (component[i + 1].operation == kDnnPiecewiselinearOp)) { + ptr_active_outputs = ptr_active_outputs_; + num_active_outputs = num_active_outputs_; + } + } + + switch (comp->operation) { + case kDnnAffineOp :ApplyAffineTransform(comp, ptr_active_outputs, num_active_outputs); + break; + case kDnnDiagonalOp:ApplyDiagonalTransform(comp); + break; + case kDnnRecurrentOp: + if ((i < component.size() - 1) && (component[i + 1].operation == kDnnPiecewiselinearOp)) { + intel_dnn_component_t *comp_pwl = &component[i + 1]; + for (uint32_t j = 0; j < comp->num_rows_in; j++) { + void *ptr_feedbacks = + reinterpret_cast(reinterpret_cast(comp->op.recurrent.ptr_feedbacks) + j * comp_pwl->num_columns_out); + ApplyRecurrentTransform(comp, j, ptr_feedbacks); + // PrintOutputs(i); + ApplyPiecewiseLinearTransform(comp_pwl, number_type_, num_active_outputs, j); + } + i++; // skip next component + } else { + fprintf(stderr, "Missing PiecewiseLinear component after Recurrent component in Propagate!\n"); + throw -1; + } + break; + case kDnnConvolutional1dOp:ApplyConvolutional1DTransform(comp); + break; + case kDnnPiecewiselinearOp:ApplyPiecewiseLinearTransform(comp, number_type_, num_active_outputs); + break; + case kDnnMaxPoolOp:ApplyMaxPoolTransform(comp, number_type_); + break; + case kDnnInterleaveOp:ApplyTranspose(comp); + break; + case kDnnDeinterleaveOp:ApplyTranspose(comp); + break; + case kDnnCopyOp:ApplyCopy(comp); + break; + default:fprintf(stderr, "Bad operation in Propagate!\n"); + throw -1; + break; + } + // PrintOutputs(i); fflush(stdout); + } +} + +intel_dnn_macro_operation_t AmIntelDnn::MacroOperation(uint32_t component_index) { + return (component[component_index].macro_operation); +} + +void AmIntelDnn::SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation) { + component[component_index].macro_operation = macro_operation; +} + +float AmIntelDnn::InputScaleFactor(uint32_t component_index) { + float scale_factor = 1.0; + + if (component_index == 0) { + scale_factor = input_scale_factor_; + } else { + if (component[component_index - 1].operation == kDnnAffineOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnDiagonalOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnConvolutional1dOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnRecurrentOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnInterleaveOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnDeinterleaveOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnCopyOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } + } + + return (scale_factor); +} + +float AmIntelDnn::WeightScaleFactor(uint32_t component_index) { + float scale_factor = 
1.0; + + if (component[component_index].operation == kDnnAffineOp) { + scale_factor = component[component_index].op.affine.weight_scale_factor; + } else if (component[component_index].operation == kDnnDiagonalOp) { + scale_factor = component[component_index].op.affine.weight_scale_factor; + } else if (component[component_index].operation == kDnnConvolutional1dOp) { + scale_factor = component[component_index].op.conv1D.weight_scale_factor; + } else if (component[component_index].operation == kDnnRecurrentOp) { + scale_factor = component[component_index].op.recurrent.weight_scale_factor; + } + + return (scale_factor); +} + +float AmIntelDnn::OutputScaleFactor(intel_dnn_component_t &comp) { + return comp.output_scale_factor; +} + +void AmIntelDnn::SetOutputScaleFactor(uint32_t component_index, float scale_factor) { + component[component_index].output_scale_factor = scale_factor; +} + +void AmIntelDnn::PrintOutputs(uint32_t component_index) { + float scale_factor = OutputScaleFactor(component_index); + uint32_t num_rows = component[component_index].num_rows_out; + uint32_t num_columns = component[component_index].num_columns_out; + + printf("component %d : %s\n", component_index, intel_dnn_operation_name[component[component_index].operation]); + if (number_type_ == kDnnFloat) { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, ptr_output[i * num_columns + j] / scale_factor); + } + } + } else { + switch (component[component_index].num_bytes_per_output) { + case 1: { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, static_cast(ptr_output[i * num_columns + j]) / scale_factor); + } + } + } + break; + case 2: { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, static_cast(ptr_output[i * num_columns + j]) / scale_factor); + } + } + } + break; + case 4: { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, static_cast(ptr_output[i * num_columns + j]) / scale_factor); + } + } + } + break; + default: + fprintf(stderr, + "Bad num_bytes_per_output in component %d in AmIntelDnn::PrintOutputs()\n", + component_index); + throw -1; + } + } +} + +uint32_t AmIntelDnn::CompareScores(void *ptr_refscorearray, intel_score_error_t *score_error, uint32_t num_frames) { + intel_dnn_component_t *ptr_component = &component[component.size() - 1]; + intel_dnn_orientation_t orientation = ptr_component->orientation_out; + float scale_factor = OutputScaleFactor(component.size() - 1); + uint32_t num_errors = 0; + uint32_t num_rows = (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : num_frames; + uint32_t num_columns = (orientation == kDnnInterleavedOrientation) ? num_frames : ptr_component->num_columns_out; + uint32_t num_row_step_ref = + (orientation == kDnnInterleavedOrientation) ? 
ptr_component->num_rows_out : ptr_component->num_columns_out; + uint32_t num_row_step = ptr_component->num_columns_out; + + if (ptr_component->operation == kDnnAffineOp) { + num_rows = num_active_outputs_; + } + + ClearScoreError(score_error); + + if (number_type_ == kDnnFloat) { + auto A = reinterpret_cast(ptr_component->ptr_outputs); + auto B = reinterpret_cast(ptr_refscorearray); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + float score = A[i * num_row_step + j]; + float refscore = + (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref + + j]; + float scaled_score = score / scale_factor; + float error = fabs(refscore - scaled_score); + float rel_error = error / (fabs(refscore) + 1e-20); + float squared_error = error * error; + float squared_rel_error = rel_error * rel_error; + score_error->num_scores++; + score_error->sum_error += error; + score_error->sum_squared_error += squared_error; + if (error > score_error->max_error) { + score_error->max_error = error; + } + score_error->sum_rel_error += rel_error; + score_error->sum_squared_rel_error += squared_rel_error; + if (rel_error > score_error->max_rel_error) { + score_error->max_rel_error = rel_error; + } + if (error > score_error->threshold) { + num_errors++; + } + } + } + } else if (number_type_ == kDnnInt) { + auto B = reinterpret_cast(ptr_refscorearray); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + float score; + if (ptr_component->num_bytes_per_output == 4) { + auto A = reinterpret_cast(ptr_component->ptr_outputs); + score = static_cast(A[i * num_row_step + j]); + } else if (ptr_component->num_bytes_per_output == 2) { + auto A = reinterpret_cast(ptr_component->ptr_outputs); + score = static_cast(A[i * num_row_step + j]); + } else { + fprintf(stderr, + "Unsupported output width (%d) in AmIntelDnn::CompareScores()!\n", + ptr_component->num_bytes_per_output); + throw -1; + } + float refscore = + (orientation == kDnnInterleavedOrientation) ? 
B[j * num_row_step_ref + i] : B[i * num_row_step_ref + + j]; + float scaled_score = score / scale_factor; + float error = fabs(refscore - scaled_score); + float rel_error = error / (fabs(refscore) + 1e-20); + float squared_error = error * error; + float squared_rel_error = rel_error * rel_error; + score_error->num_scores++; + score_error->sum_error += error; + score_error->sum_squared_error += squared_error; + if (error > score_error->max_error) { + score_error->max_error = error; + } + score_error->sum_rel_error += rel_error; + score_error->sum_squared_rel_error += squared_rel_error; + if (rel_error > score_error->max_rel_error) { + score_error->max_rel_error = rel_error; + } + if (error > score_error->threshold) { + num_errors++; + } + } + } + } else { + fprintf(stderr, "Unknown number type in AmIntelDnn::CompareScores()!\n"); + throw -1; + } + + score_error->num_errors = num_errors; + + return (num_errors); +} + +void AmIntelDnn::WriteGraphWizModel(const char *filename) { + auto & components = component; + +#define IS_AFFINE(k)\ + (components[k].operation == kDnnAffineOp ||\ + components[k].operation == kDnnDiagonalOp) + +#define IS_CONV(k)\ + (components[k].operation == kDnnConvolutional1dOp) + +#define IS_RELU(k)\ + (components[k].operation == kDnnPiecewiselinearOp &&\ + components[k].op.pwl.func_id == kActRelu) + + +#define IS_DIAG(k)\ + (components[k].operation == kDnnDiagonalOp) + +#define OUTPUTS(idx)\ + components[idx].ptr_outputs, components[idx].num_rows_out*components[idx].num_columns_out * components[idx].num_bytes_per_output + +#define INPUTS(idx)\ + components[idx].ptr_inputs, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].num_bytes_per_input + +#define BIASES(idx)\ + components[idx].op.affine.ptr_biases, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].op.affine.num_bytes_per_bias + +#define WEIGHTS(idx)\ + components[idx].op.affine.ptr_weights, components[idx].op.affine.num_bytes_per_weight * components[idx].num_rows_in*components[idx].num_columns_in * \ + (IS_DIAG(idx) ? 
1 : components[idx].num_rows_out*components[idx].num_columns_out) + + auto intersected = [](void * ptra, size_t asize, void * ptrb, size_t bsize) { + return !(((reinterpret_cast(ptra) + asize) <= ptrb) || ((reinterpret_cast(ptrb) + bsize) <= ptra)); + }; + + auto equals = [](void * ptra, size_t asize, void * ptrb, size_t bsize) { + // return !((((char*)ptra + asize) < ptrb) || (((char*)ptrb + bsize) < ptra)); + return ptra >= ptrb && ptra < reinterpret_cast(ptrb) + bsize; + }; + + std::fstream graph("graph.dot", std::ios::out); + graph << "strict digraph {"; + std::set weights; + std::set biases; + std::set outputs; + std::set layersNames; + + auto generate_layer_name = [&](int k) { + std::string l; + if (components[k].operation == kDnnPiecewiselinearOp) { + l += intel_dnn_activation_name[components[k].op.pwl.func_id]; + } else { + l += intel_dnn_operation_name[components[k].operation]; + } + l += "_" + std::to_string(k); + if (components[k].operation == kDnnPiecewiselinearOp) { + graph << l << " [shape=box, style=filled, fillcolor=yellow"; + } else { + graph << l << " [shape=box"; + } + + graph << ", label=<\n" + " \n" + " \n"; + if (IS_AFFINE(k)) { + graph << " \n"; + graph << " \n"; + graph << " \n"; + } + if (IS_RELU(k)) { + graph << " \n"; + } + if (IS_CONV(k)) { + auto &conv = components[k].op.conv1D; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + } + graph<< " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
" << l << "
" << components[k].num_rows_in << "x" << components[k].num_rows_out<< "
wscale" << components[k].op.affine.weight_scale_factor<< "
wbit" << components[k].op.affine.num_bytes_per_weight<< "
bbit" << components[k].op.affine.num_bytes_per_bias<< "
negative_slope" << components[k].op.pwl.func_id.negative_slope<< "
num_filters" << conv.num_filters<< "
num_filter_rows" << conv.num_filter_rows<< "
num_filter_coefficients" << conv.num_filter_coefficients<< "
num_feature_maps" << conv.num_feature_maps<< "
num_feature_map_rows" << conv.num_feature_map_rows<< "
num_feature_map_columns" << conv.num_feature_map_columns<< "
wscale" << conv.weight_scale_factor<< "
wbit" << conv.num_bytes_per_weight<< "
bbit" << conv.num_bytes_per_bias<< "
num_rows_in" << components[k].num_rows_in<< "
num_columns_in" << components[k].num_columns_in<< "
num_rows_out" << components[k].num_rows_out<< "
num_columns_out" << components[k].num_columns_out<< "
oscale" << components[k].output_scale_factor<< "
ibit" << components[k].num_bytes_per_input<< "
obit" << components[k].num_bytes_per_output<< "
>];\n"; + + return l; + }; + + + for (int k = 0; k < components.size(); ++k) { + std::string l = generate_layer_name(k); + layersNames.insert(l); + int lidx = std::distance(layersNames.begin(), layersNames.find(l)); + int widx = 0; + int bidx = 0; + + if (IS_AFFINE(k)) { + weights.insert(components[k].op.affine.ptr_weights); + biases.insert(components[k].op.affine.ptr_biases); + + widx = std::distance(weights.begin(), weights.find(components[k].op.affine.ptr_weights)); + bidx = std::distance(biases.begin(), biases.find(components[k].op.affine.ptr_biases)); + } + + + auto lw = "weights_" + std::to_string(lidx) + "_" + std::to_string(widx);; + auto lb = "biases_" + std::to_string(lidx) + "_" + std::to_string(bidx); + + if (IS_AFFINE(k)) { + graph << lw << " -> " << l << "[style=bold];"; + graph << lb << " -> " << l << "[style=bold];"; + } + + graph << "\n"; + + bool inputConnected = false; + + for (int k2 = 0; k2 < components.size(); ++k2) { + if (k2 == k) continue; + + + std::string r = generate_layer_name(k2); + + int w2idx = 0; + int b2idx = 0; + + if (IS_AFFINE(k2)) { + weights.insert(components[k2].op.affine.ptr_weights); + biases.insert(components[k2].op.affine.ptr_biases); + + w2idx = std::distance(weights.begin(), weights.find(components[k2].op.affine.ptr_weights)); + b2idx = std::distance(biases.begin(), biases.find(components[k2].op.affine.ptr_biases)); + } + + auto rw = "weights_" + std::to_string(w2idx); + auto rb = "biases_" + std::to_string(b2idx); + + // ---------------------------------------------------------- + // output to input connections + if (intersected(OUTPUTS(k2), INPUTS(k))) { + graph << r <<" -> "<< l << ";"; + inputConnected = true; + } + + // ---------------------------------------------------------- + // output to biases connections + if (IS_AFFINE(k) && intersected(OUTPUTS(k2), BIASES(k))) { + graph << r << " -> " << lb << " [label=\"OB\", fontcolor=blue, color=blue, style=dashed];"; + } + + // ---------------------------------------------------------- + // output to weights connections + if (IS_AFFINE(k) && equals(OUTPUTS(k2), WEIGHTS(k))) { + graph << r << " -> " << lw << " [label=\"OW\", fontcolor=magenta, color=magenta, style=dashed];"; + } + + // ---------------------------------------------------------- + // weights to input connections + if (IS_AFFINE(k2) && equals(WEIGHTS(k2), INPUTS(k))) { + graph << rw << " -> " << l << " [label=\"WI\", fontcolor=red, color=red, style=dashed];"; + inputConnected = true; + } + + // ---------------------------------------------------------- + // weights to bias connections + if (IS_AFFINE(k2) && IS_AFFINE(k) && equals(WEIGHTS(k2), BIASES(k))) { + graph << rw << " -> " << lb << " [label=\"WB\", fontcolor=darkgreen,color=darkgreen, style=dashed];"; + } + } + if (!inputConnected) { + // drawing tmp connection + outputs.insert(components[k].ptr_inputs); + auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs)); + graph << tidx << " -> " << l + << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];"; + } + } + + for (int k = 0; k < components.size(); ++k) { + std::string l = generate_layer_name(k); + + int tidx = 0; + for (auto tmpOutPtrs : outputs) { + if (components[k].ptr_outputs == tmpOutPtrs) { + graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];"; + } + tidx++; + } + } + + graph << "}"; +} + +void AmIntelDnn::WriteDnnText(const char *filename, intel_dnn_number_type_t number_type) { + if ((number_type_ == kDnnFloat) && 
(number_type == kDnnInt)) { + fprintf(stderr, "Error trying to write floating point DNN as integer in AmIntelDnn::WriteDnnText().\n"); + fprintf(stderr, " Please convert to integer first.\n"); + throw -1; + } +#ifndef LIGHT_DUMP + std::ofstream out_file1(filename, std::ios::out); + std::ofstream &out_file = out_file1; +#else + std::ofstream out_file((std::string(filename) + ".light").c_str(), std::ios::out); +#endif + if (out_file.good()) { + uint32_t num_inputs = component[0].num_rows_in; + uint32_t num_outputs = + (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) ? component[component.size() + - 1].num_rows_out : component[component.size() - 1].num_columns_out; + uint32_t num_layers = num_gna_layers(); + uint32_t num_group = this->num_group_in(); + uint32_t layer = 0; + + out_file << "\n"; + out_file << " " << intel_dnn_number_type_name[number_type] << "\n"; + out_file << " " << intel_dnn_softmax_name[softmax_type] << "\n"; + out_file << " " << std::dec << num_bytes_dnn_memory_ << "\n"; + out_file << " " << std::dec << num_group << "\n"; + out_file << " " << std::dec << num_inputs << "\n"; + out_file << " " << std::dec << num_outputs << "\n"; + out_file << " " << std::dec << num_layers << "\n"; + for (uint32_t i = 0; i < component.size(); i++) { +#ifdef LIGHT_DUMP + std::stringstream out_file_name; + out_file_name << getDumpFolderName() << std::setfill('0') << std::setw(2) << i << "_" + << intel_dnn_operation_name[component[i].operation] + << "-" << component[i].num_rows_in + << "-" << component[i].num_rows_out; + if (component[i].operation == kDnnPiecewiselinearOp) { + out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id.type]; + } + std::ofstream out_file((out_file_name.str() + ".txt").c_str(), std::ios::out); +#endif + + uint32_t num_rows_in = component[i].num_rows_in; + uint32_t num_columns_in = component[i].num_columns_in; + uint32_t num_rows_out = component[i].num_rows_out; + uint32_t num_columns_out = component[i].num_columns_out; + uint32_t num_bytes_per_input = component[i].num_bytes_per_input; + uint32_t num_bytes_per_output = component[i].num_bytes_per_output; + if ((component[i].operation == kDnnAffineOp) + || (component[i].operation == kDnnDiagonalOp) + || (component[i].operation == kDnnRecurrentOp) + || (component[i].operation == kDnnConvolutional1dOp) + || (component[i].operation == kDnnInterleaveOp) + || (component[i].operation == kDnnDeinterleaveOp) + || (component[i].operation == kDnnCopyOp)) { + out_file << " " << std::dec << layer << "\n"; + layer++; + } + out_file << " " << intel_dnn_operation_name[component[i].operation] << "\n"; + out_file << " " << intel_dnn_macro_operation_name[component[i].macro_operation] << "\n"; + out_file << " " << std::dec << num_rows_in << "\n"; + out_file << " " << std::dec << num_columns_in << "\n"; + out_file << " " << std::dec << num_rows_out << "\n"; + out_file << " " << std::dec << num_columns_out << "\n"; + out_file << " " << std::dec << (component[i].orientation_in == kDnnInterleavedOrientation ? + "interleaved" : "deinterleaved") << "\n"; + out_file << " " << std::dec << (component[i].orientation_out == kDnnInterleavedOrientation ? 
+ "interleaved" : "deinterleaved") << "\n"; + + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << sizeof(float) << "\n"; + out_file << " " << std::dec << sizeof(float) << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_input << "\n"; + out_file << " " << std::dec << num_bytes_per_output << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].ptr_inputs, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].ptr_outputs, ptr_dnn_memory_) << "\n"; + switch (component[i].operation) { + case kDnnAffineOp: + case kDnnDiagonalOp: { + uint32_t num_bytes_per_weight = component[i].op.affine.num_bytes_per_weight; + uint32_t num_bytes_per_bias = component[i].op.affine.num_bytes_per_bias; + float weight_scale_factor = component[i].op.affine.weight_scale_factor; + float output_scale_factor = component[i].output_scale_factor; + uint32_t num_weight_rows = (component[i].operation == kDnnDiagonalOp) ? 1 : num_rows_out; + uint32_t num_weight_columns = num_rows_in; + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << 4 << "\n"; + out_file << " " << std::dec << 4 << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_weight << "\n"; + out_file << " " << std::dec << num_bytes_per_bias << "\n"; + } + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << weight_scale_factor << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.affine.ptr_weights, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.affine.ptr_biases, ptr_dnn_memory_) << "\n"; + + std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out); + std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out); + + if (num_bytes_per_weight == 1) { + int8_t *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); + intel_compound_bias_t *ptr_bias = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier + / weight_scale_factor; + out_wfile << std::setprecision(4) << val << " "; + } else { + out_wfile << int((int8_t) ptr_weight[row * num_weight_columns + col]) << " "; + } + out_wfile << "\n"; + } + } +#endif + } else if (num_bytes_per_weight == 2) { + int16_t *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + out_wfile << std::setprecision(12) + << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " "; + } else { + out_wfile << ptr_weight[row * num_weight_columns + col] << " "; + } + out_wfile << "\n"; 
+ } + } +#endif + } else if (number_type_ == kDnnFloat) { + float *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + out_wfile << std::setprecision(5) + << ptr_weight[row * num_weight_columns + col] << " "; + out_wfile << "\n"; + } + } +#endif + } else { + fprintf(stderr, "Unsupported weight type in WriteDnnText!\n"); + throw -1; + } + if (number_type_ == kDnnInt) { + if (num_bytes_per_weight == 1) { + intel_compound_bias_t + *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_rows_out; row++) { + out_bfile << std::setw(8) << ptr_biases[row].bias << ", "; + out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n"; + } +#endif + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_rows_out; row++) { + if (number_type == kDnnInt) { + out_bfile << std::setw(8) << ptr_biases[row] << "\n"; + } else { + out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n"; + } + } +#endif + } + + } else { + float *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + + for (uint32_t row = 0; row < num_rows_out; row++) { + out_bfile << std::setprecision(5) << ptr_biases[row] << "\n"; + } +#endif + } + } + break; + case kDnnConvolutional1dOp: { + uint32_t num_filters = component[i].op.conv1D.num_filters; + uint32_t num_filter_rows = component[i].op.conv1D.num_filter_rows; + uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients; + uint32_t num_feature_maps = component[i].op.conv1D.num_feature_maps; + uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows; + uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns; + uint32_t num_filter_outputs = + component[i].op.conv1D.num_feature_map_rows - component[i].op.conv1D.num_filter_rows + 1; + uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight; + uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias; + float weight_scale_factor = component[i].op.conv1D.weight_scale_factor; + float output_scale_factor = component[i].output_scale_factor; + out_file << " " << std::dec << num_filters << "\n"; + out_file << " " << std::dec << num_filter_coefficients << "\n"; + out_file << " " << std::dec << num_filter_rows << "\n"; + out_file << " " << std::dec << num_feature_maps << "\n"; + out_file << " " << std::dec << num_feature_map_rows << "\n"; + out_file << " " << std::dec << num_feature_map_columns << "\n"; + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << 4 << "\n"; + out_file << " " << std::dec << 4 << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_weight << "\n"; + out_file << " " << std::dec << num_bytes_per_bias << "\n"; + } + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << weight_scale_factor << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << 
MemoryOffset(component[i].op.conv1D.ptr_filters, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.conv1D.ptr_biases, ptr_dnn_memory_) << "\n"; + + + std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out); + std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out); + + + if (num_bytes_per_weight == 1) { + int8_t *ptr_weight = reinterpret_cast(component[i].op.conv1D.ptr_filters); + intel_compound_bias_t *ptr_bias = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + for (uint32_t col = 0; col < num_filter_coefficients; col++) { + if (number_type == kDnnFloat) { + float val = static_cast(ptr_weight[row * num_filter_coefficients + col]) + * ptr_bias[row].multiplier / weight_scale_factor; + out_wfile << std::setprecision(12) <(component[i].op.conv1D.ptr_filters); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + for (uint32_t col = 0; col < num_filter_coefficients; col++) { + if (number_type == kDnnFloat) { + out_wfile << std::setprecision(12) + << ptr_weight[row * num_filter_coefficients + col] / weight_scale_factor + << "\n"; + } else { + out_wfile << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_weight[row * num_filter_coefficients + col] << "\n"; + } + } + } +#endif + } else if (number_type_ == kDnnFloat) { + float *ptr_weight = reinterpret_cast(component[i].op.conv1D.ptr_filters); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + for (uint32_t col = 0; col < num_filter_coefficients; col++) { + out_wfile << std::setprecision(12) + << ptr_weight[row * num_filter_coefficients + col] << "\n"; + } + out_wfile << "\n"; + } +#endif + } else { + fprintf(stderr, "Unsupported filter weight type in WriteDnnText!\n"); + throw -1; + } + + if (number_type_ == kDnnInt) { + if (number_type == kDnnInt) { + if (num_bytes_per_weight == 1) { + intel_compound_bias_t + *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex + << ptr_biases[row].bias << " "; + out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex + << int(ptr_biases[row].multiplier) << "\n"; + } +#endif + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[row] + << "\n"; + } +#endif + } + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << std::setprecision(12) + << ptr_biases[row] / output_scale_factor << "\n"; + } +#endif + } + } else { + float *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << std::setprecision(12) << ptr_biases[row] << "\n"; + } +#endif + } + out_file << "\n"; + } + break; + case kDnnRecurrentOp: { + float weight_scale_factor = component[i].op.recurrent.weight_scale_factor; + float output_scale_factor = component[i].output_scale_factor; + uint32_t num_vector_delay = component[i].op.recurrent.num_vector_delay; + uint32_t num_bytes_per_weight = component[i].op.recurrent.num_bytes_per_weight; + 
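
// [editorial sketch] The convolutional case above derives the number of filter
// outputs as num_feature_map_rows - num_filter_rows + 1, i.e. the output length
// of a "valid" 1D convolution with unit stride. Worked numbers below are assumed,
// for illustration only.
#include <cstdint>
#include <cassert>

int main() {
    uint32_t num_feature_map_rows = 48;  // assumed input rows
    uint32_t num_filter_rows = 9;        // assumed filter height
    uint32_t num_filter_outputs = num_feature_map_rows - num_filter_rows + 1;
    assert(num_filter_outputs == 40);    // 48 - 9 + 1
    return 0;
}
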
uint32_t num_bytes_per_bias = component[i].op.recurrent.num_bytes_per_bias; + uint32_t num_weight_rows = num_columns_out; + uint32_t num_weight_columns = num_columns_in + num_columns_out; + out_file << " " << std::dec << num_vector_delay << "\n"; + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << 4 << "\n"; + out_file << " " << std::dec << 4 << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_weight << "\n"; + out_file << " " << std::dec << num_bytes_per_bias << "\n"; + } + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << weight_scale_factor << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.recurrent.ptr_weights, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.recurrent.ptr_biases, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.recurrent.ptr_feedbacks, ptr_dnn_memory_) << "\n"; + if (num_bytes_per_weight == 1) { + int8_t *ptr_weight = reinterpret_cast(component[i].op.recurrent.ptr_weights); + intel_compound_bias_t + *ptr_bias = reinterpret_cast(component[i].op.recurrent.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + out_file << " "; + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[col].multiplier + / weight_scale_factor; + out_file << std::setprecision(12) << std::scientific << val << " "; + } else { + out_file << "0x" << std::setfill('0') << std::setw(2) << std::hex + << int((uint8_t) ptr_weight[row * num_weight_columns + col]) << " "; + } + } + out_file << "\n"; + } +#endif + } else if (num_bytes_per_weight == 2) { + int16_t *ptr_weight = reinterpret_cast(component[i].op.recurrent.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + out_file << " "; + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + out_file << std::setprecision(12) << std::scientific + << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " "; + } else { + out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_weight[row * num_weight_columns + col] << " "; + } + } + out_file << "\n"; + } +#endif + } else if (number_type_ == kDnnFloat) { + float *ptr_weight = reinterpret_cast(component[i].op.recurrent.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + out_file << " "; + for (uint32_t col = 0; col < num_weight_columns; col++) { + out_file << std::setprecision(12) << std::scientific + << ptr_weight[row * num_weight_columns + col] << " "; + } + out_file << "\n"; + } +#endif + } else { + fprintf(stderr, "Unsupported weight type in WriteDnnText!\n"); + throw -1; + } + if (number_type_ == kDnnInt) { + if (number_type == kDnnInt) { + if (num_bytes_per_weight == 1) { + intel_compound_bias_t + *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + 
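
// [editorial sketch] For the recurrent case above, the dumped weight matrix has
// num_columns_out rows and num_columns_in + num_columns_out columns: one row per
// output element over the concatenation of the current input and the feedback
// (previous output). Sizes below are assumed, for illustration only.
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t num_columns_in = 128;   // assumed input width
    uint32_t num_columns_out = 64;   // assumed output / feedback width
    uint32_t num_weight_rows = num_columns_out;
    uint32_t num_weight_columns = num_columns_in + num_columns_out;
    std::printf("recurrent weights: %u x %u = %u values\n",
                num_weight_rows, num_weight_columns,
                num_weight_rows * num_weight_columns);
    return 0;
}
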
out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex + << ptr_biases[col].bias << " "; + out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex + << ptr_biases[col].multiplier << " "; + } +#endif + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[col] + << " "; + } +#endif + } + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << std::setprecision(12) << std::scientific + << ptr_biases[col] / output_scale_factor << " "; + } +#endif + } + } else { + float *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << std::setprecision(12) << std::scientific << ptr_biases[col] << " "; + } +#endif + } + out_file << "\n"; + } + break; + case kDnnMaxPoolOp: { + uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1; + out_file << " " << std::dec << num_pool_type << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.num_inputs << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.num_inputs_step << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + } + break; + case kDnnPiecewiselinearOp: { + intel_pwl_segment_t *ptr_segment = component[i].op.pwl.ptr_segments; + DnnActivationType func_id = component[i].op.pwl.func_id.type; + uint32_t num_segments = component[i].op.pwl.num_segments; + float output_scale_factor = component[i].output_scale_factor; + out_file << " " << intel_dnn_activation_name[func_id] << "\n"; + out_file << " " << std::dec << sizeof(int16_t) << "\n"; + out_file << " " << std::dec << sizeof(int16_t) << "\n"; + out_file << " " << std::dec << sizeof(int32_t) << "\n"; + if (number_type == kDnnFloat) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << " " << std::dec << 0 << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + out_file << " " << std::dec << num_segments << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n"; + if (number_type_ == kDnnInt) { + out_file << " "; + for (int segment = 0; segment < num_segments; segment++) { + out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_segment[segment].slope << " "; + } + out_file << "\n"; + out_file << " "; + for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) { + out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_segment[segment].yBase << " "; + } + out_file << "\n"; + out_file << " "; + for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) { + out_file << "0x" << std::setfill('0') << 
std::setw(8) << std::hex + << ptr_segment[segment].xBase << " "; + } + out_file << "\n"; + } else if (num_segments > 0) { + fprintf(stderr, + "Number of segments must be zero in floating point model in WriteDnnText!\n"); + throw -1; + } + } + } + break; + case kDnnInterleaveOp: + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + break; + case kDnnDeinterleaveOp: + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + break; + case kDnnCopyOp: + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + out_file << " " << std::dec << component[i].op.copy.num_copy_rows << "\n"; + out_file << " " << std::dec << component[i].op.copy.num_copy_columns << "\n"; + break; + default: + out_file << " Unsupported Component : " + << intel_dnn_operation_name[component[i].operation] << "\n"; + // fprintf(stderr, "Component type %s not yet supported in AmIntelDnn::WriteDnnText()!\n", + // intel_dnn_operation_name[component[i].operation]); + // throw -1; + break; + } + } + if (ptr_active_outputs() != nullptr) { + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(ptr_active_outputs(), ptr_dnn_memory_) << "\n"; + } + out_file << "\n"; + out_file.close(); + } else { + fprintf(stderr, "Failed to open %s for writing!\n", filename); + throw -1; + } +} + +void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) { + intel_nnet_layer_t *pLayer; + + if (ptr_nnet == nullptr) + THROW_GNA_EXCEPTION << "Invalid input parameter"; + if (component.empty()) + THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::FillGNAStruct()"; + + ptr_nnet->nLayers = 0; + for (auto && c : component) { + if (c.operation == kDnnAffineOp + || (c.operation == kDnnDiagonalOp) + || (c.operation == kDnnConvolutional1dOp) + || (c.operation == kDnnDeinterleaveOp) + || (c.operation == kDnnInterleaveOp) + || (c.operation == kDnnRecurrentOp) + || (c.operation == kDnnCopyOp) + ) { + ptr_nnet->nLayers++; + } + } + ptr_nnet->nGroup = num_group_in(); + ptr_nnet->pLayers = reinterpret_cast(_mm_malloc(ptr_nnet->nLayers * sizeof(intel_nnet_layer_t), 64)); + if (ptr_nnet->pLayers == nullptr) + THROW_GNA_EXCEPTION << "out of memory in AmIntelDnn::FillGNAStruct()"; + pLayer = ptr_nnet->pLayers; + + for (int i = 0; i < component.size(); i++) { + // std::cout << "Component + " << i <<"=GNA_" << std::distance(ptr_nnet->pLayers, pLayer) << "\n"; + switch (component[i].operation) { + case kDnnAffineOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed + pLayer->nLayerKind = INTEL_AFFINE; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE layer structure."; + } + auto pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + pAffineLayer->pwl.pSegments = nullptr; + 
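
// [editorial sketch] InitGNAStruct above allocates one intel_nnet_layer_t per
// affine/diagonal/convolutional/recurrent/interleave/deinterleave/copy component;
// piecewise-linear and max-pool components are folded into the preceding layer,
// and pLayer only advances when the following component is not a PWL. A standalone
// sketch of that counting rule over an assumed component sequence (names are
// hypothetical, not the patch's types):
#include <vector>
#include <string>
#include <iostream>

int main() {
    // assumed component stream: affine+PWL, conv1d+maxpool+PWL, copy
    std::vector<std::string> ops = {"affine", "pwl", "conv1d", "maxpool", "pwl", "copy"};
    int gna_layers = 0;
    for (const auto &op : ops) {
        if (op != "pwl" && op != "maxpool") {
            ++gna_layers;  // only primitive ops create a GNA layer
        }
    }
    std::cout << "components: " << ops.size()
              << ", GNA layers: " << gna_layers << "\n";  // 6 components -> 3 layers
    return 0;
}
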
pAffineLayer->pwl.nSegments = 0; + + pAffineLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias; + pAffineLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight; + pAffineLayer->affine.pBiases = component[i].op.affine.ptr_biases; + pAffineLayer->affine.pWeights = component[i].op.affine.ptr_weights; + } + if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) { + pLayer++; + } + break; + case kDnnDiagonalOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed + pLayer->nLayerKind = INTEL_AFFINE_DIAGONAL; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE_DIAGONAL layer structure."; + } + auto pDiagonalLayer = reinterpret_cast(pLayer->pLayerStruct); + pDiagonalLayer->pwl.pSegments = nullptr; + pDiagonalLayer->pwl.nSegments = 0; + + pDiagonalLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias; + pDiagonalLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight; + pDiagonalLayer->affine.pBiases = component[i].op.affine.ptr_biases; + pDiagonalLayer->affine.pWeights = component[i].op.affine.ptr_weights; + } + if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) { + pLayer++; + } + break; + case kDnnRecurrentOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed + pLayer->nLayerKind = INTEL_RECURRENT; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_recurrent_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_RECURRENT layer structure."; + } + auto pRecurrentLayer = reinterpret_cast(pLayer->pLayerStruct); + pRecurrentLayer->pFeedbackBuffer = component[i].op.recurrent.ptr_feedbacks; + pRecurrentLayer->pwl.pSegments = nullptr; + pRecurrentLayer->pwl.nSegments = 0; + + pRecurrentLayer->affine.nBytesPerBias = component[i].op.recurrent.num_bytes_per_bias; + pRecurrentLayer->affine.nBytesPerWeight = component[i].op.recurrent.num_bytes_per_weight; + pRecurrentLayer->affine.pBiases = component[i].op.recurrent.ptr_biases; + pRecurrentLayer->affine.pWeights = component[i].op.recurrent.ptr_weights; + } + if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) { + pLayer++; + } + break; + case 
kDnnConvolutional1dOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten + pLayer->nLayerKind = INTEL_CONVOLUTIONAL; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_CONVOLUTIONAL layer structure."; + } + auto pConvolutionalLayer = reinterpret_cast(pLayer->pLayerStruct); + pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias; + pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight; + pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters; + pConvolutionalLayer->nFilterRows = component[i].op.conv1D.num_filter_rows; + pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients; + pConvolutionalLayer->nFeatureMaps = component[i].op.conv1D.num_feature_maps; + pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows; + pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns; + pConvolutionalLayer->poolType = INTEL_NO_POOLING; // will be overwritten + pConvolutionalLayer->nPoolSize = 0; // will be overwritten + pConvolutionalLayer->nPoolStride = 0; // will be overwritten + pConvolutionalLayer->pwl.nSegments = 0; // will be overwritten + pConvolutionalLayer->pwl.pSegments = nullptr; // will be overwritten + pConvolutionalLayer->pBiases = component[i].op.conv1D.ptr_biases; + pConvolutionalLayer->pFilters = component[i].op.conv1D.ptr_filters; + } + if (i == component.size() - 1 || ((component[i + 1].operation != kDnnMaxPoolOp) + && (component[i + 1].operation != kDnnPiecewiselinearOp))) { + pLayer++; + } + break; + case kDnnMaxPoolOp: + if (i == 0) { + THROW_GNA_EXCEPTION << "Pooling component with no preceeding component"; + } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) { + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION "INTEL_CONVOLUTIONAL layer structure was not initialized."; + } + auto pConvolutionalLayer = reinterpret_cast(pLayer->pLayerStruct); + // it is possible to have activation preceding to maxpool + if (pConvolutionalLayer->pwl.nSegments != 0) { + THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i; + } else { + pConvolutionalLayer->poolType = + (component[i].op.maxpool.do_sum_not_max) ? 
INTEL_SUM_POOLING : INTEL_MAX_POOLING; + pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs; + pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step; + + + // number of output columns correction - based on GNA-library expectations + auto nFltSize = pConvolutionalLayer->nFilterCoefficients; + auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns; // always move 1 "row" + auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1; + // FLAT input matrix, pooled outputs per filter + pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1); + + // old code + // pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride; + } + } else { + THROW_GNA_EXCEPTION << "Pooling component applied to non-convolutional layer"; + } + break; + case kDnnPiecewiselinearOp: + pLayer->pOutputs = component[i].ptr_outputs; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << pLayer->nLayerKind << " layer structure was not initialized."; + } + if (i == 0) { + THROW_GNA_EXCEPTION << "PWL component with no preceding component."; + } else if ((component[i - 1].operation == kDnnAffineOp) + || (component[i - 1].operation == kDnnDiagonalOp)) { + auto pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + pAffineLayer->pwl.nSegments = component[i].op.pwl.num_segments; + pAffineLayer->pwl.pSegments = component[i].op.pwl.ptr_segments; + } else if (component[i - 1].operation == kDnnRecurrentOp) { + auto pRecurrentLayer = reinterpret_cast(pLayer->pLayerStruct); + pRecurrentLayer->pwl.nSegments = component[i].op.pwl.num_segments; + pRecurrentLayer->pwl.pSegments = component[i].op.pwl.ptr_segments; + } else if ((component[i - 1].operation == kDnnConvolutional1dOp) + || ((component[i - 1].operation == kDnnMaxPoolOp) + && (component[i - 2].operation == kDnnConvolutional1dOp))) { + auto pConvolutionalLayer = reinterpret_cast(pLayer->pLayerStruct); + pConvolutionalLayer->pwl.nSegments = component[i].op.pwl.num_segments; + pConvolutionalLayer->pwl.pSegments = component[i].op.pwl.ptr_segments; + if (component[i - 1].operation != kDnnMaxPoolOp) { + pLayer->nOutputColumns = component[i].num_columns_out; + } + } + pLayer++; + + break; + case kDnnInterleaveOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = nullptr; + pLayer->pOutputs = component[i].ptr_outputs; + pLayer->nLayerKind = INTEL_INTERLEAVE; + pLayer->pLayerStruct = nullptr; + pLayer++; + break; + case kDnnDeinterleaveOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = nullptr; + pLayer->pOutputs = component[i].ptr_outputs; + 
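
// [editorial sketch] The max-pooling branch above recomputes the convolutional
// layer's nOutputColumns: first the number of convolution outputs per filter
// (maxNCOE) for the flat input matrix, then how many survive pooling at the given
// stride, times the number of filters. The numbers below are assumed, for
// illustration only.
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t nInputColumns = 480;   // assumed flat input width
    uint32_t nFltSize = 96;         // nFilterCoefficients
    uint32_t fltStrideSz = 48;      // nFeatureMaps * nFeatureMapColumns (one "row")
    uint32_t nFilters = 4;
    uint32_t nPoolStride = 3;

    uint32_t maxNCOE = (nInputColumns - nFltSize) / fltStrideSz + 1;         // 9 conv outputs per filter
    uint32_t nOutputColumns = nFilters * ((maxNCOE - 1) / nPoolStride + 1);  // 4 * 3 = 12
    std::printf("maxNCOE=%u, pooled output columns=%u\n", maxNCOE, nOutputColumns);
    return 0;
}
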
pLayer->nLayerKind = INTEL_DEINTERLEAVE; + pLayer->pLayerStruct = nullptr; + pLayer++; + break; + case kDnnCopyOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = nullptr; + pLayer->pOutputs = component[i].ptr_outputs; + pLayer->nLayerKind = INTEL_COPY; + pLayer->pLayerStruct = nullptr; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << pLayer->nLayerKind << " could not allocate memory for INTEL_COPY layer structure."; + } + auto *pCopyLayer = reinterpret_cast(pLayer->pLayerStruct); + pCopyLayer->nCopyRows = component[i].op.copy.num_copy_rows; + pCopyLayer->nCopyCols = component[i].op.copy.num_copy_columns; + } + pLayer++; + break; + default: { + THROW_GNA_EXCEPTION << "GNA does yet not support " << intel_dnn_operation_name[component[i].operation]; + } + } + } + // enable debugging of partial array of components + ptr_nnet->nLayers = std::distance(ptr_nnet->pLayers, pLayer); +} + +void AmIntelDnn::DestroyGNAStruct(intel_nnet_type_t *ptr_nnet) { + ptr_nnet->nGroup = 0; + if (ptr_nnet->pLayers != nullptr) { + for (int i = 0; i < ptr_nnet->nLayers; i++) { + switch (ptr_nnet->pLayers[i].nLayerKind) { + case INTEL_AFFINE:break; + case INTEL_AFFINE_DIAGONAL:break; + case INTEL_RECURRENT:break; + case INTEL_CONVOLUTIONAL:break; + case INTEL_INTERLEAVE:break; + case INTEL_DEINTERLEAVE:break; + case INTEL_COPY:break; + default:break; + } + if (ptr_nnet->pLayers[i].pLayerStruct != nullptr) { + _mm_free(ptr_nnet->pLayers[i].pLayerStruct); + } + } + if (ptr_nnet->pLayers != nullptr) { + _mm_free(ptr_nnet->pLayers); + } + } + ptr_nnet->nLayers = 0; +} + +void AmIntelDnn::GetScaledOutput(float *ptr_output, uint32_t component_index) { + if (component_index > num_components()) { + fprintf(stderr, "Illegal component index %d in GetScaledOutput\n", component_index); + throw -1; + } + if (ptr_output != nullptr) { + float scale_factor = OutputScaleFactor(component_index); + uint32_t num_elements = component[component_index].num_rows_out * component[component_index].num_columns_out; + if (number_type_ == kDnnFloat) { + float *ptr_input = reinterpret_cast(component[component_index].ptr_outputs); + for (uint32_t i = 0; i < num_elements; i++) { + ptr_output[i] = ptr_input[i] / scale_factor; + } + } else if (component[component_index].num_bytes_per_output == 2) { + int16_t *ptr_input = reinterpret_cast(component[component_index].ptr_outputs); + for (uint32_t i = 0; i < num_elements; i++) { + ptr_output[i] = static_cast(ptr_input[i]) / scale_factor; + } + } else { + int32_t *ptr_input = reinterpret_cast(component[component_index].ptr_outputs); + for (uint32_t i = 0; i < num_elements; i++) { + ptr_output[i] = static_cast(ptr_input[i]) / scale_factor; + } + } + } else { + fprintf(stderr, "Output pointer is nullptr in GetScaledOutput\n"); + throw -1; + } +} + +void AmIntelDnn::WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet) { +#ifdef LIGHT_DUMP + if (nnet) { + for (int i = 0; i < nnet->nLayers; i++) { + auto component = nnet->pLayers; + std::stringstream out_file_name; + auto getLayerType = [](intel_layer_kind_t 
kind){ + switch (kind){ + case INTEL_AFFINE : return "affine"; + case INTEL_AFFINE_DIAGONAL : return "diag"; + case INTEL_RECURRENT : return "recurrent"; + case INTEL_CONVOLUTIONAL : return "convolution"; + case INTEL_INTERLEAVE : return "interleave"; + case INTEL_DEINTERLEAVE : return "deinterleave"; + case INTEL_COPY : return "copy"; + default: return "unknown"; + } + }; + out_file_name << std::setfill('0') << std::setw(2) << i << "_" + << getLayerType(component[i].nLayerKind) + << "-" << nnet->pLayers[i].nInputRows + << "-" << nnet->pLayers[i].nOutputRows; + + auto inputfileName = getDumpFolderNameGNA() + out_file_name.str() + "_input.txt"; + auto outFileName = getDumpFolderNameGNA() + out_file_name.str() + "_output.txt"; + auto pwlFileName = getDumpFolderNameGNA() + out_file_name.str() + "_pwl.txt"; + auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt"; + + std::ofstream out_file(outFileName.c_str(), std::ios::out); + std::ofstream pwl_file(pwlFileName.c_str(), std::ios::out); + std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in); + std::ofstream in_file(inputfileName.c_str(), std::ios::out); + + float summOfDiff = 0.f; + float summOfSqDiff = 0.f; + float maxD = 0.0f; + int numItems = 0; + + auto write_pwl = [&pwl_file](intel_pwl_func_t & pwl) { + for (int k =0; k < pwl.nSegments; k++) { + pwl_file << pwl.pSegments[k].slope << ", " << pwl.pSegments[k].xBase << ", " << pwl.pSegments[k].yBase << "\n"; + } + }; + if (nnet->pLayers[i].nLayerKind == INTEL_AFFINE || nnet->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL) { + auto affine = reinterpret_cast(nnet->pLayers[i].pLayerStruct); + write_pwl(affine->pwl); + } + if (nnet->pLayers[i].nLayerKind == INTEL_CONVOLUTIONAL) { + auto conv = reinterpret_cast(nnet->pLayers[i].pLayerStruct); + write_pwl(conv->pwl); + } + + for (int k = 0; k < component[i].nOutputRows; k++) { + for (int j = 0; j < component[i].nOutputColumns; j++) { + float floatValue = 0.f; + if (component[i].nBytesPerOutput == 4) { + auto value = (reinterpret_cast(component[i].pOutputs)[k * component[i].nOutputColumns + j]); + floatValue = (static_cast(value) / 1.0); + } else { + auto value = reinterpret_cast(component[i].pOutputs)[k * component[i].nOutputColumns + j]; + floatValue = (static_cast(value) / 1.0); + } + out_file << std::setw(8) << floatValue << "\n"; + if (ref_out_file) { + float ref_value = 0.f; + ref_out_file >> ref_value; + float diff = (ref_value - floatValue); + diff = diff < 0 ? 
-diff : diff; + summOfDiff += diff; + summOfSqDiff += diff * diff; + maxD = std::max(maxD, diff); + numItems++; + } + } + } + if (numItems) { + auto rmse = sqrt(summOfSqDiff / numItems); + auto avg = summOfDiff / numItems; + std :: cout << std::left << std::setw(55) << out_file_name.str() + << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse + << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg + << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl; + } + + + for (int k = 0; k < component[i].nInputRows; k++) { + for (int j = 0; j < component[i].nInputColumns; j++) { + if (component[i].nBytesPerInput == 4) { + in_file << std::setw(8) + << (reinterpret_cast(component[i].pInputs)[k * component[i].nInputColumns + j]); + } else { + in_file << std::setw(8) + << (reinterpret_cast(component[i].pInputs)[k * component[i].nInputColumns + j]); + } + in_file << "\n"; + } + } + } + } +#endif +} + +void AmIntelDnn::WriteInputAndOutputText() { +#ifdef LIGHT_DUMP + for (int i = 0; i < num_components(); i++) { + std::stringstream out_file_name; + out_file_name << std::setfill('0') << std::setw(2) << i << "_" + << intel_dnn_operation_name[component[i].operation] + << "-" << component[i].num_rows_in + << "-" << component[i].num_rows_out; + if (component[i].operation == kDnnPiecewiselinearOp) { + out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id]; + } + auto inputfileName = getDumpFolderName() + out_file_name.str() + "_input.txt"; + auto outFileName = getDumpFolderName() + out_file_name.str() + "_output.txt"; + auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt"; + + std::ofstream out_file(outFileName.c_str(), std::ios::out); + std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in); + std::ofstream in_file(inputfileName.c_str(), std::ios::out); + + float summOfDiff = 0.f; + float summOfSqDiff = 0.f; + float maxD = 0.0f; + int numItems = 0; + + for (int k = 0; k < component[i].num_rows_out; k++) { + for (int j = 0; j < component[i].num_columns_out; j++) { + float floatValue = 0.f; + if (component[i].num_bytes_per_output == 4) { + if (number_type_ == kDnnInt) { + auto value = (reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]); + // out_file << std::setw(8) << value << "\n"; + floatValue = (static_cast(value) / component[i].output_scale_factor); + + } else { + floatValue = (reinterpret_cast(component[i].ptr_outputs)[ + k * component[i].num_columns_out+ j]) / component[i].output_scale_factor; + } + } else { + auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; + // out_file << std::setw(8) << value << "\n"; + floatValue = (static_cast(value) / component[i].output_scale_factor); + } + out_file << std::setw(8) << floatValue << "\n"; + if (ref_out_file) { + float ref_value = 0.f; + ref_out_file >> ref_value; + float diff = (ref_value - floatValue); + diff = diff < 0.f ? 
-diff : diff; + summOfDiff += diff; + summOfSqDiff += diff * diff; + maxD = std::max(maxD, diff); + numItems++; + } + } + } + if (numItems) { + auto rmse = sqrt(summOfSqDiff / numItems); + auto avg = summOfDiff / numItems; + std :: cout << std::left << std::setw(55) << out_file_name.str() + << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse + << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg + << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl; + } + + + for (int k = 0; k < component[i].num_rows_in; k++) { + for (int j = 0; j < component[i].num_columns_in; j++) { + if (component[i].num_bytes_per_input == 4) { + if (number_type_ == kDnnInt) { + in_file << std::setw(8) + << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + + j]); + } else { + in_file << std::setw(8) + << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + + j]); + } + } else { + in_file << std::setw(8) + << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + + j]); + } + in_file << "\n"; + } + } +#endif + } +} + +bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2) { + bool isCompatible = true; + + // compare basic structures to see if they are compatible + if (dnn1.num_components() != dnn2.num_components()) isCompatible = false; + for (int i = 0; i < dnn1.num_components(); i++) { + if (dnn1.component[i].num_rows_in != dnn2.component[i].num_rows_in) isCompatible = false; + if (dnn1.component[i].num_columns_in != dnn2.component[i].num_columns_in) isCompatible = false; + if (dnn1.component[i].num_rows_out != dnn2.component[i].num_rows_out) isCompatible = false; + if (dnn1.component[i].num_columns_out != dnn2.component[i].num_columns_out) isCompatible = false; + if (dnn1.component[i].operation != dnn2.component[i].operation) isCompatible = false; + } + + return (isCompatible); +} + +void ClearScoreError(intel_score_error_t *error) { + error->num_scores = 0; + error->num_errors = 0; + error->max_error = 0.0; + error->sum_error = 0.0; + error->sum_squared_error = 0.0; + error->max_rel_error = 0.0; + error->sum_rel_error = 0.0; + error->sum_squared_rel_error = 0.0; +} + +void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error) { + total_error->num_errors += error->num_errors; + total_error->num_scores += error->num_scores; + total_error->sum_error += error->sum_error; + total_error->sum_squared_error += error->sum_squared_error; + if (error->max_error > total_error->max_error) { + total_error->max_error = error->max_error; + } + total_error->sum_rel_error += error->sum_rel_error; + total_error->sum_squared_rel_error += error->sum_squared_rel_error; + if (error->max_rel_error > total_error->max_rel_error) { + total_error->max_rel_error = error->max_rel_error; + } +} + +void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs) { + // Assumes input vector contains log likelihoods + // The computes x[i] = x[i] - log(sum_j exp(x[j])) + // This normalizes the likelihoods by the sum of likelihoods but stores them as log likelihoods + + float max_score = ptr_input[0]; + float sum = 0.0; + float diff; + // find max score for normalization to [0,1] + for (uint32_t i = 0; i < num_inputs; i++) { + if (ptr_input[i] > max_score) { + max_score = ptr_input[i]; + } + } + for (uint32_t i = 0; i < num_inputs; i++) { + sum += exp(ptr_input[i] - max_score); + } + if (sum < 
1.0e-20) { + fprintf(stderr, "Warning: attempt to take log(0) in SoftmaxGoogle()!\n"); + sum = 1.0e-20; + } + diff = max_score + log(sum); + for (uint32_t i = 0; i < num_outputs; i++) { + ptr_output[i] = ptr_input[i] - diff; + } +} diff --git a/inference-engine/src/gna_plugin/dnn.h b/inference-engine/src/gna_plugin/dnn.h new file mode 100644 index 00000000000000..8a1506dbe88ef9 --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn.h @@ -0,0 +1,823 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gna-api.h" + +#define DNN_MAX_BATCH_SIZE 8 +#define DNN_MAX_INPUTS 3072 +#define DNN_MAX_OUTPUTS 8192 +#define DNN_MAX_ERROR 1.0e-4f +#define DNN_NUM_BYTES_INT_BIAS 4 +#define DNN_NUM_BYTES_INT_AFFINE_OUT 4 +#define DNN_RAND_INT8_AMPLITUDE 127.0f +#define DNN_RAND_INT16_AMPLITUDE 16384.0f +#define DNN_RAND_INT32_AMPLITUDE 1048576.0f +#define DNN_RAND_FLOAT32_AMPLITUDE 8.0f + +enum DnnActivationType { + kActNone, + kActSigmoid, + kActTanh, + kActRelu, + kActLeakyRelu, + kActIdentity, + kActKaldiLstmClipping, + kActCustom, + kActNumType +}; +struct DnnActivation { + // for prelu + DnnActivationType type; + float negative_slope; + operator DnnActivationType () const noexcept { + return type; + } + static DnnActivation fromType(DnnActivationType type) { + DnnActivation activation; + activation.type = type; + activation.negative_slope = 0.0f; + return activation; + } +}; + +static_assert(std::is_trivial::value, "DnnActivation is not trival type"); + +static const char *intel_dnn_activation_name[kActNumType] = { + "kActNone", + "kActSigmoid", + "kActTanh", + "kActRelu", + "kActLeakyRelu", + "kActIdentity", + "kActKaldiLstmClipping", + "kActCustom" +}; + +typedef enum DnnSoftmaxType { + kSoftmaxNone, + kSoftmaxKaldiSumgroup, + kSoftmaxEesen, + kSoftmaxGoogle, + kSoftmaxNumType +} intel_dnn_softmax_type_t; + +static const char *intel_dnn_softmax_name[kSoftmaxNumType] = { + "kSoftmaxNone", + "kSoftmaxKaldiSumGroup", + "kSoftmaxKaldiApplyLog", + "kSoftmaxGoogle" +}; + +typedef enum { + kDnnUnknownOrientation, + kDnnInterleavedOrientation, + kDnnNonInterleavedOrientation, + kDnnNumOrientation +} intel_dnn_orientation_t; + +typedef enum { + kDnnNullOp, + kDnnAffineOp, + kDnnDiagonalOp, + kDnnConvolutional1dOp, + kDnnPiecewiselinearOp, + kDnnMaxPoolOp, + kDnnRecurrentOp, + kDnnInterleaveOp, + kDnnDeinterleaveOp, + kDnnCopyOp, + kDnnNumOp +} intel_dnn_operation_t; + +static const char *intel_dnn_operation_name[kDnnNumOp] = { + "kDnnNullOp", + "kDnnAffineOp", + "kDnnDiagonalOp", + "kDnnConvolutional1dOp", + "kDnnPiecewiselinearOp", + "kDnnMaxPoolOp", + "kDnnRecurrentOp", + "kDnnInterleaveOp", + "kDnnDeinterleaveOp", + "kDnnCopyOp" +}; + +typedef enum { + kDnnMacroOpNone, + kDnnMacroOpLstm, + kDnnMacroOpBiLstm, + kDnnNumMacroOp +} intel_dnn_macro_operation_t; + +static const char *intel_dnn_macro_operation_name[kDnnNumMacroOp] = { + "kDnnMacroOpNone", + "kDnnMacroOpLstm", + "kDnnMacroOpBiLstm" +}; + +typedef enum { + kDnnFloat, + kDnnInt, + kDnnNumNumberType +} intel_dnn_number_type_t; + +static const char *intel_dnn_number_type_name[kDnnNumNumberType] = { + "kDnnFloat", + "kDnnInt" +}; + +typedef struct { + uint32_t num_bytes_per_weight; + uint32_t num_bytes_per_bias; + float weight_scale_factor; + void *ptr_weights; + void *ptr_biases; +} intel_affine_t; + +typedef struct { + uint32_t num_bytes_per_weight; + uint32_t 
num_bytes_per_bias; + uint32_t num_filters; + uint32_t num_filter_rows; + uint32_t num_filter_coefficients; + uint32_t num_feature_maps; + uint32_t num_feature_map_rows; + uint32_t num_feature_map_columns; + float weight_scale_factor; + void *ptr_filters; // filters stored one after the other + void *ptr_biases; +} intel_convolutionalD_t; + +typedef struct { + uint32_t num_inputs; // pool size + uint32_t num_inputs_step; // pool step + uint32_t num_inputs_stride; // pool stride (number of convolution filters) + bool do_sum_not_max; +} intel_maxpool_t; + +typedef struct { + DnnActivation func_id; // identifies function being approximated + uint32_t num_segments; + intel_pwl_segment_t *ptr_segments; +} intel_piecewiselinear_t; + +typedef struct { + uint32_t num_vector_delay; + uint32_t num_bytes_per_weight; + uint32_t num_bytes_per_bias; + float weight_scale_factor; + void *ptr_feedbacks; + void *ptr_weights; + void *ptr_biases; +} intel_recurrent_t; + +typedef struct { +} intel_interleave_t; + +typedef struct { +} intel_deinterleave_t; + +typedef struct { + uint32_t num_copy_columns; // number of columns to copy + uint32_t num_copy_rows; // number of rows to copy +} intel_copy_t; + +typedef struct { + uint32_t num_rows_in; + uint32_t num_columns_in; + uint32_t num_rows_out; + uint32_t num_columns_out; + uint32_t num_bytes_per_input; + uint32_t num_bytes_per_output; + intel_dnn_operation_t operation; + intel_dnn_macro_operation_t macro_operation; + intel_dnn_orientation_t orientation_in; + intel_dnn_orientation_t orientation_out; + union operation_struct_t { + intel_affine_t affine; + intel_convolutionalD_t conv1D; + intel_maxpool_t maxpool; + intel_piecewiselinear_t pwl; + intel_recurrent_t recurrent; + intel_interleave_t interleave; + intel_deinterleave_t deinterleave; + intel_copy_t copy; + } op; + void *ptr_inputs; + void *ptr_outputs; + float output_scale_factor; +} intel_dnn_component_t; + +typedef struct { + uint32_t num_scores; + uint32_t num_errors; + float threshold; + float max_error; + float rms_error; + float sum_error; + float sum_rms_error; + float sum_squared_error; + float max_rel_error; + float sum_rel_error; + float sum_squared_rel_error; +} intel_score_error_t; + +class AmIntelDnn { + public: + AmIntelDnn() + : ptr_active_outputs_(NULL), + num_active_outputs_(0), + input_scale_factor_(1.0), + num_left_context(0), + num_right_context(0), + do_rotate_input(false), + num_rotate_rows(0), + num_rotate_columns(0), + softmax_type(kSoftmaxNone), + ptr_sumgroup_sizes(NULL), + num_sumgroup_sizes(0), + ptr_priors(NULL) { + } + + ~AmIntelDnn() { + component.clear(); + if (ptr_sumgroup_sizes != NULL) { + _mm_free(ptr_sumgroup_sizes); + } + if (ptr_priors != NULL) { + _mm_free(ptr_priors); + } + } + + uint32_t num_components() { return (uint32_t) component.size(); } + + void Init(void *ptr_memory, uint32_t num_memory_bytes, intel_dnn_number_type_t number_type, float scale_factor); + void InitActiveList(uint32_t *ptr_active_list); + + template + static void InitAffineComponent(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs, + C *&ptr_weights, + D *&ptr_biases, + bool isDiag = false) { + InitAffineComponentPrivate(comp, + num_rows_in, + num_columns, + num_rows_out, + num_bytes_per_input, + num_bytes_per_output, + 
num_bytes_per_weight, + num_bytes_per_bias, + weight_scale_factor, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_weights, + (void *&) ptr_biases, + isDiag, + true); + } + + template + void InitAffineComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs, + C *&ptr_weights, + D *&ptr_biases, + bool isDiag = false) { + InitAffineComponentPrivate(component[component_index], + num_rows_in, + num_columns, + num_rows_out, + num_bytes_per_input, + num_bytes_per_output, + num_bytes_per_weight, + num_bytes_per_bias, + weight_scale_factor, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_weights, + (void *&) ptr_biases, + isDiag, + false); + } + + void InitDiagonalComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases); + + template + void InitConvolutional1DComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + A *& ptr_inputs, + B *& ptr_outputs, + C *& ptr_filters, + D *& ptr_biases) { + InitConvolutional1DComponentPrivate(component[component_index], + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_bytes_per_weight, + num_bytes_per_bias, + num_filters, + num_filter_rows, + num_filter_coefficients, + num_feature_maps, + num_feature_map_rows, + num_feature_map_columns, + weight_scale_factor, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_filters, + (void *&) ptr_biases, + false); + } + + template + static void InitConvolutional1DComponent(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + A *& ptr_inputs, + B *& ptr_outputs, + C *& ptr_filters, + D *& ptr_biases) { + InitConvolutional1DComponentPrivate(comp, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_bytes_per_weight, + num_bytes_per_bias, + num_filters, + num_filter_rows, + num_filter_coefficients, + num_feature_maps, + num_feature_map_rows, + num_feature_map_columns, + weight_scale_factor, + 
output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_filters, + (void *&) ptr_biases, + true); + } + + + + // TODO: this function accepts component_index and is only used in legacy code + void InitMaxpoolComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + void * ptr_inputs, + void * ptr_outputs) { + InitMaxpoolComponentPrivate(component[component_index], + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_pool_size, + num_pool_step, + num_pool_stride, + do_sum_not_max, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + false); + } + + template + static void InitMaxpoolComponent(intel_dnn_component_t &cmp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs) { + InitMaxpoolComponentPrivate(cmp, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_pool_size, + num_pool_step, + num_pool_stride, + do_sum_not_max, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + true); + } + + + + + void InitPiecewiseLinearComponent(uint32_t component_index, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + void * ptr_inputs, + void * ptr_outputs, + intel_pwl_segment_t *ptr_segments) { + InitPiecewiseLinearComponentPrivate(component[component_index], + function_id, + orientation, + num_rows, + num_columns, + num_bytes_per_input, + num_bytes_per_output, + num_segments, + output_scale_factor, + ptr_inputs, + ptr_outputs, + ptr_segments, + false); + } + template + static void InitPiecewiseLinearComponent(intel_dnn_component_t &cmp, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs, + intel_pwl_segment_t *ptr_segments) { + InitPiecewiseLinearComponentPrivate(cmp, + function_id, + orientation, + num_rows, + num_columns, + num_bytes_per_input, + num_bytes_per_output, + num_segments, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + ptr_segments, + true); + } + + + void InitRecurrentComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns_in, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_vector_delay, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_feedbacks, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases); + void InitInterleaveComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns, + uint32_t
num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs); + void InitDeinterleaveComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs); + void InitCopyComponent(uint32_t component_index, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + void *ptr_inputs, + void *ptr_outputs) { + InitCopyComponentPrivate(component[component_index], + orientation, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + output_scale_factor, + num_copy_rows, + num_copy_columns, + ptr_inputs, + ptr_outputs, + false); + } + + template + static void InitCopyComponent(intel_dnn_component_t &cmp, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + A *&ptr_inputs, + B *&ptr_outputs) { + InitCopyComponentPrivate(cmp, + orientation, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + output_scale_factor, + num_copy_rows, + num_copy_columns, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + true); + } + void AddComponents(uint32_t num_components_to_add); + void ClearComponent(uint32_t component_index); + void ClearState(); + uint32_t CopyActiveList(std::vector > &active_list, uint32_t list_index); + void Propagate(); + intel_dnn_macro_operation_t MacroOperation(uint32_t component_index); + void SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation); + float InputScaleFactor(uint32_t component_index); + float WeightScaleFactor(uint32_t component_index); + float OutputScaleFactor(uint32_t component_index) { + return OutputScaleFactor(component[component_index]); + } + float OutputScaleFactor(intel_dnn_component_t &comp); + void SetInputScaleFactor(float scale_factor) { input_scale_factor_ = scale_factor; } + void SetOutputScaleFactor(uint32_t component_index, float scale_factor); + void PrintOutputs(uint32_t component_index); + uint32_t CompareScores(void *ptr_scores, intel_score_error_t *score_error, uint32_t num_frames); + void WriteGraphWizModel(const char *filename); + void WriteDnnText(const char *filename, intel_dnn_number_type_t number_type); + uint32_t MemoryRequiredToReadDnnText(const char *filename); + void ReadDnnText(const char *filename, void *ptr_memory, uint32_t num_memory_bytes, float *ptr_scale_in); + + void InitGNAStruct(intel_nnet_type_t *ptr_nnet); + void DestroyGNAStruct(intel_nnet_type_t *ptr_nnet); + void GetScaledOutput(float *ptr_output, uint32_t component_index); + uint32_t *ptr_active_outputs() { return (ptr_active_outputs_); } + uint32_t num_active_outputs() { return (num_active_outputs_); } + uint32_t num_gna_layers() { + uint32_t num_layers = 0; + for (uint32_t i = 0; i < component.size(); i++) { + if ((component[i].operation == kDnnAffineOp) || (component[i].operation == kDnnDiagonalOp) + || (component[i].operation == 
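+ /* num_gna_layers() counts only component types that become standalone GNA layers; piecewise-linear and max-pool components are omitted here since they are realized as part of the preceding layer */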
kDnnConvolutional1dOp) || (component[i].operation == kDnnCopyOp) + || (component[i].operation == kDnnDeinterleaveOp) || (component[i].operation == kDnnInterleaveOp) + || (component[i].operation == kDnnRecurrentOp)) { + num_layers++; + } + } + return (num_layers); + } + uint32_t num_group_in() { + return ((component.size() > 0) ? ((component[0].orientation_in == kDnnInterleavedOrientation) + ? component[0].num_columns_in : component[0].num_rows_in) : 0); + } + uint32_t num_group_out() { + return ((component.size() > 0) ? ((component[component.size() - 1].orientation_out + == kDnnInterleavedOrientation) ? component[component.size() - 1].num_columns_out : component[ + component.size() - 1].num_rows_out) : 0); + } + + std::vector component; + uint32_t num_left_context; + uint32_t num_right_context; + bool do_rotate_input; + uint32_t num_rotate_rows = 0; + uint32_t num_rotate_columns = 0; + DnnSoftmaxType softmax_type; + uint32_t *ptr_sumgroup_sizes; + uint32_t num_sumgroup_sizes; + float *ptr_priors; + + void WriteInputAndOutputText(); + static void WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet); + void BeginNewWrite(); + + private: + void *ptr_dnn_memory_; + uint32_t num_bytes_dnn_memory_; + uint32_t *ptr_active_outputs_; + uint32_t num_active_outputs_; + intel_dnn_number_type_t number_type_; + float input_scale_factor_; + + static void InitCopyComponentPrivate(intel_dnn_component_t &cmp, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem); + + static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem); + + static void InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &cmp, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + void *& ptr_inputs, + void *& ptr_outputs, + intel_pwl_segment_t *ptr_segments, + bool postInitMem); + + static void InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + void *& ptr_inputs, + void *& ptr_outputs, + void *& ptr_filters, + void *& ptr_biases, + bool postInitMem); + + static void InitAffineComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t 
num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + void *&ptr_weights, + void *&ptr_biases, + bool isDiag, + bool postInitMem); +}; + +void PlotFloatIntDnn(AmIntelDnn *dnn, AmIntelDnn *dnn_int); +bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2); +void ClearScoreError(intel_score_error_t *error); +void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error); +void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs); diff --git a/inference-engine/src/gna_plugin/dnn_memory.cpp b/inference-engine/src/gna_plugin/dnn_memory.cpp new file mode 100644 index 00000000000000..16496b5bf33c5c --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn_memory.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "dnn_memory.hpp" +#include "gna-api.h" + +void MemoryAssign(void **ptr_dest, + void **ptr_memory, + uint32_t num_bytes_needed, + uint32_t *ptr_num_bytes_used, + uint32_t num_memory_bytes, + const char *name) { + if (*ptr_num_bytes_used + ALIGN(num_bytes_needed, 64) > num_memory_bytes) { + fprintf(stderr, + "Out of memory in %s (%d+ALIGN(%d)>%d)!\n", + name, + *ptr_num_bytes_used, + num_bytes_needed, + num_memory_bytes); + throw -1; + } else { + uint8_t *ptr_bytes = reinterpret_cast(*ptr_memory); + *ptr_dest = *ptr_memory; + *ptr_memory = ptr_bytes + ALIGN(num_bytes_needed, 64); + *ptr_num_bytes_used += ALIGN(num_bytes_needed, 64); + } +} diff --git a/inference-engine/src/gna_plugin/dnn_memory.hpp b/inference-engine/src/gna_plugin/dnn_memory.hpp new file mode 100644 index 00000000000000..5ab2c961f40ade --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn_memory.hpp @@ -0,0 +1,13 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +extern void MemoryAssign(void **ptr_dest, + void **ptr_memory, + uint32_t num_bytes_needed, + uint32_t *ptr_num_bytes_used, + uint32_t num_memory_bytes, + const char *name); diff --git a/inference-engine/src/gna_plugin/dnn_traits.hpp b/inference-engine/src/gna_plugin/dnn_traits.hpp new file mode 100644 index 00000000000000..0a92bb342013b3 --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn_traits.hpp @@ -0,0 +1,90 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "dnn.h" + +template +struct DnnTrait {}; + +template<> +struct DnnTrait { + using Type = intel_affine_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.affine; + } +}; + +template<> +struct DnnTrait { + using Type = intel_piecewiselinear_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.pwl; + } +}; + +template<> +struct DnnTrait { + using Type = intel_affine_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.affine; + } +}; + +template<> +struct DnnTrait { + using Type = intel_convolutionalD_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.conv1D; + } +}; + +template<> +struct DnnTrait { + using Type = intel_maxpool_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.maxpool; + } +}; + +template<> +struct DnnTrait { + using Type = intel_recurrent_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.recurrent; + } +}; + +template<> +struct 
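+/* each DnnTrait specialization maps an intel_dnn_operation_t value to the matching member of the component's op union, giving templated code a uniform way to reach the per-layer struct */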
DnnTrait { + using Type = intel_interleave_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.interleave; + } +}; + +template<> +struct DnnTrait { + using Type = intel_deinterleave_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.deinterleave; + } +}; + +template<> +struct DnnTrait { + using Type = intel_copy_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.copy; + } +}; + +template<> +struct DnnTrait { + using Type = void; + static Type *getLayer(intel_dnn_component_t &component) { + return nullptr; + } +}; diff --git a/inference-engine/src/gna_plugin/floatmath.cpp b/inference-engine/src/gna_plugin/floatmath.cpp new file mode 100644 index 00000000000000..3ea41127959395 --- /dev/null +++ b/inference-engine/src/gna_plugin/floatmath.cpp @@ -0,0 +1,423 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "floatmath.h" +#include "pwl.h" +#include "gna_plugin_log.hpp" +#include + + +void CNNFilter32(intel_dnn_component_t *component) { + float *ptr_filters = reinterpret_cast(component->op.conv1D.ptr_filters); + float *ptr_biases = reinterpret_cast(component->op.conv1D.ptr_biases); + float *ptr_inputs = reinterpret_cast(component->ptr_inputs); + float *ptr_outputs = reinterpret_cast(component->ptr_outputs); + uint32_t num_group = component->num_rows_in; + uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1; + uint32_t + num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns; + uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients; + + if ((component->num_rows_in != 1) || (component->num_rows_out != 1) + || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) { + THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!"; + } + + for (uint32_t j = 0; j < num_filter_outputs; j++) { + float *ptr_in = ptr_inputs + j * num_inputs_band_stride; + for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) { + float *ptr_coef = ptr_filters + i * num_filter_coefficients; + float sum = ptr_biases[i]; + for (uint32_t k = 0; k < num_filter_coefficients; k++) { + sum += ptr_in[k] * ptr_coef[k]; + } + ptr_outputs[j * component->op.conv1D.num_filters + i] = sum; + } + } +} + +void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) { + if (number_type == kDnnInt) { + int32_t *ptr_inputs = reinterpret_cast(component->ptr_inputs); + int32_t *ptr_outputs = reinterpret_cast(component->ptr_outputs); + uint32_t num_inputs = component->num_columns_in; + uint32_t num_columns = component->op.maxpool.num_inputs_stride; + uint32_t num_pool_size = component->op.maxpool.num_inputs; + uint32_t num_pool_step = component->op.maxpool.num_inputs_step; + uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride; + uint32_t num_rows_out = num_rows_in / num_pool_step; + + for (uint32_t i = 0; i < num_columns; i++) { + int32_t m = 0; + if (component->op.maxpool.do_sum_not_max) { + uint32_t num_saturate = 0; + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + int64_t sum = 0; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? 
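+ /* clamp the pooling window so it never reads past the last input row */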
num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + sum += ptr_inputs[k * num_columns + i]; + } + if (sum > 2147483647.0) { + ptr_outputs[m * num_columns + i] = 2147483647L; + num_saturate++; + } else if (sum < -2147483648.0) { + ptr_outputs[m * num_columns + i] = -2147483648L; + num_saturate++; + } else { + ptr_outputs[m * num_columns + i] = (int32_t) sum; + } + m++; + } + if (num_saturate > 0) { + fprintf(stderr, "Warning: %d saturations in CNNMaxPool()\n", num_saturate); + } + } else { + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + int32_t max = INT32_MIN; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i]; + } + ptr_outputs[m * num_columns + i] = max; + m++; + } + } + } + } else { + float *ptr_inputs = reinterpret_cast(component->ptr_inputs); + float *ptr_outputs = reinterpret_cast(component->ptr_outputs); + uint32_t num_inputs = component->num_columns_in; + uint32_t num_columns = component->op.maxpool.num_inputs_stride; + uint32_t num_pool_size = component->op.maxpool.num_inputs; + uint32_t num_pool_step = component->op.maxpool.num_inputs_step; + uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride; + uint32_t num_rows_out = num_rows_in / num_pool_step; + + for (uint32_t i = 0; i < num_columns; i++) { + int32_t m = 0; + if (component->op.maxpool.do_sum_not_max) { + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + float sum = 0.0; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + sum += ptr_inputs[k * num_columns + i]; + } + ptr_outputs[m * num_columns + i] = sum; + m++; + } + } else { + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + float max = -1e20f; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? 
num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i]; + } + ptr_outputs[m * num_columns + i] = max; + m++; + } + } + } + } +} + +void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) { + if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation + PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1); + } else { + PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1); + } +} + +void PwlApply16(intel_dnn_component_t *component, + uint32_t num_row_start, + uint32_t num_row_end, + uint32_t num_col_start, + uint32_t num_col_end) { + uint32_t num_saturate = 0; + uint32_t num_segments = component->op.pwl.num_segments; + if (num_segments > 0) { + intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments; + for (int i = num_row_start; i <= num_row_end; i++) { + int32_t *ptr_input = reinterpret_cast(component->ptr_inputs) + i * component->num_columns_in; + int16_t *ptr_output = reinterpret_cast(component->ptr_outputs) + i * component->num_columns_in; + for (int j = num_col_start; j <= num_col_end; j++) { + int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK); + int32_t input = ptr_input[j]; + if (input <= xbase) { + ptr_output[j] = ptr_segment[0].yBase; + } else { + uint32_t slope_shift; + int16_t slope, ybase; + int64_t diff, prod, prod_shift, sum; + uint32_t k = num_segments / 2; + uint32_t k_upper = num_segments; + uint32_t k_lower = 0; + while (k_upper > k_lower + 1) { + xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK); + if (xbase > input) { + k_upper = k; + k = (k + k_lower) / 2; + } else { + k_lower = k; + k = (k_upper + k) / 2; + } + } + xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK); + slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8; + slope = ptr_segment[k].slope; + ybase = ptr_segment[k].yBase; + diff = (int64_t) input - (int64_t) xbase; + prod = diff * slope; + prod_shift = prod >> slope_shift; + sum = prod_shift + (int64_t) ybase; + if (sum > 32767LL) { + ptr_output[j] = 32767; + num_saturate++; + } else if (sum < -32768LL) { + ptr_output[j] = -32768; + num_saturate++; + } else { + ptr_output[j] = (int16_t) sum; + } + } + } + } + } + + if (num_saturate > 0) { + fprintf(stderr, "Warning: %d saturations in PwlApply16!\n", num_saturate); + } +} + +void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) { + if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation + PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1); + } else { + PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1); + } +} + +void PwlApply32(intel_dnn_component_t *component, + uint32_t num_row_start, + uint32_t num_row_end, + uint32_t num_col_start, + uint32_t num_col_end) { + intel_piecewiselinear_t *transform = reinterpret_cast(&component->op.pwl); + float *ptr_in = reinterpret_cast(component->ptr_inputs); + float *ptr_out = reinterpret_cast(component->ptr_outputs); + uint32_t num_columns = component->num_columns_in; + switch (transform->func_id.type) { + case kActSigmoid: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j])); + } + } + break; + case 
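+ /* the sigmoid case above uses the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)); plain tanh follows */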
kActTanh: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]); + } + } + break; + case kActRelu: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = + (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j]; + } + } + break; + case kActIdentity: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j]; + } + } + break; + case kActKaldiLstmClipping: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + float val = ptr_in[i * num_columns + j]; + if (val > KALDI_LSTM_CLIP_UPPER) { + ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER; + } else if (val < KALDI_LSTM_CLIP_LOWER) { + ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER; + } else { + ptr_out[i * num_columns + j] = val; + } + } + } + break; + case kActCustom: + // break; + default:fprintf(stderr, "Unknown piecewise linear function type!\n"); + throw -1; + } +} + +#ifdef __cplusplus +extern "C" { // API uses C linkage so that it can be used by C and C++ applications +#endif + +#ifdef _NO_MKL_ +void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc) { + int i, j, k; + + if (Layout != CblasRowMajor) { + fprintf(stderr, "Only row major is supported in cblas_sgemm!\n"); + throw -1; + } + + if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) { + for (i = 0; i < M; i++) { + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? C[i * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[i * lda + k] * B[k * ldb + j]; + } + C[i * ldc + j] = sum; + } + } + } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) { + for (i = 0; i < M; i++) { + for (j = 0; j < N; j++) { + float sum; + sum = beta * C[i * ldc + j]; + for (k = 0; k < K; k++) { + sum += alpha * A[i * lda + k] * B[j * ldb + k]; + } + C[i * ldc + j] = sum; + } + } + } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) { + for (i = 0; i < M; i++) { + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? 
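+ /* beta == 1 accumulates into the existing C element; any other beta starts the dot product from zero in this reference fallback */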
C[i * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[k * lda + i] * B[k * ldb + j]; + } + C[i * ldc + j] = sum; + } + } + } else { + fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n"); + throw -1; + } +} +void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo, + const MKL_INT N, const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *X, const MKL_INT incX, + const float beta, float *Y, const MKL_INT incY) { + int i, j, k; + + if (Layout != CblasRowMajor) { + fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n"); + throw -1; + } + if (Uplo != CblasLower) { + fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n"); + throw -1; + } + if (K != 0) { + fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n"); + throw -1; + } + if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) { + for (i = 0; i < N; i++) { + Y[i] += A[i] * X[i]; + } + } else { + fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n"); + throw -1; + } +} +#endif // #ifdef _NO_MKL_ + +void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc, + const uint32_t *OutputList, const MKL_INT L) { + int i, j, k, l; + + if (Layout != CblasRowMajor) { + fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n"); + throw -1; + } + + if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) { + for (l = 0; l < L; l++) { + i = OutputList[l]; + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? C[l * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[i * lda + k] * B[k * ldb + j]; + } + C[l * ldc + j] = sum; + } + } + } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) { + for (i = 0; i < M; i++) { + for (l = 0; l < L; l++) { + float sum; + j = OutputList[l]; + sum = beta * C[i * ldc + l]; + for (k = 0; k < K; k++) { + sum += alpha * A[i * lda + k] * B[j * ldb + k]; + } + C[i * ldc + l] = sum; + } + } + } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) { + for (l = 0; l < L; l++) { + i = OutputList[l]; + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? 
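+ /* only the output rows listed in OutputList are computed; l indexes the compacted result while i selects the corresponding row of op(A) */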
C[l * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[k * lda + i] * B[k * ldb + j]; + } + C[l * ldc + j] = sum; + } + } + } else { + fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n"); + throw -1; + } +} + +// C = [ A1 A2 ] * X + B +void sgemv_split(const uint32_t N, + const uint32_t K1, + const uint32_t K2, + const float *A1, + const float *A2, + const float *X, + const float *B, + float *C) { + uint32_t num_columns = K1 + K2; + uint32_t num_rows = N; + uint32_t i, j; + + for (i = 0; i < num_rows; i++) { + float sum = B[i]; + for (j = 0; j < K1; j++) { + sum += A1[j] * X[i * num_columns + j]; + } + for (j = K1; j < num_columns; j++) { + sum += A2[j - K1] * X[i * num_columns + j]; + } + C[i] = sum; + } +} + +#ifdef __cplusplus +} // end extern "C" +#endif diff --git a/inference-engine/src/gna_plugin/floatmath.h b/inference-engine/src/gna_plugin/floatmath.h new file mode 100644 index 00000000000000..ff9bf9938016ab --- /dev/null +++ b/inference-engine/src/gna_plugin/floatmath.h @@ -0,0 +1,71 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#ifndef _NO_MKL_ +#include +#include +#endif +// #include "types.h" +#include "dnn.h" + +#ifndef CBLAS_LAYOUT +#define CBLAS_LAYOUT CBLAS_ORDER +#endif + +#define CNN_MAX_POOL_SIZE 6 + +void CNNFilter32(intel_dnn_component_t *component); +void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type); + +#ifdef _NO_MKL_ +#ifndef _MKL_H_ +#define _MKL_H_ +typedef enum { CblasRowMajor = 101, CblasColMajor = 102 } CBLAS_LAYOUT; +typedef enum { CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113 } CBLAS_TRANSPOSE; +typedef enum { CblasUpper = 121, CblasLower = 122 } CBLAS_UPLO; +typedef enum { CblasNonUnit = 131, CblasUnit = 132 } CBLAS_DIAG; +typedef enum { CblasLeft = 141, CblasRight = 142 } CBLAS_SIDE; +typedef CBLAS_LAYOUT CBLAS_ORDER; /* this for backward compatibility with CBLAS_ORDER */ +#define MKL_INT int +#endif // #ifndef _MKL_H_ +#endif // #ifdef _NO_MKL_ + +#ifdef __cplusplus +extern "C" { // API uses C linkage so that it can be used by C and C++ applications +#endif + +#ifdef _NO_MKL_ +void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc); +void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo, + const MKL_INT N, const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *X, const MKL_INT incX, + const float beta, float *Y, const MKL_INT incY); +#endif // #ifdef _NO_MKL_ +void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc, + const uint32_t *OutputList, const MKL_INT L); +void sgemv_split(const uint32_t N, + const uint32_t K1, + const uint32_t K2, + const float *A1, + const float *A2, + const float *X, + const float *B, + float *C); + +#ifdef __cplusplus +} +#endif + diff --git a/inference-engine/src/gna_plugin/gna_allocator.hpp b/inference-engine/src/gna_plugin/gna_allocator.hpp new file mode 100644 index 00000000000000..ae62b1f76f6b1e --- /dev/null +++ 
b/inference-engine/src/gna_plugin/gna_allocator.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "gna_device.hpp" +#include "polymorh_allocator.hpp" + +/** + * wrap GNA interface into c++ allocator friendly one + */ +class GNAAllocator { + std::reference_wrapper _device; + + public: + typedef uint8_t value_type; + + explicit GNAAllocator(GNADeviceHelper &device) : _device(device) { + } + uint8_t *allocate(std::size_t n) { + uint32_t granted = 0; + auto result = _device.get().alloc(n, &granted); + if (result == nullptr || granted == 0) { + throw std::bad_alloc(); + } + return result; + } + void deallocate(uint8_t *p, std::size_t n) { + _device.get().free(); + } +}; diff --git a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp index bce210b01d2f6e..fb9d2cc2ef152e 100644 --- a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp +++ b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp @@ -1,22 +1,11 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#if defined __INTEL_COMPILER || defined _MSC_VER -#include -#else -#include -#endif #include #include "gna_plugin_log.hpp" - -#if GNA_LIB_VER == 2 -#include -#include -#endif - namespace GNAPluginNS { /** @@ -27,62 +16,9 @@ template class CPPWrapper { }; -#if GNA_LIB_VER == 2 -template <> -class CPPWrapper { - public: - Gna2Model obj; - - CPPWrapper() { - obj.NumberOfOperations = 0; - obj.Operations = nullptr; - } - - /** - * creates nnet structure of n layers - * @param n - number of layers - */ - explicit CPPWrapper(size_t n) { - if (n == 0) { - THROW_GNA_EXCEPTION << "Can't allocate array of intel_nnet_layer_t objects of zero length"; - } - obj.Operations = reinterpret_cast(gnaUserAllocator(n * sizeof(Gna2Operation))); - if (obj.Operations == nullptr) { - THROW_GNA_EXCEPTION << "out of memory in while allocating "<< n << " GNA layers"; - } - obj.NumberOfOperations = n; - for (int i = 0; i < obj.NumberOfOperations; i++) { - obj.Operations[i].Type = Gna2OperationTypeNone; - obj.Operations[i].Operands = nullptr; - obj.Operations[i].NumberOfOperands = 0; - obj.Operations[i].Parameters = nullptr; - obj.Operations[i].NumberOfParameters = 0; - } - } - ~CPPWrapper() { - if (obj.Operations != nullptr) { - for (int i = 0; i < obj.NumberOfOperations; i++) { - freeGna2Operation(obj.Operations[i]); - } - gnaUserFree(obj.Operations); - obj.Operations = nullptr; - } - obj.NumberOfOperations = 0; - } - Gna2Model * operator ->() { - return &obj; - } - Gna2Model * operator *() { - return &obj; - } - operator Gna2Model &() { - return *this; - } -}; -#else template <> class CPPWrapper { -public: + public: intel_nnet_type_t obj; CPPWrapper() { @@ -96,18 +32,14 @@ class CPPWrapper { * @param n - number of layers */ explicit CPPWrapper(size_t n) { - if (n == 0) { - THROW_GNA_EXCEPTION << "Can't allocate array of intel_nnet_layer_t objects of zero length"; - } obj.pLayers = reinterpret_cast(_mm_malloc(n * sizeof(intel_nnet_layer_t), 64)); if (obj.pLayers == nullptr) { - THROW_GNA_EXCEPTION << "out of memory in while allocating " << n << " GNA layers"; + THROW_GNA_EXCEPTION << "out of memory in while allocating "<< n << " GNA layers"; } obj.nLayers = n; for (int i = 0; i < obj.nLayers; i++) { obj.pLayers[i].pLayerStruct = nullptr; } - obj.nGroup = 0; } ~CPPWrapper() { for (int i = 0; i < obj.nLayers; i++) { @@ -123,10 +55,9 @@ 
class CPPWrapper { intel_nnet_type_t * operator *() { return &obj; } - operator intel_nnet_type_t &() { + operator intel_nnet_type_t &() { return *this; } }; -#endif -} // namespace GNAPluginNS +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp index aec8699036c756..3936bc89b4410e 100644 --- a/inference-engine/src/gna_plugin/gna_device.cpp +++ b/inference-engine/src/gna_plugin/gna_device.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,301 +7,62 @@ #include #include #include -#include -#if GNA_LIB_VER == 2 -#include "gna_api_wrapper.hpp" -#include "gna2-device-api.h" -#include "gna2-inference-api.h" -#include "gna2-instrumentation-api.h" -#include "gna2-memory-api.h" -#include "gna2_model_export_helper.hpp" -#else #include "gna-api-status.h" #include "gna-api.h" -#endif #include "details/ie_exception.hpp" #include "gna_plugin_log.hpp" +#include "gna/gna_config.hpp" uint8_t* GNADeviceHelper::alloc(uint32_t size_requested, uint32_t *size_granted) { - void * memPtr; -#if GNA_LIB_VER == 1 - memPtr = GNAAlloc(nGNAHandle, size_requested, size_granted); -#else - const auto status = Gna2MemoryAlloc(size_requested, size_granted, &memPtr); - checkGna2Status(status); -#endif - if (memPtr == nullptr) { - THROW_GNA_EXCEPTION << "GNAAlloc failed to allocate memory. Requested: " << size_requested << " Granted: " << *(size_granted); - } - dumpXNNROPtr = memPtr; - dumpXNNROSize = *size_granted; - return static_cast(memPtr); + return reinterpret_cast(GNAAlloc(nGNAHandle, size_requested, size_granted)); } -void GNADeviceHelper::free(void * ptr) { -#if GNA_LIB_VER == 1 - GNAFree(nGNAHandle); -#else - const auto status = Gna2MemoryFree(ptr); - checkGna2Status(status); -#endif +void GNADeviceHelper::propagateSync(const intel_nnet_type_t *pNeuralNetwork, + const uint32_t *pActiveIndices, + uint32_t nActiveIndices) { + wait(propagate(pNeuralNetwork, pActiveIndices, nActiveIndices)); } -#if GNA_LIB_VER == 1 uint32_t GNADeviceHelper::propagate(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices) { uint32_t reqId; - nGNAStatus = GNAPropagateForward(nGNAHandle, pNeuralNetwork, pActiveIndices, nActiveIndices, &reqId, nGNAProcType); checkStatus(); return reqId; } -#else -void GNADeviceHelper::setUpActiveList(const uint32_t requestConfigId, uint32_t layerIndex, uint32_t* ptr_active_indices, uint32_t num_active_indices) { - const auto status = Gna2RequestConfigEnableActiveList(requestConfigId, layerIndex, num_active_indices, ptr_active_indices); - checkGna2Status(status); -} -void GNADeviceHelper::propagateSync(const uint32_t requestConfigId) { - wait(propagate(requestConfigId)); -} - -uint32_t GNADeviceHelper::propagate(const uint32_t requestConfigId) { - uint32_t reqId; - const auto status = Gna2RequestEnqueue(requestConfigId, &reqId); - checkGna2Status(status); - return reqId; -} - -uint32_t GNADeviceHelper::createModel(const Gna2Model& gnaModel) const { - uint32_t modelId; - const auto status = Gna2ModelCreate(nGnaDeviceIndex, &gnaModel, &modelId); - - checkGna2Status(status, gnaModel); - return modelId; -} - -void GNADeviceHelper::releseModel(const uint32_t model_id) { - const auto status = Gna2ModelRelease(model_id); - checkGna2Status(status); -} - -uint32_t GNADeviceHelper::createRequestConfig(const uint32_t model_id) { - uint32_t reqConfId; - auto 
status = Gna2RequestConfigCreate(model_id, &reqConfId); - checkGna2Status(status); - status = Gna2RequestConfigSetAccelerationMode(reqConfId, gna2AccelerationMode); - checkGna2Status(status); - if (gna2HwConsistency != Gna2DeviceVersionSoftwareEmulation) { - status = Gna2RequestConfigEnableHardwareConsistency(reqConfId, gna2HwConsistency); - checkGna2Status(status); - } - status = Gna2InstrumentationConfigAssignToRequestConfig(instrumentationConfigId, reqConfId); - checkGna2Status(status); - - return reqConfId; -} - -void GNADeviceHelper::checkGna2Status(Gna2Status status, const Gna2Model& gnaModel) { - if (!Gna2StatusIsSuccessful(status)) { - std::vector gna2StatusBuffer(1024); - const auto s = Gna2StatusGetMessage(status, gna2StatusBuffer.data(), gna2StatusBuffer.size()); - if (!Gna2StatusIsSuccessful(s)) - snprintf(gna2StatusBuffer.data(), gna2StatusBuffer.size(), "Gna2StatusGetMessage(%d) returned (%d)", - static_cast(status), static_cast(s)); - if (status == Gna2StatusDeviceIngoingCommunicationError || - status == Gna2StatusDeviceOutgoingCommunicationError) { - THROW_GNA_EXCEPTION << "Unsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ", consider updating the GNA driver"; - } - - Gna2ModelError error; - Gna2ModelGetLastError(&error); - - std::stringstream ss; - ss << "\n GNA Library Error:\n"; - const Gna2ItemType type = error.Source.Type; - const std::string errorType = errorTypes.find(type) != errorTypes.end() - ? errorTypes.at(type) - : "Unknown Error Type"; - - ss << " Type (" << std::to_string(type) << "): " << errorType << "\n"; - - if (error.Source.OperationIndex != GNA2_DISABLED) { - const Gna2OperationType opTypeIndex = gnaModel.Operations[error.Source.OperationIndex].Type; - const std::string operationType = operationTypes.find(opTypeIndex) != operationTypes.end() - ? operationTypes.at(opTypeIndex) - : "Unknown Operation Type"; - const std::string operandType = operandTypes.find({ opTypeIndex, error.Source.OperandIndex }) != operandTypes.end() - ? operandTypes.at({ opTypeIndex, error.Source.OperandIndex }) - : "Unknown Operand Type"; - - ss << " OperationIndex (" << std::to_string(error.Source.OperationIndex) << "): " - << operationType << "\n"; - ss << " OperandIndex(" << std::to_string(error.Source.OperandIndex) << "): " - << operandType << "\n"; - ss << " ParamIndex (" << std::to_string(error.Source.ParameterIndex) << ")\n"; - ss << " DimIndex (" << std::to_string(error.Source.ShapeDimensionIndex) << ")\n"; - } - - const Gna2ErrorType reason = error.Reason; - const std::string errorReason = errorReasons.find(reason) != errorReasons.end() - ? 
errorReasons.at(reason) - : "Unknown Error Reason"; - ss << " Reason (" << std::to_string(reason) << "): " << errorReason << "\n"; - ss << " Value (0x" << std::hex << std::to_string(error.Value) << ")"; - - THROW_GNA_EXCEPTION << "\nUnsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ss.str(); - } -} - -void GNADeviceHelper::checkGna2Status(Gna2Status status) { - if (!Gna2StatusIsSuccessful(status)) { - std::vector gna2StatusBuffer(1024); - const auto s = Gna2StatusGetMessage(status, gna2StatusBuffer.data(), gna2StatusBuffer.size()); - if (!Gna2StatusIsSuccessful(s)) - snprintf(gna2StatusBuffer.data(), gna2StatusBuffer.size(), "Gna2StatusGetMessage(%d) returned (%d)", - static_cast(status), static_cast(s)); - if (status == Gna2StatusDeviceIngoingCommunicationError || - status == Gna2StatusDeviceOutgoingCommunicationError) { - THROW_GNA_EXCEPTION << "Unsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ", consider updating the GNA driver"; - } - THROW_GNA_EXCEPTION << "Unsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data(); - } -} - -const std::map GNADeviceHelper::errorTypes = { - {Gna2ItemTypeNone, "Model context is not applicable or unnecessary"}, - {Gna2ItemTypeModelNumberOfOperations, "Gna2Model::NumberOfOperations"}, - {Gna2ItemTypeModelOperations, "Gna2Model::Operations array"}, - {Gna2ItemTypeOperationType, "Gna2Model::Operations[x]->Gna2Operation::Type"}, - {Gna2ItemTypeOperationOperands, "Gna2Model::Operations[x]->Gna2Operation::Operands array"}, - {Gna2ItemTypeOperationNumberOfOperands, "Gna2Model::Operations[x]->Gna2Operation::NumberOfOperands"}, - {Gna2ItemTypeOperationParameters, "Gna2Model::Operations[x]->Gna2Operation::Parameters array"}, - {Gna2ItemTypeOperationNumberOfParameters, "Gna2Model::Operations[x]->Gna2Operation::NumberOfParameters"}, - {Gna2ItemTypeOperandMode, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Mode"}, - {Gna2ItemTypeOperandLayout, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Layout"}, - {Gna2ItemTypeOperandType, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Type"}, - {Gna2ItemTypeOperandData, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Data"}, - {Gna2ItemTypeParameter, "Gna2Model::Operations[x]->Gna2Operation::Parameters[z]->Parameter, can be of type Gna2Shape, enumeration or integer"}, - {Gna2ItemTypeShapeNumberOfDimensions, "Gna2Model::Operations[x]->{Gna2Tensor}, Parameter}->Gna2Shape::NumberOfDimensions"}, - {Gna2ItemTypeShapeDimensions, "Gna2Model::Operations[x]->{Gna2Tensor}, Parameter}->Gna2Shape::Dimensions"}, - {Gna2ItemTypeInternal, "Internal model item, that is a derivative of other model parameters"} -}; - -const std::map GNADeviceHelper::errorReasons = { - { Gna2ErrorTypeNone, "No error detected"}, - { Gna2ErrorTypeNotTrue, "Item value was expected to be true"}, - { Gna2ErrorTypeNotFalse, "Item value was expected to be false"}, - { Gna2ErrorTypeNullNotAllowed, "Item value was expected to be not null"}, - { Gna2ErrorTypeNullRequired, "Item value was expected to be null"}, - { Gna2ErrorTypeBelowRange, "Item value was below supported range"}, - { Gna2ErrorTypeAboveRange, "Item value was above supported range"}, - { Gna2ErrorTypeNotEqual, "Item value was not equal supported one"}, - { Gna2ErrorTypeNotGtZero, "Item value was below zero"}, - { Gna2ErrorTypeNotZero, "Item value was not equal zero"}, - { Gna2ErrorTypeNotOne, "Item value was not equal one"}, - { Gna2ErrorTypeNotInSet, "Item value 
was not in supported set of values"}, - { Gna2ErrorTypeNotMultiplicity, "Item value was not multiple of supported value"}, - { Gna2ErrorTypeNotSuccess, "Item value was invalid, no detailed information available"}, - { Gna2ErrorTypeNotAligned, "Item value was not aligned to supported value"}, - { Gna2ErrorTypeArgumentMissing, "Some operation argument was not provided"}, - { Gna2ErrorTypeArgumentInvalid, "Given operation argument was invalid or unexpected"}, - { Gna2ErrorTypeRuntime, "Runtime error occurred during model creation"}, - { Gna2ErrorTypeOther, "Unable to determine the root cause of the issue"} -}; - -const std::map GNADeviceHelper::operationTypes = { - { Gna2OperationTypeNone, "None"}, - { Gna2OperationTypeConvolution, "Convolution"}, - { Gna2OperationTypeCopy, "Copy"}, - { Gna2OperationTypeFullyConnectedAffine, "FullyConnectedAffine"}, - { Gna2OperationTypeElementWiseAffine, "ElementWiseAffine"}, - { Gna2OperationTypeGmm, "GMM"}, - { Gna2OperationTypeRecurrent, "Recurrent"}, - { Gna2OperationTypeTransposition, "Transpose"}, - { Gna2OperationTypeThreshold, "Threshold"} -}; - -const std::map , const std::string> GNADeviceHelper::operandTypes = { - {{Gna2OperationTypeConvolution, 0}, "Input"}, - {{Gna2OperationTypeConvolution, 1}, "Output"}, - {{Gna2OperationTypeConvolution, 2}, "Filters"}, - {{Gna2OperationTypeConvolution, 3}, "Biases"}, - {{Gna2OperationTypeConvolution, 4}, "Activation"}, - {{Gna2OperationTypeCopy, 0}, "Input"}, - {{Gna2OperationTypeCopy, 1}, "Output"}, - {{Gna2OperationTypeFullyConnectedAffine, 0}, "Input"}, - {{Gna2OperationTypeFullyConnectedAffine, 1}, "Output"}, - {{Gna2OperationTypeFullyConnectedAffine, 2}, "Weights"}, - {{Gna2OperationTypeFullyConnectedAffine, 3}, "Biases"}, - {{Gna2OperationTypeFullyConnectedAffine, 4}, "Activation"}, - {{Gna2OperationTypeFullyConnectedAffine, 5}, "WeightScaleFactors"}, - {{Gna2OperationTypeElementWiseAffine, 0}, "Input"}, - {{Gna2OperationTypeElementWiseAffine, 1}, "Output"}, - {{Gna2OperationTypeElementWiseAffine, 2}, "Weights"}, - {{Gna2OperationTypeElementWiseAffine, 3}, "Biases"}, - {{Gna2OperationTypeElementWiseAffine, 4}, "Activation"}, - {{Gna2OperationTypeGmm, 0}, "Input"}, - {{Gna2OperationTypeGmm, 1}, "Output"}, - {{Gna2OperationTypeGmm, 2}, "Means"}, - {{Gna2OperationTypeGmm, 3}, "InverseCovariances"}, - {{Gna2OperationTypeGmm, 4}, "Constants"}, - {{Gna2OperationTypeRecurrent, 0}, "Input"}, - {{Gna2OperationTypeRecurrent, 1}, "Output"}, - {{Gna2OperationTypeRecurrent, 2}, "Weights"}, - {{Gna2OperationTypeRecurrent, 3}, "Biases"}, - {{Gna2OperationTypeRecurrent, 4}, "Activation"}, - {{Gna2OperationTypeTransposition, 0}, "Input"}, - {{Gna2OperationTypeTransposition, 1}, "Output"}, - {{Gna2OperationTypeThreshold, 0}, "Input"}, - {{Gna2OperationTypeThreshold, 1}, "Output"} -}; -#endif void GNADeviceHelper::wait(uint32_t reqId) { -#if GNA_LIB_VER == 2 - const auto status = Gna2RequestWait(reqId, GNA_TIMEOUT); - checkGna2Status(status); -#else if (isPerformanceMeasuring) { nGNAStatus = GNAWaitPerfRes(nGNAHandle, GNA_TIMEOUT, reqId, &nGNAPerfResults); + updateGnaPerfCounters(); } else { - nGNAStatus = GNAWait(nGNAHandle, GNA_TIMEOUT, reqId); + nGNAStatus = GNAWait(nGNAHandle, 1000000, reqId); } checkStatus(); -#endif - updateGnaPerfCounters(); } -#if GNA_LIB_VER == 1 GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices) { -#else -GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const uint32_t modelId) { -#endif 
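+    // dumpXnn requests a serialized copy of the network from the library; the returned header and blob are kept in DumpResult so the caller can write them out (e.g. for model export)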
DumpResult r; + intel_gna_status_t gna_status; -#if GNA_LIB_VER == 1 if (!pNeuralNetwork) { - THROW_GNA_EXCEPTION << "GNADumpXnn got invalid NeuralNetwork parameter \n"; + THROW_GNA_EXCEPTION<< "GNADumpXnn got invalid NeuralNetwork parameter \n"; } r.model.reset(GNADumpXnn(pNeuralNetwork, pActiveIndices, nActiveIndices, &r.header, &nGNAStatus, - [](size_t count)-> void* {return new char[count]();}), - [](void * ptr) {::operator delete[](ptr);}); + [](size_t count)-> void* {return ::operator new(count);}), + [](void * ptr) {::operator delete(ptr);}); + checkStatus(); -#else - r.model.reset( - ExportSueLegacyUsingGnaApi2(modelId, &r.header), - gnaUserFree); -#endif if (r.model == nullptr) { THROW_GNA_EXCEPTION << "GNADumpXnn returned nullptr"; @@ -310,88 +71,31 @@ GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const uint32_t modelId) { return r; } -#if GNA_LIB_VER == 2 - -void GNADeviceHelper::dumpXnnForDeviceVersion( - const uint32_t modelId, - std::ostream & outStream, - const Gna2DeviceVersion targetDeviceVersion) { - - Gna2ModelSueCreekHeader sueHeader; - auto ptr = ExportSueLegacyUsingGnaApi2(modelId, &sueHeader); - gnaUserFree(ptr); - - ExportGnaDescriptorPartiallyFilled(sueHeader.NumberOfLayers, outStream); - - ExportLdForDeviceVersion(modelId, outStream, targetDeviceVersion); - if (dumpXNNROPtr == nullptr) { - THROW_GNA_EXCEPTION << "Bad RO pointer (nullptr)"; - } - outStream.write(static_cast(dumpXNNROPtr), dumpXNNROSize); - - // TODO: GNA2: remove - outStream.write("Gna2ModelSueCreekHeader", 24); - outStream.write(reinterpret_cast(&sueHeader), sizeof(sueHeader)); -} -#endif - -#if GNA_LIB_VER == 1 void GNADeviceHelper::checkStatus() const { if ((nGNAStatus != GNA_NOERROR) && (nGNAStatus != GNA_SSATURATE)) { THROW_GNA_EXCEPTION << "Bad GNA status " << nGNAStatus << ", " << GNAStatusName[nGNAStatus]; } } -#endif void GNADeviceHelper::open(uint8_t n_threads) { -#if GNA_LIB_VER == 1 nGNAHandle = GNADeviceOpenSetThreads(&nGNAStatus, n_threads); + checkStatus(); -#else - auto status = Gna2DeviceGetVersion(nGnaDeviceIndex, &detectedGnaDevVersion); - checkGna2Status(status); - if (gna2AccelerationMode == Gna2AccelerationModeHardware && - detectedGnaDevVersion == Gna2DeviceVersionSoftwareEmulation) { - gnalog() << "GNA Device not detected, consider using other mode of acceleration"; - } - status = Gna2DeviceOpen(nGnaDeviceIndex); - checkGna2Status(status); - // TODO: GNA2: uncomment when scratchpad repaired - // status = Gna2DeviceSetNumberOfThreads(nGnaDeviceIndex, n_threads); - // checkGna2Status(status); -#endif - deviceOpened = true; } void GNADeviceHelper::close() { -#if GNA_LIB_VER == 1 GNADeviceClose(nGNAHandle); nGNAHandle = 0; -#else - const auto status = Gna2DeviceClose(nGnaDeviceIndex); - checkGna2Status(status); -#endif - deviceOpened = false; } void GNADeviceHelper::setOMPThreads(uint8_t const n_threads) { -#if GNA_LIB_VER == 1 gmmSetThreads(n_threads); -#else - const auto status = Gna2DeviceSetNumberOfThreads(nGnaDeviceIndex, n_threads); - checkGna2Status(status); -#endif } void GNADeviceHelper::updateGnaPerfCounters() { - if (!isPerformanceMeasuring) - return; -#if GNA_LIB_VER == 2 - instrumentationTotal[0] = instrumentationResults[0]; - instrumentationTotal[1] = instrumentationResults[1]; -#else nGNAPerfResultsTotal.hw.stall = nGNAPerfResults.hw.stall; nGNAPerfResultsTotal.hw.total = nGNAPerfResults.hw.total; + nGNAPerfResultsTotal.lib.submit = nGNAPerfResults.lib.submit; nGNAPerfResultsTotal.lib.preprocess = nGNAPerfResults.lib.preprocess; 
nGNAPerfResultsTotal.lib.process = nGNAPerfResults.lib.process; @@ -406,26 +110,16 @@ void GNADeviceHelper::updateGnaPerfCounters() { nGNAPerfResultsTotal.drv.startHW = nGNAPerfResults.drv.startHW; nGNAPerfResultsTotal.drv.scoreHW = nGNAPerfResults.drv.scoreHW; nGNAPerfResultsTotal.drv.intProc = nGNAPerfResults.drv.intProc; -#endif } void GNADeviceHelper::getGnaPerfCounters(std::map& retPerfCounters) { InferenceEngine::InferenceEngineProfileInfo info; info.status = InferenceEngine::InferenceEngineProfileInfo::EXECUTED; - info.cpu_uSec = 0; - info.execution_index = 0; - info.realTime_uSec = 0; + // Hardware -#if GNA_LIB_VER == 1 info.realTime_uSec = nGNAPerfResultsTotal.hw.total; -#else - info.realTime_uSec = instrumentationTotal[0]; -#endif retPerfCounters["1.1 Total scoring time in HW"] = info; -#if GNA_LIB_VER == 1 + info.realTime_uSec = nGNAPerfResultsTotal.hw.stall; -#else - info.realTime_uSec = instrumentationTotal[1]; -#endif retPerfCounters["1.2 Stall scoring time in HW"] = info; } diff --git a/inference-engine/src/gna_plugin/gna_device.hpp b/inference-engine/src/gna_plugin/gna_device.hpp index f122445907a22a..782821137dbbac 100644 --- a/inference-engine/src/gna_plugin/gna_device.hpp +++ b/inference-engine/src/gna_plugin/gna_device.hpp @@ -1,82 +1,38 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include +#include "gna-api-dumper.h" +#include "gna-api-instrumentation.h" +#include "ie_common.h" #include #include #include #include -#include - -#if GNA_LIB_VER == 2 -#include "gna2-common-api.h" -#include "gna2-inference-api.h" -#include "gna2-instrumentation-api.h" - -#include "gna2-memory-api.h" -#include "gna2-model-api.h" -#include "gna2-model-suecreek-header.h" -#else -#include -#include "gna-api-dumper.h" -#include "gna-api-instrumentation.h" -#endif - - /** * holds gna - style handle in RAII way */ class GNADeviceHelper { -#if GNA_LIB_VER == 1 intel_gna_status_t nGNAStatus = GNA_NOERROR; intel_gna_handle_t nGNAHandle = 0; intel_gna_proc_t nGNAProcType = GNA_AUTO; intel_gna_perf_t nGNAPerfResults; intel_gna_perf_t nGNAPerfResultsTotal; -#else - uint32_t nGnaDeviceIndex = 0; - Gna2AccelerationMode gna2AccelerationMode = Gna2AccelerationModeAuto; - Gna2DeviceVersion gna2HwConsistency = Gna2DeviceVersionSoftwareEmulation; - Gna2DeviceVersion detectedGnaDevVersion = Gna2DeviceVersionSoftwareEmulation; - - static const uint32_t TotalGna2InstrumentationPoints = 2; - Gna2InstrumentationPoint gna2InstrumentationPoints[TotalGna2InstrumentationPoints] = { - Gna2InstrumentationPointHwTotalCycles, - Gna2InstrumentationPointHwStallCycles }; - - uint64_t instrumentationResults[TotalGna2InstrumentationPoints] = {}; - uint64_t instrumentationTotal[TotalGna2InstrumentationPoints] = {}; - uint32_t instrumentationConfigId = 0; - -#define MAX_TIMEOUT 500000 -#endif const uint32_t GNA_TIMEOUT = MAX_TIMEOUT; - bool isPerformanceMeasuring = false; - bool deviceOpened = false; -public: -#if GNA_LIB_VER == 1 + bool isPerformanceMeasuring; + + public: explicit GNADeviceHelper(intel_gna_proc_t proc_type = GNA_AUTO, uint8_t lib_async_n_threads = 1, bool use_openmp = false, bool isPerformanceMeasuring = false) : nGNAProcType(proc_type), isPerformanceMeasuring(isPerformanceMeasuring) { -#else - explicit GNADeviceHelper(Gna2AccelerationMode gna2accMode = Gna2AccelerationModeAuto, - Gna2DeviceVersion gna2HwConsistency = Gna2DeviceVersionSoftwareEmulation, - uint8_t lib_async_n_threads = 1, - bool use_openmp = 
false, - bool isPerformanceMeasuring = false) : - gna2AccelerationMode(gna2accMode), - gna2HwConsistency(gna2HwConsistency), - isPerformanceMeasuring(isPerformanceMeasuring) { -#endif - open(lib_async_n_threads); initGnaPerfCounters(); + open(lib_async_n_threads); if (use_openmp) { uint8_t num_cores = std::thread::hardware_concurrency(); @@ -84,17 +40,12 @@ class GNADeviceHelper { } } - GNADeviceHelper(const GNADeviceHelper&) = delete; - GNADeviceHelper& operator= (const GNADeviceHelper&) = delete; ~GNADeviceHelper() { - if (deviceOpened) { - close(); - } + close(); } uint8_t *alloc(uint32_t size_requested, uint32_t *size_granted); -#if GNA_LIB_VER == 1 void propagateSync(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices); @@ -102,80 +53,39 @@ class GNADeviceHelper { uint32_t propagate(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices); -#else - void setUpActiveList(unsigned req_config_id, uint32_t layerIndex, uint32_t* ptr_active_indices, uint32_t num_active_indices); - void propagateSync(const uint32_t requestConfigId); - uint32_t propagate(const uint32_t requestConfigId); -#if GNA_LIB_VER == 2 - uint32_t createModel(const Gna2Model& gnaModel) const; -#else - uint32_t createModel(const intel_nnet_type_t& intel_nnet_type); -#endif - void releseModel(const uint32_t model_id); - uint32_t createRequestConfig(const uint32_t model_id); - bool hasGnaHw() const { - return Gna2DeviceVersionSoftwareEmulation != detectedGnaDevVersion; - } - static void checkGna2Status(Gna2Status status); - static void checkGna2Status(Gna2Status status, const Gna2Model& gnaModel); -#endif + void wait(uint32_t id); + struct DumpResult { -#if GNA_LIB_VER == 2 - Gna2ModelSueCreekHeader header; -#else intel_gna_model_header header; -#endif std::shared_ptr model; }; - const void * dumpXNNROPtr = nullptr; - uint32_t dumpXNNROSize = 0; - -#if GNA_LIB_VER == 1 DumpResult dumpXnn(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices); - intel_gna_status_t getGNAStatus() const noexcept { - return nGNAStatus; - } -#else - DumpResult dumpXnn(const uint32_t modelId); - void dumpXnnForDeviceVersion(const uint32_t modelId, - std::ostream & outStream, - Gna2DeviceVersion targetDeviceVersion); -#endif - void free(void * ptr); + void free() { + GNAFree(nGNAHandle); + } void updateGnaPerfCounters(); void getGnaPerfCounters(std::map& retPerfCounters); + private: void open(uint8_t const n_threads); void close(); -#if GNA_LIB_VER == 1 + void checkStatus() const; -#else - static const std::map errorTypes; - static const std::map errorReasons; - static const std::map operationTypes; - static const std::map , const std::string > operandTypes; -#endif + void setOMPThreads(uint8_t const n_threads); void initGnaPerfCounters() { -#if GNA_LIB_VER == 1 nGNAPerfResults = {{0, 0, 0, 0, 0, 0, 0}, {0, 0}, {0, 0, 0}, {0, 0}}; nGNAPerfResultsTotal = {{0, 0, 0, 0, 0, 0, 0}, {0, 0}, {0, 0, 0}, {0, 0}}; -#else - const auto status = Gna2InstrumentationConfigCreate(TotalGna2InstrumentationPoints, - gna2InstrumentationPoints, - instrumentationResults, - &instrumentationConfigId); - checkGna2Status(status); -#endif } -}; // NOLINT +}; + diff --git a/inference-engine/src/gna_plugin/gna_executable_network.hpp b/inference-engine/src/gna_plugin/gna_executable_network.hpp index 90f01ff17cbfef..1230624fb15530 100644 --- a/inference-engine/src/gna_plugin/gna_executable_network.hpp +++ 
b/inference-engine/src/gna_plugin/gna_executable_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,11 +8,10 @@ #include #include -#include #include #include "gna_infer_request.hpp" #include "gna_plugin.hpp" -#include +#include #include namespace GNAPluginNS { @@ -21,26 +20,16 @@ class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafe std::shared_ptr plg; public: - GNAExecutableNetwork(const std::string &aotFileName, std::shared_ptr plg) - : plg(plg) { + GNAExecutableNetwork(const std::string &aotFileName, const std::map &config) : + plg(std::make_shared(config)) { plg->ImportNetwork(aotFileName); _networkInputs = plg->GetInputs(); _networkOutputs = plg->GetOutputs(); } - GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, std::shared_ptr plg) - : plg(plg) { - InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::I64, InferenceEngine::Precision::I32); - InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::U64, InferenceEngine::Precision::I32); - plg->LoadNetwork(network); - } - - GNAExecutableNetwork(const std::string &aotFileName, const std::map &config) - : GNAExecutableNetwork(aotFileName, std::make_shared(config)) { - } - GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map &config) - : GNAExecutableNetwork(network, std::make_shared(config)) { + : plg(std::make_shared(config)) { + plg->LoadNetwork(network); } InferenceEngine::AsyncInferRequestInternal::Ptr @@ -60,24 +49,5 @@ class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafe void Export(const std::string &modelFileName) override { plg->Export(modelFileName); } - - using ExecutableNetworkInternal::Export; - - void ExportImpl(std::ostream&) override { - THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str; - } - - void GetConfig(const std::string &name, - InferenceEngine::Parameter &result, - InferenceEngine::ResponseDesc* /*resp*/) const override { - result = plg->GetConfig(name, {}); - } - - void GetMetric(const std::string& name, - InferenceEngine::Parameter& result, - InferenceEngine::ResponseDesc* /* resp */) const override { - result = plg->GetMetric(name, {}); - } }; - } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_helper.cpp b/inference-engine/src/gna_plugin/gna_helper.cpp index 729ae1b4fffdff..604828c339dd0e 100644 --- a/inference-engine/src/gna_plugin/gna_helper.cpp +++ b/inference-engine/src/gna_plugin/gna_helper.cpp @@ -1,9 +1,10 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -// gna_helper.cpp : various GNA-related utility functions -// +#include "lstm.hpp" + +#define USING_GCC #define PROFILE #include @@ -12,10 +13,7 @@ #include #include #include -#include -#include "gna_plugin_log.hpp" - -#include "gna_lib_ver_selector.hpp" +#include "gna-api.h" #ifndef WIN32 #include @@ -77,10 +75,8 @@ void profilerRtcStartAccumulate(intel_gna_profiler_rtc *p) { // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &p->start); } void profilerRtcStopAccumulate(intel_gna_profiler_rtc *p) { + timespec diff; if (nullptr == p) return; -// TODO: consider removing dead code from this file - -// timespec diff; // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &p->stop); // if ((p->stop.tv_nsec - p->start.tv_nsec)<0) { // diff.tv_sec = p->stop.tv_sec - p->start.tv_sec - 1; @@ -118,7 +114,7 @@ void PrintMatrixInt32(char 
*ptr_name, int32_t *ptr_matrix, int num_rows, int num } void PrintMatrixFloat32(char *ptr_name, float *ptr_matrix, int num_rows, int num_cols, int lda) { -#if (defined _WIN32 || defined _WIN64) && (_MSC_VER < 1900) +#if (_WIN32 || _WIN64) && (_MSC_VER < 1900) _set_output_format(_TWO_DIGIT_EXPONENT); #endif printf("%s: %dx%d lda %d\n", ptr_name, num_rows, num_cols, lda); @@ -228,7 +224,7 @@ uint32_t BufferOffsetFromAddress(std::vector &vBuffer, vo } std::string LayerName(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; std::string sKind; if (nKind == INTEL_AFFINE) { sKind = "affine"; @@ -246,7 +242,7 @@ std::string LayerName(intel_nnet_layer_t *pLayer) { } uint32_t NumInputs(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; uint32_t nInputs; if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) { nInputs = pLayer->nInputRows; @@ -262,7 +258,7 @@ uint32_t NumInputs(intel_nnet_layer_t *pLayer) { } uint32_t NumOutputs(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; uint32_t nOutputs; if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) { nOutputs = pLayer->nOutputRows; @@ -278,7 +274,7 @@ uint32_t NumOutputs(intel_nnet_layer_t *pLayer) { } uint32_t NumGroupSize(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; uint32_t nGroupSize; if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) { nGroupSize = pLayer->nOutputColumns; @@ -292,3 +288,162 @@ uint32_t NumGroupSize(intel_nnet_layer_t *pLayer) { } return (nGroupSize); } + +void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t *pNeuralNetwork) { + std::string sXmlFileName; + sXmlFileName.append(ptr_name); + sXmlFileName.append("/model.xml"); + std::ofstream xml_file(sXmlFileName.c_str(), std::ios::out); + if (xml_file.good()) { + std::vector vBuffer; + // find all the memory regions in the network + for (uint32_t layer = 0; layer < pNeuralNetwork->nLayers; layer++) { + intel_nnet_layer_t *pLayer = &pNeuralNetwork->pLayers[layer]; + intel_affine_layer_t *pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + uint32_t nPWLSegments = 0; + uint32_t nWeightWidth = 0; + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pInputs", + pLayer->pInputs, + pLayer->nBytesPerInput * pLayer->nInputColumns * pLayer->nInputRows); + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pOutputs", + pLayer->pOutputs, + pLayer->nBytesPerOutput * pLayer->nOutputColumns * pLayer->nOutputRows); + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pOutputsIntermediate", + pLayer->pOutputsIntermediate, + pLayer->nBytesPerIntermediateOutput * pLayer->nOutputColumns * pLayer->nOutputRows); + if ((pLayer->nLayerKind == INTEL_AFFINE) || (pLayer->nLayerKind == INTEL_AFFINE_DIAGONAL)) { + uint32_t nBytesWeights = + (pLayer->nLayerKind == INTEL_AFFINE) ? 
pAffineLayer->affine.nBytesPerWeight * pLayer->nInputRows + * pLayer->nOutputRows : pAffineLayer->affine.nBytesPerWeight * pLayer->nOutputRows; + nPWLSegments = pAffineLayer->pwl.nSegments; + nWeightWidth = pAffineLayer->affine.nBytesPerWeight; + AddBufferEntry(vBuffer, LayerName(pLayer), "pWeights", pAffineLayer->affine.pWeights, nBytesWeights); + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pBiases", + pAffineLayer->affine.pBiases, + pAffineLayer->affine.nBytesPerBias * pLayer->nOutputRows); + if (nPWLSegments > 0) { + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pSegments", + pAffineLayer->pwl.pSegments, + sizeof(intel_pwl_segment_t) * nPWLSegments); + } + } else if (pLayer->nLayerKind == INTEL_INTERLEAVE) { + } else if (pLayer->nLayerKind == INTEL_DEINTERLEAVE) { + } else { + fprintf(stderr, "Error: layer kind not yet supported in ExportGnaNetworkAndrzej()!\n"); + exit(EXIT_FAILURE); + } + } + // write XML network description + xml_file << "\n"; + xml_file << "\n\n\n"; + xml_file << " \n"; + xml_file << " nGroup << "\">\n"; + for (uint32_t layer = 0; layer < pNeuralNetwork->nLayers; layer++) { + intel_nnet_layer_t *pLayer = &pNeuralNetwork->pLayers[layer]; + intel_affine_layer_t *pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + // below is hard-coded for the Google LSTM model -- it is only for debugging + std::string sClass = (layer < pNeuralNetwork->nLayers - 1) ? "LSTM_" : "DNN_"; + std::string sName; + uint32_t nGoogleLayer; + if (pNeuralNetwork->nGroup == 1) { + sName = (layer < pNeuralNetwork->nLayers - 1) ? intel_lstm_projected_layer_name[layer % NUM_LSTM_LAYERS] + : "final affine layer"; + nGoogleLayer = layer / NUM_LSTM_LAYERS; + } else if (pNeuralNetwork->nGroup == 4) { + sName = (layer < pNeuralNetwork->nLayers - 1) ? intel_lstm_projected_layer_g4_name[layer + % NUM_LSTM_G4_LAYERS] : "final affine layer"; + nGoogleLayer = layer / NUM_LSTM_G4_LAYERS; + } else { + sName = "affine"; sName + std::to_string(layer); + nGoogleLayer = layer; + // fprintf(stderr, "Error: unsupported grouping factor in ExportGnaNetworkAndrzej()!\n"); + // exit(EXIT_FAILURE); + } + xml_file << " \n"; + xml_file << " \n"; + xml_file << " " << NumInputs(pLayer) << "\n"; + xml_file << " pInputs) + << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pLayer->pInputs) << "\n"; + xml_file << " \n"; + xml_file << " \n"; + xml_file << " " << NumOutputs(pLayer) << "\n"; + xml_file << " pOutputs) + << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pLayer->pOutputs) << "\n"; + xml_file << " \n"; + if (pLayer->pOutputsIntermediate != NULL) { + xml_file << " \n"; + xml_file << " " << NumOutputs(pLayer) << "\n"; + xml_file << " pOutputsIntermediate) << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pLayer->pOutputsIntermediate) << "\n"; + xml_file << " \n"; + } + if ((pLayer->nLayerKind == INTEL_AFFINE) || (pLayer->nLayerKind == INTEL_AFFINE_DIAGONAL)) { + xml_file << " " << BufferNameFromAddress(vBuffer, pAffineLayer->affine.pWeights) + << "\n"; + xml_file << " affine.pBiases) << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pAffineLayer->affine.pBiases) << "\n"; + if (pAffineLayer->pwl.nSegments > 0) { + xml_file << " " << BufferNameFromAddress(vBuffer, pAffineLayer->pwl.pSegments) + << "\n"; + } + } + xml_file << " \n\n"; + } + xml_file << " \n\n"; + xml_file.flush(); + + // write buffer list to XML and create data files + xml_file << " \n"; + for (uint32_t i = 0; i < vBuffer.size(); i++) { + std::string sName = ptr_name; + sName.append("/"); + 
sName.append(BufferNameFromAddress(vBuffer, vBuffer.at(i).pAddress)); + bool found = false; + for (uint32_t j = 0; j < i; j++) { + std::string sPrevName = BufferNameFromAddress(vBuffer, vBuffer.at(j).pAddress); + if (sPrevName.compare(sName) == 0) found = true; + } + if (!found) { + xml_file << " \n"; + xml_file << " " << sName << "\n"; + if (sName.compare(0, 4, "buf_") == 0) { + xml_file << " " << vBuffer.at(i).nBytes << "\n"; + } else { + std::string sFileName; + sFileName.append(sName); + sFileName.append(".dat"); + xml_file << " " << sFileName << "\n"; + std::ofstream data_file(sFileName.c_str(), std::ios::binary); + data_file.write(reinterpret_cast(vBuffer.at(i).pAddress), vBuffer.at(i).nBytes); + data_file.close(); + } + xml_file << " \n"; + } + } + xml_file << " \n"; + xml_file << "\n"; + xml_file << " \n"; + xml_file << " 65536\n"; + xml_file << " \n"; + xml_file << "\n"; + xml_file.close(); + } else { + fprintf(stderr, "Failed to open %s for writing!\n", ptr_name); + } +} diff --git a/inference-engine/src/gna_plugin/gna_infer_request.hpp b/inference-engine/src/gna_plugin/gna_infer_request.hpp index 17688f4688a5ce..ba8e99f7920237 100644 --- a/inference-engine/src/gna_plugin/gna_infer_request.hpp +++ b/inference-engine/src/gna_plugin/gna_infer_request.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,15 +31,8 @@ class GNAInferRequest : public InferenceEngine::AsyncInferRequestInternal { } // copy inputs blobs since we need to have them in separate address space to allow simultaneous infer requests - for (auto output : _networkOutputs) { - _outputs[output.first] = - plg->GetOutputBlob(output.first, output.second->getTensorDesc().getPrecision()); - } - - for (auto input : _networkInputs) { - _inputs[input.first] = - plg->GetInputBlob(input.first, input.second->getTensorDesc().getPrecision()); - } + _outputs[_networkOutputs.begin()->first] = plg->GetOutputBlob(networkOutputs.begin()->second->getPrecision()); + _inputs[_networkInputs.begin()->first] = plg->GetInputBlob(networkInputs.begin()->second->getInputPrecision()); } /** * @brief Infers specified input(s) in synchronous mode @@ -72,12 +65,7 @@ class GNAInferRequest : public InferenceEngine::AsyncInferRequestInternal { } InferenceEngine::StatusCode Wait(int64_t millis_timeout) override { - if (inferRequestIdx == -1) { - return InferenceEngine::INFER_NOT_STARTED; - } else if (millis_timeout < -1) { - THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str; - } - + if (inferRequestIdx == -1) return InferenceEngine::INFER_NOT_STARTED; plg->Wait(inferRequestIdx); return InferenceEngine::OK; } diff --git a/inference-engine/src/gna_plugin/gna_layer_info.hpp b/inference-engine/src/gna_plugin/gna_layer_info.hpp new file mode 100644 index 00000000000000..7e6da438ef23a8 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_layer_info.hpp @@ -0,0 +1,206 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "inference_engine.hpp" +#include "details/caseless.hpp" +#include "ie_algorithm.hpp" + + +namespace GNAPluginNS { + +/** + * @brief detecting of const pointer for dynamic cast operations + * @tparam T + */ +template +struct is_const_pointer : public std::false_type{ +}; + +template +struct is_const_pointer : public std::true_type{ +}; + + +/** + * similar to type traits determined in standard library this trait provides details per layer type, with some attributes 
specific for GNA + * we don't need to have compile time performance for this yet + */ +class LayerInfo { + InferenceEngine::CNNLayer * layer; + +#define IS_VALID() if (nullptr == layer) return false + + public: + explicit LayerInfo(InferenceEngine::CNNLayer & layer) + : LayerInfo(&layer) { + } + explicit LayerInfo(const InferenceEngine::CNNLayerPtr & layer) + : LayerInfo(layer.get()) { + } + explicit LayerInfo(InferenceEngine::CNNLayer * layer) + : layer(layer) { + } + bool has16BOutput() const noexcept { + IS_VALID(); + static InferenceEngine::details::caseless_set layersWith16BOutputs = {"memory", "input", "split", "slice", "concat", "copy"}; + return layersWith16BOutputs.find(layer->type) != layersWith16BOutputs.end() || + isActivation() || + (isCrop() && !isCropAffined()); + } + bool has32BOutput() const noexcept { + IS_VALID(); + static InferenceEngine::details::caseless_set layersWith32BOutputs = + {"FullyConnected", "InnerProduct", "Eltwise", "ScaleShift", "Convolution", "Pooling"}; + return (layersWith32BOutputs.find(layer->type) != layersWith32BOutputs.end()) || + (isCrop() && isCropAffined()); + } + static bool isBatchSizeConstrained(const std::string name) { + static InferenceEngine::details::caseless_set layersWithConstrains = {"memory", "convolution"}; + return layersWithConstrains.find(name) != layersWithConstrains.end(); + } + bool isActivation() const noexcept { + IS_VALID(); + static InferenceEngine::details::caseless_set activations = {"clamp", "sigmoid", "identity", "relu", "leakyrelu", "tanh", "prelu"}; + return activations.find(layer->type) != activations.end(); + } + bool isRelu() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "relu"); + } + bool isConvolution() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "convolution"); + } + bool isPower() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "power"); + } + bool has32BInput() const noexcept { + IS_VALID(); + return isActivation() || isPooling(); + } + bool isInput() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "input"); + } + bool isEltwise() const noexcept { + IS_VALID(); + return nullptr != as(); + } + bool isEltwiseSum() const noexcept { + IS_VALID(); + if (!isEltwise()) return false; + return dynamic_cast(layer)->_operation == + InferenceEngine::EltwiseLayer::Sum; + } + bool isEltwiseMul() const noexcept { + IS_VALID(); + if (!isEltwise()) return false; + return dynamic_cast(layer)->_operation == + InferenceEngine::EltwiseLayer::Prod; + } + bool isIdentity() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "identity"); + } + bool isFullyConnected() const noexcept { + return InferenceEngine::details::CaselessEq()(layer->type, "FullyConnected") || + InferenceEngine::details::CaselessEq()(layer->type, "InnerProduct"); + } + bool isConvolutional() const noexcept { + return InferenceEngine::details::CaselessEq()(layer->type, "Convolution"); + } + bool isSplit() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "split"); + } + bool isSlice() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "slice"); + } + bool isConcat() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "concat"); + } + bool isReshape() const noexcept { + IS_VALID(); + return 
InferenceEngine::details::CaselessEq()(layer->type, "reshape"); + } + bool isPermute() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "permute"); + } + bool isPooling() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "Pooling"); + } + bool isMaxPooling() const noexcept { + IS_VALID(); + if (!isPooling()) return false; + return as()->_type == InferenceEngine::PoolingLayer::MAX; + } + bool isMemory() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "memory"); + } + bool isCrop() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "crop"); + } + bool isCropAffined() const noexcept { + auto cropLayer = dynamic_cast (layer); + size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); + return (ALIGN(cropOffset, 8) != cropOffset); + } + bool isCopy() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "copy"); + } + size_t paddingSize() const noexcept { + static InferenceEngine::details::caseless_set layersWithPossiblePadding = {"FullyConnected", + "InnerProduct", + "Pooling", + "Convolution"}; + if (layersWithPossiblePadding.find(layer->type) != layersWithPossiblePadding.end()) { + size_t size_without_padding = 0; + auto inputs = layer->insData.begin()->lock(); + if (inputs) { + size_without_padding = InferenceEngine::details::product(begin(inputs->dims), + end(inputs->dims)); + } + return ALIGN(size_without_padding, 8) - size_without_padding; + } + return 0; + } + template + typename std::enable_if::value, T>::type as() noexcept { + return dynamic_cast(layer); + } + template + typename std::enable_if::value, T>::type as() const noexcept { + return dynamic_cast(layer); + } + operator InferenceEngine::CNNLayer *() noexcept { + return layer; + } + operator const InferenceEngine::CNNLayer *() const noexcept { + return layer; + } + operator InferenceEngine::CNNLayerPtr () const noexcept { + return std::shared_ptr(layer, [] (InferenceEngine::CNNLayer * p) {}); + } + + #undef IS_VALID +}; + +inline std::ostream & operator <<(std::ostream &os, const LayerInfo & info) { + os << static_cast(info)->name; + return os; +} + +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_mem_requests.hpp b/inference-engine/src/gna_plugin/gna_mem_requests.hpp new file mode 100644 index 00000000000000..24163dc4f5a16d --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_mem_requests.hpp @@ -0,0 +1,175 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include +#include + +namespace GNAPluginNS { + +enum rType { + REQUEST_STORE, + REQUEST_ALLOCATE, + REQUEST_BIND, + REQUEST_INITIALIZER, +}; +/** + * @brief region of firmware data + */ +enum rRegion { + REGION_RO, + REGION_RW, + REGION_AUTO, +}; + +struct MemRequest { + rType _type; + rRegion _region; + void *_ptr_out; + const void *_ptr_in = nullptr; + std::function _initializer; + // holds arbitrary value + std::vector _data; + uint8_t _element_size; + size_t _num_elements; + size_t _alignment; + size_t _offset; + // expansion in bytes due to large depended layers + size_t _padding = 0; + MemRequest(rRegion region, + rType req, + void *ptr_out, + const void *ptr_in, + uint8_t element_size = 0, + size_t num_elements = 0, + size_t alignment = 1, + size_t offset = 0) : _region(region), + _type(req), + _ptr_out(ptr_out), + _ptr_in(ptr_in), + 
_element_size(element_size), + _num_elements(num_elements), + _alignment(alignment), + _offset(offset) {} + + /** + * Store value only request + * @tparam T + * @param req + * @param ptr_out + * @param element + * @param num_elements + * @param alignment + */ + template + MemRequest(rRegion region, + void *ptr_out, + T element, + size_t num_elements, + size_t alignment = 1) : _region(region), + _type(REQUEST_STORE), + _ptr_out(ptr_out), + _element_size(sizeof(T)), + _num_elements(num_elements), + _alignment(alignment) { + _data.resize(sizeof(T)); + std::copy(reinterpret_cast(&element), reinterpret_cast(&element) + sizeof(T), _data.begin()); + } +/** + * Store initializer request + * @param req + * @param ptr_out + * @param element + * @param num_elements + * @param alignment + */ + MemRequest(rRegion region, + void *ptr_out, + size_t regionSize, + std::function initializer, + size_t alignment = 1) : _region(region), + _type(REQUEST_INITIALIZER), + _ptr_out(ptr_out), + _element_size(1), + _num_elements(regionSize), + _alignment(alignment), + _initializer(initializer) { + } +}; + +/** + * Adapter for requests submission and actual request queue + */ +class GNAMemRequestsQueue { + public: + virtual ~GNAMemRequestsQueue() {} + + /** + * @brief register an initializer to access memory once it is actually allocated + * @param ptr_out + * @param ptr_in + * @param num_bytes + * @param alignment + */ + void push_initializer(void *ptr_out, size_t num_bytes, std::function initializer, size_t alignment = 1) { + futureHeap().push_back({regionType(), ptr_out, num_bytes, initializer, alignment}); + } + + void push_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) { + futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, ptr_in, 1, num_bytes, alignment}); + } + + /** + * copy input to intermediate buffer + * @param ptr_out + * @param ptr_in + * @param num_bytes + */ + void push_local_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) { + localStorage().emplace_back(reinterpret_cast(ptr_in), + reinterpret_cast(ptr_in) + num_bytes); + futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, &localStorage().back().front(), 1, num_bytes, alignment}); + } + + /** + * + * @param ptr_out + * @param num_bytes + */ + void reserve_ptr(void *ptr_out, size_t num_bytes) { + futureHeap().push_back({regionType(), REQUEST_ALLOCATE, ptr_out, nullptr, 1, num_bytes}); + } + + /** + * + * @param source + * @param dest - source is bound to dest pointer after allocation + * @param offset - offset in bytes in source that will be set in dest + * @param num_bytes - bind can request a bigger buffer than originally allocated via reserve(), + * if that happens - reserved request parameters will be updated before committing memory + */ + void bind_ptr(void *source, const void *dest, size_t offset = 0, size_t num_bytes = 0) { + futureHeap().push_back({regionType(), REQUEST_BIND, source, dest, 1, num_bytes, 1, offset}); + } + /** + * @brief allocates a buffer and sets all its values to T value + */ + template + void push_value(void *ptr_out, T value, size_t num_elements, size_t alignment = 1) { + futureHeap().push_back({regionType(), ptr_out, value, num_elements, alignment}); + } + + /** + * @brief interface for actual queue storage + */ + virtual rRegion regionType() const = 0; + virtual std::vector & futureHeap() = 0; + virtual std::list> &localStorage() = 0; +}; + + + +} // namespace GNAPluginNS \ No newline at end of file diff --git
a/inference-engine/src/gna_plugin/gna_memory.hpp b/inference-engine/src/gna_plugin/gna_memory.hpp new file mode 100644 index 00000000000000..d1c96506bc036d --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_memory.hpp @@ -0,0 +1,227 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "gna_mem_requests.hpp" +#include +#include +#include +#include +#include + +/** + * Pads memory size to given number of Bytes + * + * Please always use this padding macro for consistency + * + * @memSize size (in bytes) of memory to be padded + * @align number of bytes to pad + * @return memory size (int bytes) padded to given value + */ +#ifndef ALIGN +# define ALIGN(memSize, pad) (static_cast(((memSize) + pad -1) / pad) * pad) +#endif + +namespace GNAPluginNS { + + + +/** + * @brief encapsulate various request to allocate GNA specific memory, + * in order to issue single allocation call and configure actual pointers in requests + * @tparam Allocator - a GNAAllocator in case of actual HW offloads + */ +template> +class GNAMemory : public GNAMemRequestsQueue { + std::vector _future_heap; + std::list> _local_storage; + size_t _total = 0; + size_t _rw_section_size = 0; + size_t _ro_section_size = 0; + Allocator _allocator; + std::shared_ptr heap; + size_t _page_alignment = 1; + + class GNAMemRequestsReadOnlyQueue : public GNAMemRequestsQueue { + std::reference_wrapper _that; + public: + explicit GNAMemRequestsReadOnlyQueue(GNAMemory & that) : _that(that) { + } + rRegion regionType() const override { + return REGION_RO; + }; + std::vector & futureHeap() override { + return _that.get().futureHeap(); + } + std::list> &localStorage() override { + return _that.get().localStorage(); + } + }; + + GNAMemRequestsReadOnlyQueue readOnlyFrontEnd; + + public: + explicit GNAMemory(size_t pageAlignment = 1) + : readOnlyFrontEnd(*this), _page_alignment(pageAlignment) {} + + explicit GNAMemory(const Allocator &a, size_t pageAlignment = 1) + : _allocator(a), readOnlyFrontEnd(*this), _page_alignment(pageAlignment) {} + + GNAMemRequestsQueue & readonly() { + return readOnlyFrontEnd; + } + + /** + * @brief calculates size required for all requests, allocates memory and updates pointers + */ + void commit() { + // 1st stage -- looking for expandable bind requests: + for (auto &originated : _future_heap) { + if (originated._type == REQUEST_BIND) continue; + size_t offset = 0; + iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) { + if (&originated == &reference) { + offset = 0; + } + offset += binded._offset; + auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment); + auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment); + auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment); + + originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad; + }); + } + + updateSectionsSizes(); + + _total = _rw_section_size + _ro_section_size; + + // allocation with memory setting to 0 internally + heap = allocate(_total); + auto setupOffsets = [&](std::function filter, size_t offset) { + for (auto &re : _future_heap) { + if (re._type == REQUEST_BIND) continue; + if (filter(re)) continue; + + auto sz = re._element_size * re._num_elements; + + if (re._ptr_out != nullptr) { + auto cptr = heap.get() + offset; + *reinterpret_cast(re._ptr_out) = cptr; + // 
std::cout << "ALLOCATED=" << cptr << ", size=" << re._element_size * re._num_elements << "\n"; + iterate_binded(re, [](MemRequest & reference, MemRequest & binded) { + *reinterpret_cast(binded._ptr_out) = + binded._offset + reinterpret_cast(*reinterpret_cast(reference._ptr_out)); + }); + + // std::cout << "size=" << ALIGN(sz, re._alignment) << "\n" << std::flush; + + switch (re._type) { + case REQUEST_ALLOCATE :break; + case REQUEST_STORE : { + if (re._ptr_in != nullptr) { + memcpy(cptr, re._ptr_in, sz); + } else { + size_t of = 0; + for (int i = 0; i < re._num_elements; i++, of += re._element_size) { + std::copy(std::begin(re._data), std::end(re._data), cptr + of); + } + } + break; + } + case REQUEST_INITIALIZER : { + re._initializer(cptr, sz); + break; + } + } + } + + offset += ALIGN(sz + re._padding, re._alignment); + } + }; + + setupOffsets([](MemRequest & request) { + return request._region != REGION_RW; + }, 0); + + setupOffsets([](MemRequest & request) { + return request._region != REGION_RO; + }, _rw_section_size); + } + + void *getBasePtr() { + return heap.get(); + } + + size_t getRWBytes() { + updateSectionsSizes(); + return _rw_section_size; + } + + size_t getTotalBytes() { + updateSectionsSizes(); + return _total; + } + + protected: + rRegion regionType() const override { + return REGION_RW; + }; + std::vector & futureHeap() override { + return _future_heap; + } + std::list> &localStorage() override { + return _local_storage; + } + + template + void iterate_binded(MemRequest & reference, const T & visitor) { + for (auto &re : _future_heap) { + if (re._type == REQUEST_BIND && re._ptr_in == reference._ptr_out) { + // std::cout << " [binded=" << re._ptr_out <<"]\n"; + visitor(reference, re); + // TODO: no circular dependency checking, only tree-style dependency supported + iterate_binded(re, visitor); + } + } + } + + + std::shared_ptr allocate(size_t bytes) { + std::shared_ptr sp(_allocator.allocate(bytes), [=](uint8_t *p) { + _allocator.deallocate(p, bytes); + }); + std::fill(sp.get(), sp.get() + bytes, 0); + return sp; + } + + protected: + void updateSectionsSizes() { + // count total size and size of read/write regions + _rw_section_size = 0; + _ro_section_size = 0; + for (auto &re : _future_heap) { + auto current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment); +#ifdef GNA_HEAP_PROFILER + std::cout << "chunk: " << " region: " << re._region << ", " << + "type: " << (re._type == REQUEST_STORE ? "store " : re._type == REQUEST_BIND ? 
"bind " : "alloc ") << + std::setw(10) << re._num_elements << ", " << + static_cast(re._element_size) << ", " << + re._padding << ", " << + re._offset << ", " << + re._alignment << std::endl; +#endif + if (re._type == REQUEST_BIND) continue; + + if (re._region == REGION_RW) { + _rw_section_size += current; + } else { + _ro_section_size += current; + } + } + _rw_section_size = ALIGN(_rw_section_size, _page_alignment); + _ro_section_size = ALIGN(_ro_section_size, _page_alignment); + } +}; +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_memory_state.hpp b/inference-engine/src/gna_plugin/gna_memory_state.hpp new file mode 100644 index 00000000000000..7edcb02e5bfd09 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_memory_state.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "gna_plugin.hpp" + +namespace GNAPluginNS { + +class GNAMemoryState : public InferenceEngine::MemoryStateInternal { + std::shared_ptr plg; + public: + using Ptr = InferenceEngine::MemoryStateInternal::Ptr; + + explicit GNAMemoryState(std::shared_ptr plg) + : InferenceEngine::MemoryStateInternal("GNAResetState"), plg(plg) {} + void Reset() override { + plg->Reset(); + } +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp index 74f3af1125e1a0..3b14b8c81c4950 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.cpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,20 +7,12 @@ #include
#include #include -#include - -#if defined __INTEL_COMPILER || defined _MSC_VER -#include -#else +#ifndef _WIN32 #include #endif - -#include "gna_plugin.hpp" +#include #include "gna_model_serial.hpp" - -inline void writeNBytes(const void *ptr, uint32_t size, std::ostream & os) { - os.write(static_cast(ptr), size); -} +#include "gna_plugin_log.hpp" template inline void writeBits(const T & obj, std::ostream & os) { @@ -32,10 +24,6 @@ inline void readBits(T & obj, std::istream & is) { is.read(reinterpret_cast(&obj), sizeof(T)); } -inline void readNBytes(void * ptr, uint32_t size, std::istream & is) { - is.read(reinterpret_cast(ptr), size); -} - template inline void readNBits(T & obj, std::istream & is) { std::array tmp; @@ -44,15 +32,11 @@ inline void readNBits(T & obj, std::istream & is) { obj = * reinterpret_cast(&tmp.front()); } -inline void * offsetToPointer(void * const base, uint64_t offset) { - return reinterpret_cast(base) + offset; -} - template inline void readOffset(T & ptr, void *base, std::istream & is) { uint64_t offset = 0ull; readBits(offset, is); - ptr = reinterpret_cast(offsetToPointer(base, offset)); + ptr = reinterpret_cast(reinterpret_cast(base) + offset); } union { @@ -79,8 +63,8 @@ ModelHeader GNAModelSerial::ReadHeader(std::istream &is) { std::hex << std::setw(2) << static_cast(header.gnam[2]) << std::hex << std::setw(2) << static_cast(header.gnam[3]); } - if (header.version.major != HEADER_MAJOR) { - THROW_GNA_EXCEPTION << "Imported file unsupported: major version should be == " << HEADER_MAJOR; + if (header.version.major < 1) { + THROW_GNA_EXCEPTION << "Imported file unsupported: major version sould be > 1"; } if (header.headerSize < sizeof(header)) { THROW_GNA_EXCEPTION << "Unsupported header size minimal value is : " << sizeof (header) << ", but read: " << header.headerSize; @@ -96,228 +80,10 @@ ModelHeader GNAModelSerial::ReadHeader(std::istream &is) { return header; } -#define offsetFromBase(field)\ -getOffsetFromBase(field, #field) - -#if GNA_LIB_VER == 2 - -bool IsEmptyTensor(const Gna2Tensor& t) { - return t.Type == Gna2DataTypeNone && - t.Data == nullptr && - t.Layout[0] == '\0' && - t.Mode == Gna2TensorModeDefault && - t.Shape.NumberOfDimensions == 0; -} - -const std::map> GnaParamSize{ - {Gna2OperationTypeFullyConnectedAffine, {sizeof(Gna2BiasMode), sizeof(uint32_t)}}, - {Gna2OperationTypeConvolution, { - sizeof(Gna2Shape), - sizeof(Gna2BiasMode), - sizeof(Gna2PoolingMode), - sizeof(Gna2Shape), - sizeof(Gna2Shape), - sizeof(Gna2Shape)}}, - {Gna2OperationTypeCopy, {sizeof(Gna2Shape)}}, -}; - -void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream & is) { +void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream & is) { is.exceptions(std::istream::failbit); - for (auto operation = gna2Model->Operations; operation != gna2Model->Operations + gna2Model->NumberOfOperations; ++operation) { - readNBits<32>(operation->Type, is); - readBits(operation->NumberOfOperands, is); - operation->Operands = static_cast(gnaUserAllocator(sizeof(Gna2Tensor*) * operation->NumberOfOperands)); - for (uint32_t i = 0; i < operation->NumberOfOperands; i++) { - Gna2Tensor t{}; - readBits(t, is); - if (IsEmptyTensor(t)) { - operation->Operands[i] = nullptr; - } else { - operation->Operands[i] = static_cast(gnaUserAllocator(sizeof(Gna2Tensor))); - t.Data = offsetToPointer(basePointer, reinterpret_cast(t.Data)); - const_cast(*operation->Operands[i]) = t; - } - } - readBits(operation->NumberOfParameters, is); - switch (operation->Type) { - 
case Gna2OperationTypeElementWiseAffine: - case Gna2OperationTypeFullyConnectedAffine: - case Gna2OperationTypeConvolution: - case Gna2OperationTypeCopy: - break; - case Gna2OperationTypeRecurrent: - THROW_GNA_EXCEPTION << "Importing of recurrent operation not supported"; - case Gna2OperationTypeTransposition: - THROW_GNA_EXCEPTION << "Importing of transposition operation not supported"; - default: - THROW_GNA_EXCEPTION << "Importing of unknown GNA operation type(" << operation->Type << ") not supported"; - } - if (operation->NumberOfParameters > 0) - operation->Parameters = static_cast(gnaUserAllocator(sizeof(void*) * operation->NumberOfParameters)); - else - operation->Parameters = nullptr; - for (uint32_t i = 0; i < operation->NumberOfParameters; i++) { - uint32_t paramSize; - readBits(paramSize, is); - if (paramSize == 0) { - operation->Parameters[i] = nullptr; - continue; - } - operation->Parameters[i] = gnaUserAllocator(paramSize); - readNBytes(operation->Parameters[i], paramSize, is); - - if (GnaParamSize.at(operation->Type).size() <= i) { - THROW_GNA_EXCEPTION << "Cannot import parameter of index: " << i; - } - if (paramSize != GnaParamSize.at(operation->Type).at(i)) { - THROW_GNA_EXCEPTION << "Parameter size mismatch on import: " << i; - } - } - } - - // writing memory information - uint32_t nStates = 0; - readBits(nStates, is); - if (pstates != nullptr) { - pstates->resize(nStates); - } - - for (int i = 0; i != nStates; i++) { - void *pSegment; - readOffset(pSegment, basePointer, is); - uint32_t segmentSz; - readBits(segmentSz, is); - if (pstates) { - (*pstates)[i] = { pSegment, segmentSz }; - } - } - - - // once structure has been read lets read whole gna graph - is.read(reinterpret_cast(basePointer), gnaGraphSize); -} - - -uint32_t guessGrouping(Gna2Model const& model) { - if (model.NumberOfOperations == 0 || - model.Operations == nullptr || - model.Operations[0].Operands == nullptr || - model.Operations[0].NumberOfOperands == 0 || - model.Operations[0].Operands[0]->Shape.NumberOfDimensions < 2) { - THROW_GNA_EXCEPTION << "Can not guess grouping"; - } - return (std::min)(model.Operations[0].Operands[0]->Shape.Dimensions[0], model.Operations[0].Operands[0]->Shape.Dimensions[1]); -} - -void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const { - os.exceptions(std::ostream::failbit); - - const std::vector - layers(gna2Model->Operations, gna2Model->Operations + gna2Model->NumberOfOperations); - - - // all offsets will be from this pointer - auto getOffsetFromBase = [basePointer, &gnaGraphSize](void * pointer, const char * name = nullptr) { - auto offset = static_cast(std::distance(reinterpret_cast(basePointer), reinterpret_cast(pointer))); - if (offset > gnaGraphSize) { - THROW_GNA_EXCEPTION << "offset to " << (name == nullptr ? 
"" : name) << "(0x" << pointer - << ") not in range segment retuned from GNAAlloc(0x" << basePointer << "-0x" - << reinterpret_cast(reinterpret_cast(basePointer) + gnaGraphSize) << ")"; - } - return offset; - }; - - auto getTensorWithProperOffset = [&getOffsetFromBase](const Gna2Tensor& tensor) { - Gna2Tensor out = tensor; - out.Data = reinterpret_cast(getOffsetFromBase(tensor.Data)); - return out; - }; - - auto convert_to_serial = [getOffsetFromBase](const GNAModelSerial::RuntimeEndPoint& ep) { - ModelHeader::EndPoint out; - out.elements_count = ep.elements_count; - out.descriptor_offset = offsetFromBase(ep.descriptor_ptr); - out.scaleFactor = ep.scaleFactor; - out.element_size = ep.element_size; - return out; - }; - /** - * writing header - */ - ModelHeader header; - header.gnam[0] = 'G'; - header.gnam[1] = 'N'; - header.gnam[2] = 'A'; - header.gnam[3] = 'M'; - header.headerSize = sizeof(ModelHeader); - header.version.major = HEADER_MAJOR; - header.version.minor = HEADER_MINOR; - header.gnaMemSize = gnaGraphSize; - header.layersCount = layers.size(); - header.nGroup = guessGrouping(*gna2Model); - header.input = convert_to_serial(input); - header.output = convert_to_serial(output); - - header.nRotateRows = nRotateRows; - header.nRotateColumns = nRotateColumns; - - - writeBits(header, os); - - for (const auto & layer : layers) { - writeBits(static_cast(layer.Type), os); - writeBits(layer.NumberOfOperands, os); - - for (uint32_t i = 0; i < layer.NumberOfOperands; i++) { - if (layer.Operands[i] == nullptr) - writeBits(Gna2Tensor{}, os); - else - writeBits(getTensorWithProperOffset(*layer.Operands[i]), os); - } - - writeBits(layer.NumberOfParameters, os); - - // writing parameters - switch (layer.Type) { - case Gna2OperationTypeElementWiseAffine: - case Gna2OperationTypeFullyConnectedAffine: - case Gna2OperationTypeConvolution: - case Gna2OperationTypeCopy: - break; - case Gna2OperationTypeRecurrent: - THROW_GNA_EXCEPTION << "Exporting of recurrent operation not supported"; - case Gna2OperationTypeTransposition: - THROW_GNA_EXCEPTION << "Exporting of interleave operation not supported"; - default: - THROW_GNA_EXCEPTION << "Exporting of unknown GNA operation type(" << layer.Type << ") not supported"; - } - for (uint32_t i = 0; i < layer.NumberOfParameters; i++) { - if (layer.Parameters[i] == nullptr) { - writeBits(static_cast(0), os); - continue; - } - const auto paramSize = GnaParamSize.at(layer.Type).at(i); - writeBits(paramSize, os); - writeNBytes(layer.Parameters[i], paramSize, os); - } - } - // writing memory information - writeBits(static_cast(states.size()), os); - for (auto && state : states) { - writeBits(offsetFromBase(state.first), os); - writeBits(state.second, os); - } - - // once structure has been written lets push gna graph - os.write(reinterpret_cast(basePointer), gnaGraphSize); -} -#else - -void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream & is) { - is.exceptions(std::istream::failbit); - - auto readPwl = [&is, basePointer](intel_pwl_func_t & value) { + auto readPwl = [&is, basePointer] (intel_pwl_func_t & value) { readBits(value.nSegments, is); if (value.nSegments != 0) { readOffset(value.pSegments, basePointer, is); @@ -338,74 +104,60 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream // reading layers structs switch (layer->nLayerKind) { - case INTEL_AFFINE_DIAGONAL: - case INTEL_AFFINE: { - layer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); - if (layer->pLayerStruct == nullptr) { - 
THROW_GNA_EXCEPTION << "could not allocate memory for intel_affine_layer_t structure."; - } - - auto &affine = *reinterpret_cast(layer->pLayerStruct); - readBits(affine.affine.nBytesPerWeight, is); - readBits(affine.affine.nBytesPerBias, is); - readOffset(affine.affine.pWeights, basePointer, is); - readOffset(affine.affine.pBiases, basePointer, is); - readPwl(affine.pwl); - break; - } - case INTEL_CONVOLUTIONAL: { - layer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64); - if (layer->pLayerStruct == nullptr) { - THROW_GNA_EXCEPTION << "could not allocate memory for intel_convolutional_layer_t structure."; + case INTEL_AFFINE_DIAGONAL: + case INTEL_AFFINE: { + layer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); + if (layer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for intel_affine_layer_t structure."; + } + + auto &affine = *reinterpret_cast(layer->pLayerStruct); + readBits(affine.affine.nBytesPerWeight, is); + readBits(affine.affine.nBytesPerBias, is); + readOffset(affine.affine.pWeights, basePointer, is); + readOffset(affine.affine.pBiases, basePointer, is); + readPwl(affine.pwl); + break; } - - auto &convolution = *reinterpret_cast(layer->pLayerStruct); - readBits(convolution.nFilterCoefficients, is); - readBits(convolution.nBytesFilterCoefficient, is); - readBits(convolution.nBytesBias, is); - readBits(convolution.nFilters, is); - readBits(convolution.nFeatureMaps, is); - readBits(convolution.nFeatureMapRows, is); - readBits(convolution.nFeatureMapColumns, is); - readBits(convolution.nFilterRows, is); - readOffset(convolution.pFilters, basePointer, is); - readOffset(convolution.pBiases, basePointer, is); - readBits(convolution.nPoolSize, is); - readBits(convolution.nPoolStride, is); - readBits(convolution.poolType, is); - readPwl(convolution.pwl); - break; - } - - case INTEL_COPY: { - layer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64); - if (layer->pLayerStruct == nullptr) { - THROW_GNA_EXCEPTION << "could not allocate memory for intel_copy_layer_t structure."; + case INTEL_CONVOLUTIONAL: { + layer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64); + if (layer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION <<"could not allocate memory for intel_convolutional_layer_t structure."; + } + + auto &convolution = *reinterpret_cast(layer->pLayerStruct); + readBits(convolution.nFilterCoefficients, is); + readBits(convolution.nBytesFilterCoefficient, is); + readBits(convolution.nBytesBias, is); + readBits(convolution.nFilters, is); + readBits(convolution.nFeatureMaps, is); + readBits(convolution.nFeatureMapRows, is); + readBits(convolution.nFeatureMapColumns, is); + readBits(convolution.nFilterRows, is); + readOffset(convolution.pFilters, basePointer, is); + readOffset(convolution.pBiases, basePointer, is); + readBits(convolution.nPoolSize, is); + readBits(convolution.nPoolStride, is); + readBits(convolution.poolType, is); + readPwl(convolution.pwl); + break; } - auto © = *reinterpret_cast(layer->pLayerStruct); - readBits(copy.nCopyRows, is); - readBits(copy.nCopyCols, is); - break; - } - - case INTEL_RECURRENT: - THROW_GNA_EXCEPTION << "Importing of recurrent layer not supported"; - case INTEL_INTERLEAVE: - THROW_GNA_EXCEPTION << "Importing of interleave layer not supported"; - case INTEL_DEINTERLEAVE: - THROW_GNA_EXCEPTION << "Importing of deinterleave layer not supported"; - default: - THROW_GNA_EXCEPTION << "Importing of unknown GNA layer kind(" << layer->nLayerKind << ") not 
supported"; + case INTEL_RECURRENT: + THROW_GNA_EXCEPTION << "Importing of recurrent layer not supported"; + case INTEL_INTERLEAVE: + THROW_GNA_EXCEPTION << "Importing of interleave layer not supported"; + case INTEL_DEINTERLEAVE: + THROW_GNA_EXCEPTION << "Importing of deinterleave layer not supported"; + case INTEL_COPY: + THROW_GNA_EXCEPTION << "Importing of copy layer not supported"; + default: + THROW_GNA_EXCEPTION << "Importing of unknown GNA layer kind(" << layer->nLayerKind << ") not supported"; } // reading offsets of inputs/outputs readOffset(layer->pInputs, basePointer, is); - if (layer->nLayerKind == INTEL_COPY) { - layer->pOutputsIntermediate = nullptr; - } else { - readOffset(layer->pOutputsIntermediate, basePointer, is); - } + readOffset(layer->pOutputsIntermediate, basePointer, is); readOffset(layer->pOutputs, basePointer, is); } @@ -417,13 +169,13 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream } for (int i = 0; i != nStates; i++) { - void *pSegment; - readOffset(pSegment, basePointer, is); - uint32_t segmentSz; - readBits(segmentSz, is); - if (pstates) { - (*pstates)[i] = { pSegment, segmentSz }; - } + void *pSegment; + readOffset(pSegment, basePointer, is); + uint32_t segmentSz; + readBits(segmentSz, is); + if (pstates) { + (*pstates)[i] = {pSegment, segmentSz}; + } } @@ -431,6 +183,10 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream is.read(reinterpret_cast(basePointer), gnaGraphSize); } +#define offsetFromBase(field)\ +getOffsetFromBase(field, #field) + + /** * * @param ptr_nnet @@ -438,7 +194,6 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream * about base adress it is relatively easy to calculate * @param os */ - void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const { os.exceptions(std::ostream::failbit); @@ -451,7 +206,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea auto offset = static_cast(std::distance(reinterpret_cast(basePointer), reinterpret_cast(pointer))); if (offset > gnaGraphSize) { THROW_GNA_EXCEPTION << "offset to " << (name == nullptr ? "" : name) << "(0x" << pointer - << ") not in range segment returned from GNAAlloc(0x" << basePointer << "-0x" + << ") not in range segment retuned from GNAAlloc(0x" << basePointer << "-0x" << reinterpret_cast(reinterpret_cast(basePointer) + gnaGraphSize) << ")"; } return offset; @@ -536,28 +291,21 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea break; } - case INTEL_COPY: { - auto © = *reinterpret_cast(layer.pLayerStruct); - writeBits(copy.nCopyRows, os); - writeBits(copy.nCopyCols, os); - break; - } - case INTEL_RECURRENT: THROW_GNA_EXCEPTION << "Exporting of recurrent layer not supported"; case INTEL_INTERLEAVE: THROW_GNA_EXCEPTION << "Exporting of interleave layer not supported"; case INTEL_DEINTERLEAVE: THROW_GNA_EXCEPTION << "Exporting of deinterleave layer not supported"; + case INTEL_COPY: + THROW_GNA_EXCEPTION << "Exporting of copy layer not supported"; default: THROW_GNA_EXCEPTION << "Exporting of unknown GNA layer kind(" << layer.nLayerKind << ") not supported"; } // writing offsets from base. 
writeBits(offsetFromBase(layer.pInputs), os); - if (layer.nLayerKind != INTEL_COPY) { - writeBits(offsetFromBase(layer.pOutputsIntermediate), os); - } + writeBits(offsetFromBase(layer.pOutputsIntermediate), os); writeBits(offsetFromBase(layer.pOutputs), os); } // writing memory information @@ -570,5 +318,3 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea // once structure has been written lets push gna graph os.write(reinterpret_cast(basePointer), gnaGraphSize); } - -#endif diff --git a/inference-engine/src/gna_plugin/gna_model_serial.hpp b/inference-engine/src/gna_plugin/gna_model_serial.hpp index 28dacfb306fe6b..0ba5be5ab16e55 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.hpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,10 +8,6 @@ #include #include #include "gna-api.h" -#include "gna_plugin_log.hpp" -#if GNA_LIB_VER == 2 -#include "gna2-model-api.h" -#endif #pragma pack(push, 1) @@ -19,16 +15,10 @@ * version history * 1.0 - basic support * 1.1 - added memory information - * 2.0 - for use with GNA2 library */ -#if GNA_LIB_VER == 2 -#define HEADER_MAJOR 2 -#define HEADER_MINOR 0 -#else + #define HEADER_MAJOR 1 #define HEADER_MINOR 1 -#endif - /** * @brief Header version 1.0 @@ -64,10 +54,12 @@ struct ModelHeader { * @brief Number of GNA Layers */ uint64_t layersCount = 0ull; + /** * @brief Grouping level */ uint32_t nGroup = 0u; + /** * Convolution related setting - they are affecting input transformation */ @@ -108,25 +100,25 @@ struct ModelHeader { class GNAModelSerial { public: /* - * In runtime endpoint mostly same as in serial version, except of descriptor field + * In runtime endpoint mostly same as in serial version, except for descriptor field */ struct RuntimeEndPoint { /** * if scale factor is different then pased into infer , network might need to be requantized */ - float scaleFactor = 0; + float scaleFactor; /** * Pointer descriptor */ - void* descriptor_ptr = nullptr; + void* descriptor_ptr; /** * Endpoint resolution in bytes.
*/ - uint32_t element_size = 0; + uint32_t element_size; /** * Number of elements */ - uint32_t elements_count = 0; + uint32_t elements_count; RuntimeEndPoint() = default; RuntimeEndPoint(double scaleFactor, @@ -141,11 +133,7 @@ class GNAModelSerial { using MemoryType = std::vector>; private: -#if GNA_LIB_VER == 2 - Gna2Model * gna2Model; -#else intel_nnet_type_t *ptr_nnet; -#endif RuntimeEndPoint input, output; uint32_t nRotateRows = 0; uint32_t nRotateColumns = 0; @@ -153,41 +141,28 @@ class GNAModelSerial { MemoryType states, *pstates = nullptr; public: -#if GNA_LIB_VER == 2 - GNAModelSerial(Gna2Model * model, MemoryType & states_holder) - : gna2Model(model), pstates(&states_holder) { + /** + * + * @brief Used for import/export + * @param ptr_nnet + * @param inputScale - in/out parameter representing input scale factor + * @param outputScale - in/out parameter representing output scale factor + */ + GNAModelSerial(intel_nnet_type_t *ptr_nnet, MemoryType &states_holder) + : ptr_nnet(ptr_nnet) , pstates(&states_holder) { } + /** + * @brief used for export only since runtime params are not passed by pointer + * @param ptr_nnet + * @param runtime + */ GNAModelSerial( - Gna2Model * model, + intel_nnet_type_t *ptr_nnet, RuntimeEndPoint input, - RuntimeEndPoint output) : gna2Model(model), input(input), output(output) { + RuntimeEndPoint output) : ptr_nnet(ptr_nnet), input(input), output(output) { } -#else - /** - * - * @brief Used for import/export - * @param ptr_nnet - * @param inputScale - in/out parameter representing input scale factor - * @param outputScale - in/out parameter representing output scale factor - */ - GNAModelSerial(intel_nnet_type_t *ptr_nnet, MemoryType &states_holder) - : ptr_nnet(ptr_nnet), pstates(&states_holder) { - } - - /** - * @brief used for export only since runtime params are not passed by pointer - * @param ptr_nnet - * @param runtime - */ - GNAModelSerial( - intel_nnet_type_t *ptr_nnet, - RuntimeEndPoint input, - RuntimeEndPoint output) : ptr_nnet(ptr_nnet), input(input), output(output) { - } -#endif - GNAModelSerial & SetInputRotation(uint32_t nRotateRows, uint32_t nRotateColumns) { this->nRotateColumns = nRotateColumns; this->nRotateRows = nRotateRows; @@ -231,4 +206,4 @@ class GNAModelSerial { void Export(void *basePtr, size_t gnaGraphSize, std::ostream &os) const; -}; +}; \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index 1a961647832ec1..620aa489c1b175 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -1,13 +1,32 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #define NOMINMAX +#include "cpp_interfaces/base/ie_plugin_base.hpp" +#include "gna_plugin.hpp" +#include "ie_plugin_config.hpp" +#include "debug.h" +#include "blob_factory.hpp" +#include "gna_plugin_log.hpp" +#include "gna_layer_info.hpp" +#include +#include +#include "ie_memcpy.h" + +#ifdef PLOT +void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t* pNeuralNetwork); +#endif -#include +#include +#include #include +#include +#include #include -#include +#include +#include +#include #include #include #include @@ -15,62 +34,87 @@ #include #include #include -#include -#include - -#include -#include -#include -#include +#include +#include +#include "details/caseless.hpp" +#include +#include "gna-api.h" +#include "gna-api-dumper.h" +#include "dnn.h" 
+#include "pwl.h" +#include "util.h" +#include "quantization/quantization.h" +#include "lstm.hpp" +#include "graph_tools.hpp" #include "gna_plugin_config.hpp" -#include -#include "gna_plugin.hpp" -#include "optimizer/gna_pass_manager.hpp" -#include "layers/gna_layer_type.hpp" -#include "preprocessing.hpp" -#include "frontend/weights_converter.hpp" -#include "frontend/model_quantizer.hpp" -#include "gna_fused_iterator.hpp" -#include "backend/am_intel_dnn.hpp" -#include "memory/gna_allocator.hpp" -#include "memory/gna_memory_state.hpp" +#include "gna/gna_config.hpp" +#include "quantization/model_quantizer.hpp" #include "gna_model_serial.hpp" +#include "gna_memory_state.hpp" +#include "details/ie_cnn_network_tools.h" -#if GNA_LIB_VER == 2 -#include - -uint32_t ToByteSize(const Gna2DataType type) { - switch (type) { - case Gna2DataTypeInt8: - case Gna2DataTypeUint8: - return 1; - case Gna2DataTypeInt16: - case Gna2DataTypeUint16: - return 2; - case Gna2DataTypeInt32: - case Gna2DataTypeUint32: - return 4; - case Gna2DataTypeInt64: - case Gna2DataTypeUint64: - return 8; - default: - return 0; - } -} - -constexpr uint32_t GNAPluginNS::GNAPlugin::FAKE_REQUEST_CONFIG_ID; -#endif using namespace InferenceEngine; using namespace std; using namespace GNAPluginNS; using namespace InferenceEngine::details; -#ifdef __clang__ -namespace InferenceEngine { - template<> - InferenceEngine::TBlob >::~TBlob() { free(); } +#ifdef VERBOSE +#define VERBOSE_LEVEL (1) +#else +#define VERBOSE_LEVEL (0) +#endif + +#ifdef PLOT +#define PLOT_LEVEL (1) +#else +#define PLOT_LEVEL (0) +#endif + + +#define PAGE_SIZE_BYTES 4096 + +#define FROM_IR_DIM(mem, idx)\ +((mem->dims.size() > idx - 1) ? mem->dims[idx - 1] : 1) + +inline int16_t GNAPluginNS::ConvertFloatToInt16(float src) { + float rounding_value = (src > 0) ? 
0.5f : -0.5f; + float value = src + rounding_value; + if (value > 32767.0) { + return 32767; + } else if (value < -32768.0) { + return -32768; + } + return (int16_t)value; +} + +void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst, + const float *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor) { + if (!ptr_dst || !ptr_src) { + return; + } + for (uint32_t i = 0; i < num_rows*num_columns; i++) { + ptr_dst[i] = GNAPluginNS::ConvertFloatToInt16(ptr_src[i]*scale_factor); + } +} +void GNAPluginNS::ConvertToFloat(float *ptr_dst, + int32_t *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor) { + if (!ptr_dst || !ptr_src) { + return; + } + for (uint32_t i = 0; i < num_rows; i++) { + int32_t *ptr_int_row = ptr_src + i * num_columns; + float *ptr_float_row = ptr_dst + i * num_columns; + for (uint32_t j = 0; j < num_columns; j++) { + ptr_float_row[j] = static_cast(ptr_int_row[j]) / scale_factor; + } + } } -#endif // __clang__ template void GNAPlugin::copyInputData(T *dst, @@ -79,8 +123,7 @@ void GNAPlugin::copyInputData(T *dst, uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride, - intel_dnn_orientation_t orientation, - float scaleFactor) { + intel_dnn_orientation_t orientation) { if (!dst || !src) { return; } @@ -88,7 +131,7 @@ void GNAPlugin::copyInputData(T *dst, for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_vector_elements; j++) { if (!std::is_same::value) { - dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor); + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * input_scale_factor); } else { dst[j * num_group + i] = src[i * num_vector_elements + j]; } @@ -107,26 +150,25 @@ void GNAPlugin::copyInputData(T *dst, } else { if (!std::is_same::value) { for (uint32_t i = 0; i < num_frames; i++) { - T *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride; - const U *ptr_src_vec = reinterpret_cast(src) + i * num_vector_elements; + T *ptr_dst_vec = const_cast(reinterpret_cast(dst) + i * num_vector_stride); + U *ptr_src_vec = const_cast(reinterpret_cast(src) + i * num_vector_elements); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); for (int j=0; j < num_vector_elements; j++) { - ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor); + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * input_scale_factor); } } } else { for (uint32_t i = 0; i < num_frames; i++) { - void *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride * sizeof(T); - const void *ptr_src_vec = reinterpret_cast(src) + i * num_vector_elements * sizeof(U); + void *ptr_dst_vec = const_cast(reinterpret_cast(dst) + i * num_vector_stride * sizeof(T)); + void *ptr_src_vec = const_cast(reinterpret_cast(src) + i * num_vector_elements * sizeof(U)); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); - ie_memcpy(ptr_dst_vec, num_vector_elements * sizeof(T), - ptr_src_vec, num_vector_elements * sizeof(T)); + std::memcpy(ptr_dst_vec, ptr_src_vec, num_vector_elements * sizeof(T)); } } for (uint32_t i = num_frames; i < num_group; i++) { - void *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride * sizeof(T); + void *ptr_dst_vec = const_cast(reinterpret_cast(dst) + i * num_vector_stride * sizeof(T)); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); } } @@ -136,8 +178,7 @@ template void GNAPlugin::copyInputDataWithSplit(T *const dst, 
const U *src, const GNASplitLayer& splitInfo, - size_t precision_size, - int idx) { + size_t precision_size) { if (!dst || !src) { return; } @@ -146,15 +187,11 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst, precision_size = sizeof(T); // we found split/slice layer connected to Input for (auto&& outputLayer : splitInfo.splitOutputLayers) { - uint32_t begin = outputLayer.offset / precision_size; + uint32_t begin = outputLayer.offset/precision_size; uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size; - if (dst_ptr - dst >= end) { - // output layer with bind pointer as previous one. Skip - continue; - } for (uint32_t i = begin; i < end; ++i) { if (!std::is_same::value) { - *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * inputsDesc->getScaleFactor(idx)); + *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * input_scale_factor); } else { *(dst_ptr++) = *(src_ptr++); } @@ -167,7 +204,7 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst, } void GNAPlugin::ExportScores(void *ptr_dst, - const void *ptr_src, + void *ptr_src, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, @@ -181,7 +218,7 @@ void GNAPlugin::ExportScores(void *ptr_dst, if (orientation == kDnnInterleavedOrientation) { if (num_bytes_per_element == 2) { int16_t *dst = reinterpret_cast(ptr_dst); - const int16_t *src = reinterpret_cast(ptr_src); + int16_t *src = reinterpret_cast(ptr_src); for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_active_elements; j++) { dst[i * num_vector_elements + j] = src[j * num_group + i]; @@ -192,7 +229,7 @@ void GNAPlugin::ExportScores(void *ptr_dst, } } else if (num_bytes_per_element == 4) { // should work for both int and float int32_t *dst = reinterpret_cast(ptr_dst); - const int8_t *src = reinterpret_cast(ptr_src); + int8_t *src = reinterpret_cast(ptr_src); for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_active_elements; j++) { auto input_ptr = src + (j * num_group + i) * num_bytes_per_element_input; @@ -200,11 +237,11 @@ void GNAPlugin::ExportScores(void *ptr_dst, switch (num_bytes_per_element_input) { case 2 : { - *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); + *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); break; } case 4 : { - *dst_ptr = *reinterpret_cast(input_ptr); + *dst_ptr = *reinterpret_cast(input_ptr); break; } default: @@ -221,19 +258,17 @@ void GNAPlugin::ExportScores(void *ptr_dst, } else { if (num_bytes_per_element == 2) { for (uint32_t i = 0; i < num_frames; i++) { - auto ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(int16_t); - auto ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(int16_t); + void *ptr_dst_vec = reinterpret_cast (reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(int16_t)); + void *ptr_src_vec = reinterpret_cast (reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(int16_t)); memset(ptr_dst_vec, 0, num_vector_elements * sizeof(int16_t)); - ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(int16_t), - ptr_src_vec, num_active_elements * sizeof(int16_t)); + memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(int16_t)); } } else if (num_bytes_per_element == 4) { // should work for both int and float for (uint32_t i = 0; i < num_frames; i++) { - void *ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(float); - const void *ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(float); + void 
*ptr_dst_vec = reinterpret_cast (reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(float)); + void *ptr_src_vec = reinterpret_cast (reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(float)); memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float)); - ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float), - ptr_src_vec, num_active_elements * sizeof(float)); + memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(float)); } } else { THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes"; @@ -245,105 +280,1204 @@ void GNAPlugin::ImportFrames( void *ptr_dst, const void *ptr_src, Precision input_precision, - float scaleFactor, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride) { + // special case if split/slice layers connected + // with Input detected + auto it = split_connection.end(); + if (split_connection.size() != 0) { + it = std::find_if(split_connection.begin(), split_connection.end(), [] + (const std::pair &item) -> bool { + return CaselessEq()(item.second.splitInputLayer.name, "Input"); + }); + } if (orientation == kDnnInterleavedOrientation) { // TODO : fix that as well - if (input_precision == Precision::U8) { - auto src = reinterpret_cast(ptr_src); - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); - } else if (input_precision.size() == 2) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (input_precision.size() == 2) { + int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + int16_t *src = const_cast(reinterpret_cast(ptr_src)); + if (it != split_connection.end()) { + copyInputDataWithSplit(dst, src, it->second, input_precision.size()); + } else { + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } } else if (input_precision.size() == 4) { if (!gnadevice) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + float *dst = const_cast(reinterpret_cast(ptr_dst)); + float *src = const_cast(reinterpret_cast(ptr_src)); + if (it != split_connection.end()) { + copyInputDataWithSplit(dst, src, it->second, input_precision.size()); + } else { + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } } else { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + int16_t *dst = reinterpret_cast(ptr_dst); + const float *src = reinterpret_cast(ptr_src); + if (it != split_connection.end()) { + copyInputDataWithSplit(dst, src, it->second, input_precision.size()); + } else { + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } } } } else { - if (input_precision == Precision::U8) { - auto src = reinterpret_cast(ptr_src); + if (input_precision.size()== 2) { + int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + int16_t *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, 
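// Illustrative sketch, not part of this patch (hypothetical buffers): the layout change
// behind the "interleaved" orientation handled by copyInputData and ExportScores above.
// Host frames are frame-major (frame i, element j lives at i * num_elements + j); the
// interleaved layout keeps the same value at j * num_group + i, so importing input is a
// transpose and exporting scores is the inverse transpose.
#include <cstdint>

static void interleave(const float *src, float *dst, uint32_t num_frames,
                       uint32_t num_group, uint32_t num_elements) {
    for (uint32_t i = 0; i < num_frames; ++i)
        for (uint32_t j = 0; j < num_elements; ++j)
            dst[j * num_group + i] = src[i * num_elements + j];
}

static void deinterleave(const float *src, float *dst, uint32_t num_frames,
                         uint32_t num_group, uint32_t num_elements) {
    for (uint32_t i = 0; i < num_frames; ++i)
        for (uint32_t j = 0; j < num_elements; ++j)
            dst[i * num_elements + j] = src[j * num_group + i];
}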
num_vector_stride, orientation); + } else if (input_precision.size() == 4) { if (!gnadevice) { - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + float *dst = const_cast(reinterpret_cast(ptr_dst)); + float *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } else { - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + uint16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + float *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } + } + } +} - } else if (input_precision.size()== 2) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); - } else if (input_precision.size() == 4) { - if (!gnadevice) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); +void GNAPlugin::fillMemoryConnections(std::map>& + memoryPairs) { + for (auto &memory : memoryPairs) { + auto inputLayer = memory.second[1]; + auto outputLayer = memory.second[0]; + + IE_ASSERT(1 == outputLayer->insData.size()); + + // creating connection for layers output as form of extramap + memory_connection.emplace_back(memory.first, GNAMemoryLayer(inputLayer, outputLayer)); + } +} + +void GNAPlugin::fillConcatConnections(InferenceEngine::CNNLayerPtr layer) { + // creating connection for each layer outputs as form of extramap + GNAPlugin::GNAConcatLayer layerInfoItem(layer); + size_t concat_size = 0; + std::string& id = layer->name; + + for (size_t i = 0; i < layer->insData.size(); ++i) { + auto dataInput = layer->insData[i].lock(); + if (!dataInput) { + THROW_GNA_EXCEPTION << "Input layer pointer for concat is unexpectedly absent"; + } + + auto ptrConcatLayerInput = dataInput->creatorLayer.lock(); + if (!ptrConcatLayerInput) { + THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent"; + } + layerInfoItem.concatInputLayers.emplace_back( + GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo({ptrConcatLayerInput->name, concat_size})); + + size_t layer_size = + InferenceEngine::details::product(begin(dataInput->dims), + end(dataInput->dims)) * dataInput->precision.size(); + concat_size += layer_size; + } + layerInfoItem.reserved_size = concat_size; + concat_connection.emplace(id, layerInfoItem); +} + +void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) { + // creating connection for each layer inputs as form of extramap + GNAPlugin::GNASplitLayer layerInfoItem(layer); + size_t split_size = 0; + std::string& id = layer->name; + auto dataInput = layer->insData.begin()->lock(); + if (!dataInput) { + THROW_GNA_EXCEPTION << "Input layer pointer for split/slice is unexpectedly absent"; + } + auto ptrSplitLayerInput = dataInput->creatorLayer.lock(); + if (!ptrSplitLayerInput) { + THROW_GNA_EXCEPTION << "Input layer for split/slice is unexpectedly absent"; + } + + LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput); + for (size_t i = 0; i < layer->outData.size(); ++i) { + size_t padding = 0; + size_t layer_size = 0; + auto& 
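// Illustrative sketch, not part of this patch: the offset bookkeeping performed by
// fillConcatConnections above. Each concat input is assigned the running sum of the
// flattened byte sizes of the inputs before it, and the final sum becomes the reserved
// size of the shared concat buffer. ConcatInput and ConcatSlot are hypothetical types
// introduced only for this example.
#include <cstddef>
#include <functional>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

struct ConcatInput { std::string name; std::vector<size_t> dims; size_t precision_size; };
struct ConcatSlot  { std::string name; size_t offset; };

static std::pair<std::vector<ConcatSlot>, size_t>
layoutConcat(const std::vector<ConcatInput> &inputs) {
    std::vector<ConcatSlot> slots;
    size_t running = 0;
    for (const auto &in : inputs) {
        slots.push_back({in.name, running});                       // offset before this input is counted
        size_t bytes = std::accumulate(in.dims.begin(), in.dims.end(),
                                       size_t{1}, std::multiplies<size_t>()) * in.precision_size;
        running += bytes;                                          // next input starts right after
    }
    return {slots, running};                                       // running == reserved_size
}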
dataOutput = layer->outData[i]; + + if (!dataOutput || !dataInput) { + THROW_GNA_EXCEPTION << "Output layer pointer for split/slice is unexpectedly absent"; + } + + for (auto&& ptrSplitLayerOutputPair : dataOutput->getInputTo()) { + auto& ptrSplitLayerOutput = ptrSplitLayerOutputPair.second; + if (!ptrSplitLayerOutput) { + THROW_GNA_EXCEPTION << "Output layer for split/slice is unexpectedly absent"; + } + + padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize()) + * dataOutput->precision.size(); + layer_size = + InferenceEngine::details::product(begin(dataOutput->dims), + end(dataOutput->dims)) * dataOutput->precision.size(); + + layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, layer_size); + } + + split_size += ptrSplitLayerInputLayerInfo.isInput() ? + ALIGN64(padding + layer_size): + padding + layer_size; + } + layerInfoItem.reserved_size = split_size; + layerInfoItem.splitInputLayer = + GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo({ptrSplitLayerInput->type, 0, + InferenceEngine::details::product(begin(dataInput->dims), + end(dataInput->dims)) * dataInput->precision.size()}); + split_connection.emplace(id, layerInfoItem); +} + +void GNAPlugin::DiagonalPrimitive(InferenceEngine::CNNLayerPtr layer) { + AffinePrimitive(layer, true); +} + +void GNAPlugin::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &convolution = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_feature_map_rows = FROM_IR_DIM(inputs, 1) / convolution._stride_x; + uint32_t num_feature_map_columns = FROM_IR_DIM(inputs, 3) * convolution._stride_x / num_feature_maps; + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 3); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_padding = ALIGN(convolution._kernel_x * num_feature_map_columns * num_feature_maps, 8) + - convolution._kernel_x * num_feature_map_columns * num_feature_maps; + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + // TODO: questionable why for biases that are no in IR we inventing precision + auto biasPrecision = convolution._biases ? convolution._biases->precision() : outputs->precision; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n"; +#endif + auto num_input_padding = ALIGN(num_feature_maps * num_feature_map_columns * num_feature_map_rows, 8) + - num_feature_maps * num_feature_map_columns * num_feature_map_rows; + auto num_filter_rows = convolution._kernel_x / convolution._stride_x; + dnn.InitConvolutional1DComponent(currentComponent, + 1, + num_feature_maps * num_feature_map_columns * num_feature_map_rows + num_input_padding, + 1, + num_rows_out * convolution._out_depth, + inputs->precision.size(), + outputs->precision.size(), + convolution._weights->precision().size(), + biasPrecision.size(), + convolution._out_depth, + num_filter_rows, + num_feature_maps * num_feature_map_columns * num_filter_rows + num_padding, + + num_feature_maps, // interesting - why this is so in gna_example + num_feature_map_rows, + num_feature_map_columns, + + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
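// Illustrative sketch, not part of this patch, assuming ALIGN(x, n) rounds x up to the
// next multiple of n and ALIGN64 does the same with n = 64. This is the rounding used
// above for split offsets (ALIGN64 of the input slice size) and for row padding such as
// num_padding = ALIGN(num_rows_in, 8) - num_rows_in.
#include <cstdint>

static inline uint32_t align_up(uint32_t value, uint32_t boundary) {
    return ((value + boundary - 1) / boundary) * boundary;         // e.g. align_up(100, 64) == 128
}

static inline uint32_t padding_for(uint32_t count, uint32_t boundary = 8) {
    return align_up(count, boundary) - count;                      // e.g. padding_for(10) == 6, padding_for(16) == 0
}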
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases); + + // update num_feature_maps for next convolutional layer + num_feature_maps = convolution._out_depth; // = number of filters + + size_t num_data_bytes_out = + InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size(); + + auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input; + + // TODO: convolution might be not the first layer in sorted order but connected via split for example - dont know how kaldi will handle that + if (LayerInfo(connectedInputLayer).isInput()) { + // Kaldi features are opposite orientation + dnn.num_rotate_rows = num_feature_map_columns; + dnn.num_rotate_columns = num_feature_map_rows; + } + + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + // rotate + auto TransposeMatrix = [](uint8_t *ptr_matrix, size_t element_size, uint32_t num_rows, uint32_t num_cols) { + std::vector temp_buffer(num_rows * num_cols * element_size); + for (uint32_t i = 0; i < num_rows; i++) { + for (uint32_t j = 0; j < num_cols; j++) { + ie_memcpy(&temp_buffer.front() + (j*num_rows + i)*element_size, + temp_buffer.size() - (i * num_cols + j) * element_size, + ptr_matrix + (i*num_cols+j)*element_size, + element_size); + } + } + return temp_buffer; + }; + + std::vector transposedWeights; + for (uint32_t k = 0; k < convolution._out_depth; k++) { + uint8_t *ptr_filt_current + = convolution._weights->cbuffer().as() + k * num_columns_in * convolution._kernel[X_AXIS] * convolution.precision.size(); + auto transposedPart = TransposeMatrix(ptr_filt_current, convolution.precision.size(), num_columns_in, convolution._kernel[X_AXIS]); + transposedWeights.insert(transposedWeights.end(), transposedPart.begin(), transposedPart.end()); + } + + if (num_padding == 0) { + gnamem->readonly().push_local_ptr(ptr_weights, transposedWeights.data(), convolution._weights->byteSize(), 64); + } else { + auto elementsIn = convolution._kernel_x * num_feature_map_columns + num_padding; + auto paddedWeights = elementsIn * convolution._out_depth; + auto paddedWeightsSize = paddedWeights * convolution.precision.size(); + auto elements_in_row = convolution._kernel_x * num_feature_map_columns; + gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) { + for (int i = 0; i < convolution._out_depth; i++) { + memcpy(data, + transposedWeights.data() + elements_in_row * i * convolution.precision.size(), + elements_in_row * convolution.precision.size()); + + data = reinterpret_cast(data) + elementsIn * convolution.precision.size(); + } + }, 64); + } + + if (convolution._biases) { + gnamem->readonly().push_ptr(ptr_biases, + convolution._biases->cbuffer().as(), + convolution._biases->byteSize(), + 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + +void GNAPlugin::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &power = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + if (power.power != 1.0) { + THROW_IE_EXCEPTION << "[GNA plugin] unsupported power factor, expected 1 but was " << power.power; + } + + auto input = layer->insData[0].lock(); + + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(input, 1); + uint32_t num_columns_in = FROM_IR_DIM(input, 2); + uint32_t num_rows_out = 
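// Illustrative sketch, not part of this patch: the per-filter transpose performed by the
// TransposeMatrix lambda in ConvolutionPrimitive above. Element (i, j) of a rows x cols
// matrix is copied to slot (j, i) of the result; working on raw bytes with element_size
// lets the same routine handle int16 and float weights.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<uint8_t> transpose_matrix(const uint8_t *src, size_t element_size,
                                             uint32_t rows, uint32_t cols) {
    std::vector<uint8_t> dst(static_cast<size_t>(rows) * cols * element_size);
    for (uint32_t i = 0; i < rows; ++i) {
        for (uint32_t j = 0; j < cols; ++j) {
            std::memcpy(dst.data() + (static_cast<size_t>(j) * rows + i) * element_size,
                        src + (static_cast<size_t>(i) * cols + j) * element_size,
                        element_size);
        }
    }
    return dst;
}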
num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in, + num_columns_in, + num_rows_out, + input->precision.size(), + outputs->precision.size(), + // TODO: only fp32 and Int16 tested + quantized == nullptr ? input->precision.size() : 2, + quantized == nullptr ? input->precision.size() : 4, + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + true); + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n"; +#endif + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = InferenceEngine::details::product(begin(input->dims), end(input->dims)) + * input->precision.size(); + + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); + + if (power.scale != 1.0f) { + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_weights, power.scale, num_rows_out, 64); + } else { + auto scaledIdentity = quantized->_weights_quant.scale * power.scale; + + #define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) + + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } + } + + if (power.offset != 0.0f) { + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + +void GNAPlugin::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &pooling = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 3); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_columns_out = FROM_IR_DIM(outputs, 3); + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n"; +#endif + switch (pooling._type) { + case PoolingLayer::MAX: break; + // we are loosing precision here + case PoolingLayer::AVG: + default: + // TODO: convert to SUMM pooling + THROW_GNA_EXCEPTION << "Layer :" << layer->name << " not supported"; + } + + dnn.InitMaxpoolComponent(currentComponent, + 1, + num_columns_in * num_rows_in , + 1, + num_columns_out * num_rows_out, + inputs->precision.size(), + outputs->precision.size(), + pooling._kernel[X_AXIS], + pooling._kernel[X_AXIS], + num_columns_in, + false, + quantized == nullptr ? 
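// Illustrative sketch, not part of this patch, assuming the diagonal affine component
// built by PowerPrimitive and DiagonalPrimitive applies its weights element-wise rather
// than as a full matrix. Under that assumption a Power layer with power == 1 becomes
// output[i] = scale * input[i] + bias[i], which is why the code above fills num_rows_out
// copies of power.scale (or its quantized equivalent) into the weight buffer.
#include <cstddef>

static void diagonal_affine(const float *input, float *output,
                            const float *weights, const float *biases, size_t rows) {
    for (size_t i = 0; i < rows; ++i)
        output[i] = weights[i] * input[i] + biases[i];
}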
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs); + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); +} + +void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_columns_out = FROM_IR_DIM(outputs, 2); + uint32_t num_padding_in = ALIGN(num_rows_in, 8) - num_rows_in; + uint32_t num_padding_out = ALIGN(num_rows_out, 8) - num_rows_out; + void *ptr_inputs; + void *ptr_outputs; + auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitCopyComponent(currentComponent, + orientation, + num_rows_in + num_padding_in, + num_columns_in, + num_rows_out + num_padding_out, + num_columns_out, + inputs->precision.size(), + outputs->precision.size(), + quantized == nullptr ? 1 : quantized->_dst_quant.scale, + num_rows_out + num_padding_out, + num_columns_out, + ptr_inputs, + ptr_outputs); + + size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product( + begin(outputs->dims), end(outputs->dims)), 8) + * outputs->precision.size(); + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding_in) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); +} + +void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto concatLayer = dynamic_cast (layer.get()); + + if (concatLayer == nullptr) { + return; + } + if (concatLayer->insData.size() != 2) { + THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers."; + } + + auto prevInput0 = concatLayer->insData[0].lock(); + auto prevInput1 = concatLayer->insData[1].lock(); + if (!prevInput0 || !prevInput1) { + THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent"; + } + if (prevInput0->precision.size() != prevInput1->precision.size()) { + THROW_GNA_EXCEPTION << "Different precision for Concat input layers are not supported"; + } + + for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) { + if ( LayerInfo(outLayer.second).isConcat() ) { + auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second; + connectOutput(layer, &concatLayerInfo.gna_ptr, + &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size); + } + } +} + +void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto cropLayer = dynamic_cast (layer.get()); + + if (cropLayer == nullptr) { + return; + } + if (cropLayer->axis.size() > 1) { + THROW_GNA_EXCEPTION << + "Crop layer does not support the number of cropped dimentions = " + << cropLayer->axis.size() << "."; + } + + auto quantized = InferenceEngine::getInjectedData(layer); + size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); + size_t cropSize = cropLayer->dim.back() * 
cropLayer->precision.size(); + + if (ALIGN(cropOffset, 8) == cropOffset) { + // leave crop as it is + GNAPlugin::GNACropLayer cropLayerInfoItem(layer); + std::string& id = layer->name; + crop_connection.emplace(id, cropLayerInfoItem); + auto cropLayerInfo = crop_connection.find(cropLayer->name); + + if (cropLayerInfo == crop_connection.end()) { + THROW_GNA_EXCEPTION << + "Item is not in the storage but it was added recently...\n"; + } + + // calculate index idx for connectInput last parameter + connectInput(layer, &cropLayerInfo->second.gna_ptr, cropSize + cropOffset, cropOffset, 0); + + // cases for certain output layers + for (auto &&outLayer : layer->outData.front()->getInputTo()) { + auto& nextLayer = outLayer.second; + if ( LayerInfo(nextLayer).isConcat() ) { + connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropSize); + } + } + } else { + gnalog() << "Crop " << layer->name << " is being replaced by Affine layer...\n"; + auto outputs = *layer->outData.begin(); + auto inputs = layer->insData.begin()->lock(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in + num_padding, + num_columns_in, + num_rows_out, + inputs->precision.size(), + 4, + quantized == nullptr ? inputs->precision.size() : 2, + 4, + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + false); + + size_t num_data_bytes_out = + InferenceEngine::details::product( + begin(outputs->dims), end(outputs->dims)) * 4; + + size_t num_data_bytes_in = num_columns_in * + (num_rows_in + num_padding) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + gnamem->readonly().push_initializer(ptr_weights, num_rows_out * (num_rows_in + num_padding)*layer->precision.size(), [=](void * data, size_t size) { + int out = 0; + for (int input = cropLayer->offset.back(); input < num_rows_out + cropLayer->offset.back(); ++input) { + auto mem_ptr = reinterpret_cast(data) + input * layer->precision.size() + out * (num_rows_in+num_padding) * layer->precision.size(); + if (quantized == nullptr) { + auto float_ptr = reinterpret_cast(mem_ptr); + *float_ptr = 1.0f; + } else { + auto int_ptr = reinterpret_cast(mem_ptr); + *int_ptr = 1; + } + ++out; + } + }, 64); + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } + } +} + +void GNAPlugin::SplitPrimitive(InferenceEngine::CNNLayerPtr layer) { +// Nothing to do +} + +void GNAPlugin::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) { +// Nothing to do +} + +void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &eltwise = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + // for eltwise should be one input of 4 bytes and one of 2 bytes - detecting that + auto inputs2Bytes = layer->insData[0].lock(); + auto inputs4Bytes = layer->insData[1].lock(); + + int biasesLayerIdx = 1; + + if (quantized) { + if (eltwise._operation == EltwiseLayer::Sum) { + if (inputs4Bytes->precision.size() != 4) { + std::swap(inputs4Bytes, inputs2Bytes); + biasesLayerIdx = 0; + } + IE_ASSERT(inputs2Bytes->precision.size() == 2); + IE_ASSERT(inputs4Bytes->precision.size() == 4); + } else { + // for mul both inputs should be 2 bytes precision + IE_ASSERT(inputs2Bytes->precision.size() == 2); + IE_ASSERT(inputs4Bytes->precision.size() == 2); + } + } + + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2); + uint32_t num_rows_out = num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in, + num_columns_in, + num_rows_out, + inputs2Bytes->precision.size(), + outputs->precision.size(), + // TODO: only fp32 and Int16 tested + quantized == nullptr ? inputs2Bytes->precision.size() : 2, + quantized == nullptr ? inputs4Bytes->precision.size() : 4, + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
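// Illustrative sketch, not part of this patch: the weight pattern written by the
// push_initializer above when an unaligned Crop is lowered to an affine layer. The
// matrix is num_rows_out x (num_rows_in + num_padding), all zeros except a single 1 per
// output row at column (offset + row), so the matrix-vector product copies exactly the
// cropped range of the input.
#include <cstddef>
#include <vector>

static std::vector<float> crop_as_affine_weights(size_t rows_in_padded, size_t rows_out, size_t offset) {
    std::vector<float> weights(rows_out * rows_in_padded, 0.0f);
    for (size_t out = 0; out < rows_out; ++out)
        weights[out * rows_in_padded + offset + out] = 1.0f;       // one-hot row selecting input[offset + out]
    return weights;
}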
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + true); + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n"; +#endif + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs2Bytes->dims), end(inputs2Bytes->dims)) + * inputs2Bytes->precision.size(); + + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx); + + switch (eltwise._operation) { + case EltwiseLayer::Sum: + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64); + } else { + auto scaledIdentity = quantized->_weights_quant.scale; + + #define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) + + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } + connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); + break; + + case EltwiseLayer::Prod: + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); } else { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } + connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx); + break; + + default: + THROW_GNA_EXCEPTION << "Unsupported eltwise operation: " << eltwise._operation; + } +} + +void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) { + auto &weightable = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = isDiag ? num_rows_in : FROM_IR_DIM(outputs, 1); + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + // TODO: questionable why for biases that are no in IR we inventing precision + auto biasPrecision = weightable._biases ? weightable._biases->precision() : outputs->precision; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << (isDiag ? "diagonal_" : "affine_") << dnnComponentsForLayer.size() - 1 << "\n"; +#endif + + dnn.InitAffineComponent(currentComponent, + num_rows_in + num_padding, + num_columns_in, + num_rows_out, + inputs->precision.size(), + outputs->precision.size(), + weightable._weights->precision().size(), + biasPrecision.size(), + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
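// Illustrative sketch, not part of this patch (hypothetical helpers): how the Sum/Prod
// switch above maps Eltwise onto the diagonal affine. For Sum the weights are a constant
// identity (1.0f, or the scaled and saturated int16 identity when quantized) and the
// second input is wired in as the bias vector; for Prod the biases are zero and the
// second input is wired in as the weights, so the same y[i] = w[i] * x[i] + b[i] kernel
// serves both operations.
#include <cstddef>

static void eltwise_sum(const float *a, const float *b, float *y, size_t n) {
    for (size_t i = 0; i < n; ++i) y[i] = 1.0f * a[i] + b[i];      // weights = identity, biases = b
}

static void eltwise_prod(const float *a, const float *b, float *y, size_t n) {
    for (size_t i = 0; i < n; ++i) y[i] = b[i] * a[i] + 0.0f;      // weights = b, biases = 0
}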
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + isDiag); + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size(); + + auto connectionInfo = connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + auto transpose = false; + auto transposedRows = 0; + auto transposedCols = 0; + /** + * TODO: enable transpose correction between Conv/affine layers implement dedicated pass + * TF topologies have inplace permutes so we dont care + * kaldi topologies did this internally + */ + if (0 && connectionInfo.needTransposeWeights) { + gnalog() << "Transposing weights for layer: " << layer->name << "\n"; + // direct order is 0, 1, 2, 3, supported order is only 0,3,2,1 where dim 2 is usually equals to 1 + auto permuteOrder = connectionInfo.permute->GetParamAsInts("order"); + if (permuteOrder != vector({0, 3, 2, 1})) { + THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") << + ", but only support 0, 3, 2, 1"; + } + transpose = !isDiag; + transposedRows = connectionInfo.permute->input()->getDims()[3]; + transposedCols = connectionInfo.permute->input()->getDims()[1]; + } + + if (num_padding == 0) { + if (!transpose) { + gnamem->readonly().push_ptr(ptr_weights, + weightable._weights->cbuffer().as(), + weightable._weights->byteSize(), + 64); + } else { + // ToDO: write unit tests for transpose + gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) { + for (int k = 0; k < (isDiag ? 1 : num_rows_out); k++) { + auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size(); + auto cbuffer = weightable._weights->cbuffer().as() + rowOffset; + auto u8Data = reinterpret_cast(data) + rowOffset; + for (int j = 0; j < transposedCols; j++) { + for (int i = 0; i < transposedRows; i++) { + auto offsetWrite = (transposedRows * j + i) * weightable.precision.size(); + auto offsetRead = (i * transposedCols + j) * weightable.precision.size(); + memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size()); + } + } + } + }, 64); + } + } else { + auto elementsIn = (num_rows_in + num_padding) * num_columns_in; + auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out; + auto paddedWeightsSize = paddedWeights * weightable.precision.size(); + + gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) { + for (int i = 0; i < (isDiag ? 
1 : num_rows_out); i++) { + memcpy(data, + weightable._weights->cbuffer().as() + num_rows_in * i * weightable.precision.size(), + num_rows_in * weightable.precision.size()); + data = reinterpret_cast(data) + (num_rows_in + num_padding) * weightable.precision.size(); } + }, 64); + } + + if (weightable._biases) { + gnamem->readonly().push_ptr(ptr_biases, + weightable._biases->cbuffer().as(), + weightable._biases->byteSize(), + 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + +void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto *generic = dynamic_cast(layer.get()); + std::string type; + std::vector ptr_pwl_segments; + uint32_t num_rows; + uint32_t num_columns; + void *ptr_inputs; + void *ptr_outputs; + + do { + if (generic == nullptr) { + type = layer->type; + break; + } + + if (CaselessEq()(layer->type, "activation")) { + type = generic->GetParamAsString("type"); + break; + } else { + type = layer->type; + break; } + } while (false); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + auto quantized = InferenceEngine::getInjectedData(layer); + float output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; + + auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; + + if (inputs->dims.size() == 4) { + num_columns = FROM_IR_DIM(inputs, 3) * FROM_IR_DIM(inputs, 1); + num_rows = 1; + } else { + num_columns = FROM_IR_DIM(inputs, 2); + num_rows = FROM_IR_DIM(inputs, 1); } + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs->dims), end(inputs->dims)) + * inputs->precision.size(); + + static caseless_unordered_map supportedActivations = { + {"sigmoid", kActSigmoid}, + {"tanh", kActTanh}, + {"relu", kActRelu}, + {"leakyrelu", kActLeakyRelu}, + {"clamp", kActKaldiLstmClipping}, + {"identity", kActIdentity} + }; + + auto it = supportedActivations.find(type); + if (it == supportedActivations.end()) { + THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type; + } + auto activation_type = DnnActivation::fromType(it->second); + activation_type.negative_slope = (it->second == kActRelu) ? 
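// Illustrative sketch, not part of this patch: the padded weight packing used by
// AffinePrimitive above when num_padding != 0. Each source row of rows_in elements is
// copied into a destination row of rows_in + padding elements (the padding is left as
// zeros here), matching num_padding = ALIGN(num_rows_in, 8) - num_rows_in used earlier
// so that every uploaded row spans a multiple of 8 elements.
#include <cstddef>
#include <cstring>
#include <vector>

template <typename T>
static std::vector<T> pack_padded_rows(const T *src, size_t rows_out, size_t rows_in, size_t padding) {
    std::vector<T> dst(rows_out * (rows_in + padding), T{0});
    for (size_t i = 0; i < rows_out; ++i)
        std::memcpy(dst.data() + i * (rows_in + padding), src + i * rows_in, rows_in * sizeof(T));
    return dst;
}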
dynamic_cast(layer.get())->negative_slope : 0.0f; + + // TODO: need to take graph dependency instead of linear + auto &prevComponent = dnnComponentsForLayer.back().second; + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + + intel_pwl_segment_t *ptr_pwl_segments_target = nullptr; + + if (!inputs->precision.is_float()) { + // TODO: generalize activation function code + // now that scale factors are known, create PWL approximations to activation functions + float input_scale_factor = dnn.OutputScaleFactor(prevComponent); + if (uniformPwlDesign) { + switch (activation_type) { + case kActSigmoid:ptr_pwl_segments.resize(SIGMOID_NUM_SEGMENTS); + break; + case kActTanh:ptr_pwl_segments.resize(TANH_NUM_SEGMENTS); + break; + case kActRelu:ptr_pwl_segments.resize(RELU_NUM_SEGMENTS); + break; + case kActLeakyRelu:ptr_pwl_segments.resize(RELU_NUM_SEGMENTS); + break; + case kActKaldiLstmClipping: + case kActIdentity:ptr_pwl_segments.resize(IDENTITY_NUM_SEGMENTS); + break; + case kActCustom: + default:THROW_GNA_EXCEPTION << "Activation function type not yet supported " << activation_type; + } + PwlDesign16(activation_type, + &*ptr_pwl_segments.begin(), + static_cast(ptr_pwl_segments.size()), + input_scale_factor, + output_scale_factor); + } else { + PwlDesignOpt16(activation_type, + ptr_pwl_segments, + input_scale_factor, + output_scale_factor); + } + ptr_pwl_segments_target = reinterpret_cast(&ptr_pwl_segments_target); + } + + dnn.InitPiecewiseLinearComponent(currentComponent, + activation_type, + orientation, + num_rows, + num_columns, + inputs->precision.size(), + outputs->precision.size(), + ptr_pwl_segments.size(), + output_scale_factor, + ptr_inputs, + ptr_outputs, + ptr_pwl_segments_target); +#ifdef PLOT +#define GET_ACTIVATION_NAME(name)\ +case name:\ + actName = #name;\ + break; + string actName = "unknown"; + switch (activation_type) { + GET_ACTIVATION_NAME(kActSigmoid); + GET_ACTIVATION_NAME(kActTanh); + GET_ACTIVATION_NAME(kActRelu); + GET_ACTIVATION_NAME(kActLeakyRelu); + GET_ACTIVATION_NAME(kActKaldiLstmClipping); + GET_ACTIVATION_NAME(kActIdentity); + } + cout << "IR layer : " << std::left << std::setw(20) << layer->name << actName << "_" << dnnComponentsForLayer.size() - 1 <<"\n"; +#endif + + connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + if (ptr_pwl_segments_target != nullptr) { + gnamem->readonly().push_local_ptr(ptr_pwl_segments_target, + &ptr_pwl_segments.front(), + ptr_pwl_segments.size() * sizeof(intel_pwl_segment_t), + 64); + } +} + + +void GNAPlugin::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) { + auto layerOrder = layer->GetParamAsInts("order"); + + if (layerOrder != vector({0, 3, 2, 1})) { + THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") << + ", but only support 0,3,2,1"; + } +} + +class LayersBuilder { + using CreatorFnc = std::function; + + public: + LayersBuilder(const std::vector &types, CreatorFnc callback) { + for (auto && str : types) { + getStorage()[str] = callback; + } + } + static caseless_unordered_map &getStorage() { + static caseless_unordered_map LayerBuilder; + return LayerBuilder; + } +}; + +#define CREATE(name) [](GNAPlugin *p, CNNLayerPtr l) {p->name(l);} +void SKIP(GNAPlugin*, CNNLayerPtr) {} + +void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) { + static const LayersBuilder layersBuilder[] = { + {{"Input"}, [](GNAPlugin*, 
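// Illustrative sketch, not part of this patch and not the intel_pwl_segment_t encoding:
// the idea behind the segment tables produced by PwlDesign16 / PwlDesignOpt16 above. An
// activation such as sigmoid or tanh is approximated by segments, each with a start
// abscissa, a start ordinate and a slope; evaluation finds the last segment whose start
// is <= x and extends it linearly. The real tables are integer-encoded with per-segment
// slope scales, which this float version deliberately omits.
#include <vector>

struct PwlSegment { float x_base; float y_base; float slope; };

static float pwl_eval(const std::vector<PwlSegment> &segments, float x) {
    // assumes at least one segment, sorted by x_base in ascending order
    const PwlSegment *active = &segments.front();
    for (const auto &s : segments)
        if (s.x_base <= x) active = &s;
    return active->y_base + active->slope * (x - active->x_base);
}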
CNNLayerPtr l) {}}, // skip input layers they are not used in GNA lib, only as a memory blobs + {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)}, + {{"ScaleShift"}, CREATE(DiagonalPrimitive)}, + {{"Eltwise"}, + CREATE(EltwisePrimitive)}, // same as diagonal while weights are not taken from network, rather than from another output + {{"Split"}, SKIP}, // skip information about which part of prev layer need to consume handle during layer creation + {{"Slice"}, SKIP}, + {{"clamp", "sigmoid", "relu", "tanh", "identity"}, CREATE(PWLPrimitive)}, + {{"Convolution"}, CREATE(ConvolutionPrimitive)}, + {{"Permute"}, CREATE(PermutePrimitive)}, // permute of certain form (2D transpose) can be assimilated in followed FC layer + {{"Pooling"}, CREATE(PoolingPrimitive)}, + {{"Power"} , CREATE(PowerPrimitive)}, + {{"Concat"}, CREATE(ConcatPrimitive)}, + {{"Reshape"}, SKIP}, // TODO: handled not in GNA but rather in GNA plugin + {{"Crop"}, CREATE(CropPrimitive)}, + {{"Copy"}, CREATE(CopyPrimitive)}, + }; + auto it = LayersBuilder::getStorage().find(layer->type); + if (it != LayersBuilder::getStorage().end()) { + it->second(this, layer); + } else { + THROW_GNA_EXCEPTION << "Unsupported layer: " << layer->name << ":" << layer->type; + } +} + + +GNAPlugin::GNAPlugin(const std::map& configMap) { + // holds actual value of a found key + std::string value; + auto if_set = [&](std::string key, const std::function & handler) { + auto keyInMap = configMap.find(key); + if (keyInMap != configMap.end()) { + value = keyInMap->second; + handler(); + } + }; + + if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] { + input_scale_factor = std::stod(value); + }); + + if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] { + dumpXNNPath = value; + }); + + if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] { + static caseless_unordered_map supported_values = { + {GNAConfigParams::GNA_AUTO, GNA_AUTO}, + {GNAConfigParams::GNA_HW, GNA_HARDWARE}, + {GNAConfigParams::GNA_SW, GNA_SOFTWARE}, + {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE} + }; + auto procType = supported_values.find(value); + if (procType == supported_values.end()) { + THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value; + } + gna_proc_type = static_cast(procType->second); + }); + + if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] { + if (value == PluginConfigParams::YES) { + compact_mode = true; + } else if (value == PluginConfigParams::NO) { + compact_mode = false; + } else { + THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value; + } + }); + + if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] { + if (value == PluginConfigParams::YES) { + exclusive_async_requests = true; + } else if (value == PluginConfigParams::NO) { + exclusive_async_requests = false; + } else { + THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; + } + }); + + if_set(GNA_CONFIG_KEY(PRECISION), [&] { + auto precision = Precision::FromStr(value); + if (precision != Precision::I8 && precision != Precision::I16) { + THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value; + } + gnaPrecision = precision; + }); + + if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] { + if (value == PluginConfigParams::YES) { + uniformPwlDesign = true; + } else if (value == PluginConfigParams::NO) { + uniformPwlDesign = false; + } else { + THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter " + << "should be equal to YES/NO, but not" << value; + } + }); + + if_set(CONFIG_KEY(PERF_COUNT), [&] { + if 
(value == PluginConfigParams::YES) { + performance_counting = true; + } else if (value == PluginConfigParams::NO) { + performance_counting = false; + } else { + THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter " + << "should be equal to YES/NO, but not" << value; + } + }); + + if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] { + uint64_t lib_threads = std::stoul(value, NULL, 10); + if (lib_threads == 0 || lib_threads > std::numeric_limits::max()/2-1) { + THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value + << ", should be greateer than 0 and less than 127"; + } + gna_lib_async_threads_num = lib_threads; + }); + + if_set(CONFIG_KEY(SINGLE_THREAD), [&] { + if (value == PluginConfigParams::YES) { + gna_openmp_multithreading = false; + } else if (value == PluginConfigParams::NO) { + gna_openmp_multithreading = true; + } else { + THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; + } + }); +} + +GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) { + static const caseless_map LayerNameToType = { + { "Input" , Input }, + { "Convolution" , Convolution }, + { "ReLU" , ReLU }, + { "Sigmoid" , Sigmoid }, + { "TanH" , TanH }, + { "Pooling" , Pooling }, + { "FullyConnected" , FullyConnected }, + { "InnerProduct" , InnerProduct}, + { "Split" , Split }, + { "Slice" , Slice }, + { "Eltwise" , Eltwise }, + { "Reshape" , Reshape }, + { "ScaleShift" , ScaleShift }, + { "Clamp" , Clamp }, + { "Concat" , Concat }, + { "Copy", Copy }, + { "Permute" , Permute }, + { "Power" , Power}, + { "Memory" , Memory }, + { "Crop" , Crop } + }; + auto it = LayerNameToType.find(str); + if (it != LayerNameToType.end()) + return it->second; + else + return NO_TYPE; } -GNAPlugin::GNAPlugin() { - Init(); - UpdateFieldsFromConfig(); -} +bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage) { + CNNLayerSet inputLayers; + InferenceEngine::InputsDataMap inputs; + std::unordered_set allLayers; + auto specifiedDevice = network.getTargetDevice(); + auto network_precision = network.getPrecision(); + network.getInputsInfo(inputs); + auto network_input_precision = inputs.begin()->second->getInputPrecision(); + auto batch_sise = network.getBatchSize(); + if (network_precision != Precision::FP32) { + errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n"; + return false; + } + if (network_input_precision != Precision::FP32 && + network_input_precision != Precision::I16) { + errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n"; + return false; + } + if (specifiedDevice != InferenceEngine::TargetDevice::eCPU && + specifiedDevice != InferenceEngine::TargetDevice::eGNA && + specifiedDevice != InferenceEngine::TargetDevice::eDefault) { + errMessage = "The plugin does not support target device: " + std::string(getDeviceName(specifiedDevice)) + ".\n"; + return false; + } -GNAPlugin::GNAPlugin(const std::map& configMap) { - Init(); - SetConfig(configMap); -} + if (inputs.empty()) { + errMessage = "Network is empty (GNA)\n"; + return false; + } -void GNAPlugin::Init() { - dnn = std::make_shared(backend::AMIntelDNN()); - inputsDesc = std::make_shared(GNAPluginNS::InputDesc()); - gnaFlags = std::make_shared(GNAPluginNS::GNAFlags()); + auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo(); + if (secondLayers.empty()) { + errMessage = "Network consists of input layer 
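// Illustrative sketch, not part of this patch (hypothetical keys and fields): the if_set
// pattern used by the GNAPlugin constructor above. Each known key is looked up in the
// incoming string map; when present, a small lambda parses and validates the value and
// writes the typed field, and malformed values raise an error instead of being ignored.
#include <functional>
#include <map>
#include <stdexcept>
#include <string>

struct DemoConfig {
    float scale_factor = 1.0f;
    bool compact_mode = true;

    void parse(const std::map<std::string, std::string> &options) {
        std::string value;
        auto if_set = [&](const std::string &key, const std::function<void()> &handler) {
            auto it = options.find(key);
            if (it != options.end()) { value = it->second; handler(); }
        };
        if_set("SCALE_FACTOR", [&] { scale_factor = std::stof(value); });
        if_set("COMPACT_MODE", [&] {
            if (value == "YES") compact_mode = true;
            else if (value == "NO") compact_mode = false;
            else throw std::runtime_error("COMPACT_MODE should be YES/NO, but was " + value);
        });
    }
};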
only (GNA)\n"; + return false; + } - graphCompiler.setDNNPtr(dnn); - graphCompiler.setInputDescPtr(inputsDesc); - graphCompiler.setGNAFlagsPtr(gnaFlags); -} + bool check_result = true; + InferenceEngine::details::UnorderedDFS(allLayers, + secondLayers.begin()->second, + [&](const CNNLayerPtr layer) { + if (LayerTypeFromStr(layer->type) == NO_TYPE) { + errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n"; + check_result = false; + } + if (batch_sise != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) { + check_result = false; + } + }, false); -void GNAPlugin::InitGNADevice() { -#if GNA_LIB_VER == 1 - gnadevice = std::make_shared(config.gna_proc_type, - gnaFlags->gna_lib_async_threads_num, - gnaFlags->gna_openmp_multithreading, - gnaFlags->performance_counting); -#else - gnadevice = std::make_shared(config.pluginGna2AccMode, - config.pluginGna2DeviceConsistent, - gnaFlags->gna_lib_async_threads_num, - gnaFlags->gna_openmp_multithreading, - gnaFlags->performance_counting); -#endif - size_t page_size_bytes = 4096; - gnamem = std::make_shared(memory::make_polymorph(gnadevice), page_size_bytes); - graphCompiler.setGNAMemoryPtr(gnamem); + return check_result; } void GNAPlugin::LoadNetwork(ICNNNetwork &network) { - // move blobs from Constant layers to Convolution, Deconvolution, FullyConnected layers attributes - BlobTransformation blobsTransformation; - blobsTransformation.transform(network, true); - // Check the input network std::string error; if (!AreLayersSupported(network, error)) { @@ -351,87 +1485,68 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { } // network optimisation phases - int passIdx = 0; - auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) { - auto passes = make_shared(policy, network, runBeforeCopy); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - - passes->registerPass(); - passes->registerPass(); - - passes->registerPass(); - passes->registerPass(); - - passes->registerPass(); - passes->registerPass(); - if (policy.PermutePolicy != Policy::Permute::DISABLED) { - passes->registerPass(); - } - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passIdx = passes->run(passIdx); + auto run_passes = [&] (CNNNetPtr network) { + auto layers = CNNNetSortTopologically(*network.get()); + substitutePRelu(layers); + layers = CNNNetSortTopologically(*network.get()); + reorderMaxPool(layers); + applyOrientations(layers); + insertIdentityLayer(layers); + insertDiagonalLayer(layers); }; - ICNNNetwork::Ptr newNet; - if (gnaFlags->sw_fp32) { - auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { - transformLayer(lp, WeightsConverter()); - return lp; - }; - newNet = InferenceEngine::CNNNetCopy(network, visitor); - // to run all passes need to have two calls to pass manager - run_passes(newNet, true); - run_passes(newNet, false); - } else { - switch (config.gnaPrecision) { - case Precision::I16: - ModelQuantizer q16; - newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors); - break; - case Precision::I8: - ModelQuantizer q8; - newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); - break; - default: - THROW_GNA_EXCEPTION << "no mans land for GNA precision"; - break; - } - } - - auto inputLayers = CNNNetGetAllInputLayers(*newNet); + Config supported = Config({ + {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr { + 
if (gnaPrecision == Precision::I16) { + ModelQuantizer q; + return q.quantize(network, run_passes, input_scale_factor); + } -#ifdef PLOT - std::ofstream file("gna_passes.dot"); - saveGraphToDot(*newNet, file, [](const CNNLayerPtr layer, - ordered_properties &printed_properties, - ordered_properties &node_properties) { - // printing quantized params - auto quantized = InferenceEngine::getInjectedData(layer); - if (!quantized) { - return; + if (gnaPrecision == Precision::I8) { + ModelQuantizer q; + return q.quantize(network, run_passes, input_scale_factor); + } + THROW_GNA_EXCEPTION << "no mans land for GNA precision"; + }}, + // TODO: need to have advanced precision matcher based on layers/biases + {TargetDevice::eGNA, Precision::MIXED}, + {TargetDevice::eGNA, Precision::I16}, + {TargetDevice::eCPU, Precision::FP32 +#define EMULATE_GNA_API_LAYERS +#ifdef EMULATE_GNA_API_LAYERS + , [&](InferenceEngine::ICNNNetwork & network) { + auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { + return lp; + }; + auto copiedNet = InferenceEngine::CNNNetCopy(network, visitor); + run_passes(copiedNet); + + return copiedNet; } - printed_properties.emplace_back( - "scale factor", std::to_string(quantized->_dst_quant.scale)); - }); #endif + } + }); - auto sortedNet = CNNNetSortTopologicallyEx(*newNet, make_fuzed_order); - - // passing policy to compiler - graphCompiler.setPolicy(policy); - - if (sortedNet.empty()) { - THROW_GNA_EXCEPTION << "Sorted network is empty"; + supported.setDefaultDevice(TargetDevice::eGNA); + auto newNet = supported.find_configuration(network).convert(network); + auto networkPrecision = newNet->getPrecision(); + + if (!networkPrecision.is_float()) { + gnadevice.reset(new GNADeviceHelper(gna_proc_type, + gna_lib_async_threads_num, + gna_openmp_multithreading, + performance_counting)); + gnamem.reset(new gna_memory_type( + make_polymorph(*gnadevice.get()), PAGE_SIZE_BYTES)); + } else { + gnamem.reset(new gna_memory_type(make_polymorph>())); } + // creating intel dnn_t structures from network + auto sortedNet = CNNNetSortTopologically(*newNet); std::vector sortedNoMem; - std::unordered_map> memoryPairs; + std::map> memoryPairs; // find all memory layers pairs and mark which one used as outputs for (auto &layer : sortedNet) { auto generic = dynamic_cast(layer.get()); @@ -447,199 +1562,88 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { memoryPairs[id][generic->GetParamAsInt("index")] = layer; continue; } else if (layerInfo.isConcat()) { - graphCompiler.fillConcatConnections(layer); + fillConcatConnections(layer); } else if (layerInfo.isSplit() || layerInfo.isSlice()) { - graphCompiler.fillSplitConnections(layer); + fillSplitConnections(layer); } sortedNoMem.push_back(layer); } // fill in extra storage with memory layers - graphCompiler.fillMemoryConnections(memoryPairs); - - if (!graphCompiler.memory_connection.empty()) { - gnaFlags->gna_lib_async_threads_num = 1; - } - - if (gnaFlags->sw_fp32) { - gnamem.reset(new gna_memory_type(memory::make_polymorph>())); - graphCompiler.setGNAMemoryPtr(gnamem); - } else { - InitGNADevice(); - } + fillMemoryConnections(memoryPairs); // keep inputs information and create input primitives newNet->getInputsInfo(inputsDataMap); if (inputsDataMap.empty()) { THROW_GNA_EXCEPTION << " No inputs for the topology"; } + if (inputsDataMap.size() != 1) { + THROW_GNA_EXCEPTION << " cannot infer topologies with more than one inputs"; + } + + inputDims = inputsDataMap.begin()->second->getDims(); // keep output dims newNet->getOutputsInfo(outputsDataMap); if 
(outputsDataMap.empty()) { THROW_GNA_EXCEPTION << "No outputs for the topology"; } - - for (auto && input : inputsDataMap) { - inputsDesc->getPtrInputsGlobal(input.first).resize(gnaFlags->gna_lib_async_threads_num); + if (outputsDataMap.size() != 1) { + THROW_GNA_EXCEPTION << "cannot infer topologies with more than one output"; } + outputDims = outputsDataMap.begin()->second->dims; + ptr_inputs_global.resize(gna_lib_async_threads_num); + ptr_outputs_global.resize(gna_lib_async_threads_num); // CreatingLayer primitives - for (auto & layer : sortedNoMem) { - graphCompiler.CreateLayerPrimitive(layer); - } - for (auto& inputLayer : inputLayers) { - auto layerInfo = LayerInfo(inputLayer); - if (layerInfo.isInput() && 0 == inputsDesc->bytes_allocated_for_input[inputLayer->name]) { - graphCompiler.connectOutput(inputLayer, &inputsDesc->getPtrInputsGlobal(inputLayer->name).front(), 0); - } - } - // TODO: graph might be static - should we support that - if (graphCompiler.dnnComponents.components.empty()) { - THROW_GNA_EXCEPTION << "No GNA primitives created based on topology. This might indicate trivial topology"; - } - - /// setting-up output layers information - outputsDesc.resize(outputsDataMap.size()); - - auto initOutput = [this] - (int idx, const intel_dnn_component_t & component, CNNLayerPtr layer) { - // auto idx = std::distance(outputsDataMap.begin(), outputPort); - auto & desc = outputsDesc[idx]; - auto quantized = InferenceEngine::getInjectedData(layer); - - desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num); - desc.orientation = component.orientation_out; - desc.num_bytes_per_element = component.num_bytes_per_output; - desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; - // TODO: this need to be fixed - desc.num_elements = component.num_rows_out; - - // binding ptr for first infer request - then others will be setup during relocation - gnamem->bind_ptr(&desc.ptrs.front(), &component.ptr_outputs); - }; - - int portId = 0; - for (auto && outPort : outputsDataMap) { - // gets output layer pointer in original topology not in cloned - auto outLayer = outPort.second->getCreatorLayer().lock(); - - // Memory layers are not dnnComponents hence we need to make switch with identity layer - if (outLayer->type == "Memory") { - // traverse memory connection to find corresponding output_memory - for (auto && memConnection : graphCompiler.memory_connection) { - if (memConnection.second.getInput()->name == outLayer->name) { - // if connection is found, replace memory input layer with memory output layer - outLayer = memConnection.second.getOutput(); - break; - } - } - } - - // searching for outData represented in GNA blob - // using ufs - upper first search - gnalog() << "[UFS] searching for : "<< outPort.first << " representation in GNA\n"; - bool stopSearching = false; - - CNNNetDFS(outLayer, [this, &outPort, portId, &stopSearching, &initOutput](CNNLayerPtr layer) { - auto irLayerAvatar = std::find_if( - graphCompiler.dnnComponents.components.begin(), - graphCompiler.dnnComponents.components.end(), - [&layer](std::pair & value) { - return value.first == layer->name; - }); - - gnalog() << "[UFS] from : "<< outPort.first <<" reached: " << layer->name << "\n"; - - // probing gna_primitives - if (irLayerAvatar != graphCompiler.dnnComponents.components.end()) { - initOutput(portId, irLayerAvatar->second, layer); - stopSearching = true; - } - - // probing concatInfo - if (!stopSearching && LayerInfo(layer).isConcat()) { - auto concatConnection = 
graphCompiler.concat_connection.find(layer->name); - if (concatConnection != graphCompiler.concat_connection.end()) { - //initOutput(portId, irLayerAvatar->second, layer); - - auto &desc = outputsDesc[portId]; - auto quantized = InferenceEngine::getInjectedData(layer); - - desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num); - // TODO: what is orientation for concat - desc.orientation = kDnnInterleavedOrientation; - desc.num_bytes_per_element = layer->outData.front()->getPrecision().size(); - desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; - desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element; - - // binding ptr for first infer request - then others will be setup during relocation - gnamem->bind_ptr(&desc.ptrs.front(), &concatConnection->second.gna_ptr); - stopSearching = true; - } - } - }, true, [&stopSearching](InferenceEngine::CNNLayer* from) { - return make_upstream_order(!stopSearching ? from : nullptr); - }); - if (!stopSearching) { - THROW_GNA_EXCEPTION << "unsupported topology: cannot locate " << outPort.first - << " after compiling GNA graph"; - } - portId++; + // TODO: solely gna_example convolution hack + num_feature_maps = 1; + for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) { + CreateLayerPrimitive(*layer); } + gnamem->bind_ptr(&ptr_outputs_global.front(), &dnnComponentsForLayer.back().second.ptr_outputs); - // TODO: how active list will work in multioutput case // make room for active list - gnamem->reserve_ptr(nullptr, - ALIGN64(outputsDesc.front().num_bytes_per_element * outputsDesc.front().num_elements), 64); + auto &last_component = dnnComponentsForLayer.back().second; + gnamem->reserve_ptr(nullptr, ALIGN64(last_component.num_bytes_per_output * last_component.num_rows_out)); void *pParallelExecutionData = nullptr; - // reserving more bytes for intermediate data in parallel case - TODO: this works incorrectly in compact mode at lest + // reserving more bytes for intermidiate data in parallel case - TODO: this works incorrectly in compact mode at lest rwSegmentSize = gnamem->getRWBytes(); - if (gnaFlags->gna_lib_async_threads_num > 1) { - gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64); + if (gna_lib_async_threads_num > 1) { + gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gna_lib_async_threads_num - 1)); } gnamem->commit(); - dnn->Init(gnamem->getBasePtr(), + dnn.Init(gnamem->getBasePtr(), gnamem->getTotalBytes(), - gnaFlags->sw_fp32 ? kDnnFloat : kDnnInt, + networkPrecision.is_float() ? 
kDnnFloat : kDnnInt, 1); - // TODO: this copy is unneeded; in fact, we can directly create gna structs from list - for (auto &element : graphCompiler.dnnComponents.components) { - dnn->component.push_back(element.second); + // TODO: this copy unneed infact we can directly create gna structs from list + for (auto &element : dnnComponentsForLayer) { + dnn.component.push_back(element.second); } // in fp32 mode last PWL cannot be computed without that - dnn->InitActiveList(NULL); + dnn.InitActiveList(NULL); -#if GNA_LIB_VER == 2 - gnaModels.push_back(std::make_tuple(make_shared>())); -#else - nnets.emplace_back(make_shared>(), -1, InferenceEngine::BlobMap()); -#endif - if (!gnaFlags->sw_fp32) { + nnets.push_back(std::make_tuple(make_shared>(0), -1, InferenceEngine::BlobMap())); + + if (!networkPrecision.is_float()) { // number of layer gets calculated inside that InitGNAStruct function -#if GNA_LIB_VER == 2 - dnn->InitGNAStruct(&std::get<0>(gnaModels.front())->obj); -#else - dnn->InitGNAStruct(&std::get<0>(nnets.front())->obj); -#endif + dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj); } - // creating same gna RW segment for parallel infer requests - for (int i = 1; i != gnaFlags->gna_lib_async_threads_num; i++) { -#if GNA_LIB_VER == 2 - gnaModels.push_back(std::make_tuple(make_shared>())); + // creating same gna RW segment for paralle infer requests + for (int i = 1; i != gna_lib_async_threads_num; i++) { + nnets.push_back(std::make_tuple(make_shared>(0), -1, InferenceEngine::BlobMap())); + // this can be improved by just copy all structures, but we are too lazy - dnn->InitGNAStruct(&std::get<0>(gnaModels.back())->obj); -#else - nnets.emplace_back(make_shared>(), -1, InferenceEngine::BlobMap()); - dnn->InitGNAStruct(&std::get<0>(nnets.back())->obj); -#endif + dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj); + // relocate rw pointers to new offset auto basePtr = reinterpret_cast(pParallelExecutionData) + rwSegmentSize * (i - 1); @@ -652,167 +1656,48 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { } }; - for (auto &&input : inputsDesc->ptr_inputs_global_storage) { - relocate(input[i], input[0]); - } - - // relocating all output pointers - for (int j = 0; j < outputsDesc.size(); ++j) { - relocate(outputsDesc[j].ptrs[i], outputsDesc[j].ptrs[0]); - } - -#if GNA_LIB_VER == 2 - for (int j = 0; j != std::get<0>(gnaModels.front())->obj.NumberOfOperations; j++) { - auto & gnaOperation = std::get<0>(gnaModels[i])->obj.Operations[j]; - relocate(const_cast(gnaOperation.Operands[0])->Data, gnaOperation.Operands[0]->Data); - relocate(const_cast(gnaOperation.Operands[1])->Data, gnaOperation.Operands[1]->Data); -#else + relocate(ptr_inputs_global[i], ptr_inputs_global[0]); + relocate(ptr_outputs_global[i], ptr_outputs_global[0]); for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) { auto & layer = std::get<0>(nnets[i])->obj.pLayers[j]; + relocate(layer.pInputs, layer.pInputs); relocate(layer.pOutputs, layer.pOutputs); relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate); -#endif - } - } - - // calculating input orientation without memory layers, since their orientation not changed during infer right now - std::unordered_map skippedLayers; - - bool withConv = false; - for (auto &layer : sortedNet) { - auto layerInfo = LayerInfo(layer); - if (layerInfo.isConvolution()) { - withConv = true; - break; } } - if (withConv) { - for (auto &layer : sortedNet) { - for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) { - auto prevLayer = CNNNetPrevLayer(layer.get(), i); - if 
(!skippedLayers.count(prevLayer->name)) { - if (CNNNetHasPrevLayer(prevLayer.get())) { - continue; - } - - // we are in the one of input layers - if (LayerInfo(prevLayer).isMemory()) { - continue; - } - } - - auto dnnLayer = graphCompiler.dnnComponents.findComponent(layer); - string inputName = prevLayer->name; - if (skippedLayers.count(prevLayer->name)) { - inputName = skippedLayers[prevLayer->name]; - } - - // non functional layer - skipped by gna - if (nullptr == dnnLayer) { - // storing input name for skipped layer - skippedLayers[layer->name] = inputName; - continue; - } + orientation_in = dnn.component[0].orientation_in; + orientation_out = dnn.component[dnn.num_components()-1].orientation_out; + num_bytes_per_output = dnn.component[dnn.num_components()-1].num_bytes_per_output; - // input orientation might be already initialized, thus verify that it matches - if (!inputsDesc->orientation_in.count(inputName)) { - inputsDesc->orientation_in[inputName] = dnnLayer->orientation_in; - } else { - if (inputsDesc->orientation_in[inputName] != dnnLayer->orientation_in) { - THROW_GNA_EXCEPTION << "orientation for input layer: " << inputName << "cannot be calculated"; - } - } - } - } - } else { - for (auto& inputLayer : inputLayers) { - inputsDesc->orientation_in[inputLayer->name] = kDnnInterleavedOrientation; - } - } + auto quantized = InferenceEngine::getInjectedData(sortedNoMem.back()); + output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; - num_rotate_rows = dnn->num_rotate_rows; - num_rotate_columns = dnn->num_rotate_columns; + num_rotate_rows = dnn.num_rotate_rows; + num_rotate_columns = dnn.num_rotate_columns; DumpXNNToFile(); #ifdef PLOT - dnn->WriteGraphWizModel("gna-blob.dot"); -#endif -#if GNA_LIB_VER == 2 - createRequestConfigsForGnaModels(); -#endif -} - -#if GNA_LIB_VER == 2 -void GNAPlugin::createRequestConfigsForGnaModels() { - if (!gnadevice) { - gnaRequestConfigToRequestIdMap.push_back(std::make_tuple(FAKE_REQUEST_CONFIG_ID, -1, InferenceEngine::BlobMap())); - return; - } - for (auto& model : gnaModels) { - const auto& gnaNnet = std::get<0>(model).get()->obj; - const auto modelId = gnadevice->createModel(gnaNnet); - const auto requestConfigId = gnadevice->createRequestConfig(modelId); - gnaRequestConfigToRequestIdMap.push_back(std::make_tuple(requestConfigId, -1, InferenceEngine::BlobMap())); - } -} - + dnn.WriteGraphWizModel("graph.dot"); + // ExportGnaNetworkAndrzej("layers/loaded_from_ir", &nnet->obj); #endif - -int GNAPlugin::GetDeviceVersionFromString(const std::string deviceString) { - constexpr uint32_t embeddedSuffix = 0xE; - if (deviceString.empty()) - return 0x100 + embeddedSuffix; - if (deviceString.size() == 4 && deviceString.substr(0, 3) == "GNA") { - int version = deviceString[3] - '0'; - if (version > 0) { - version <<= 8; - version += embeddedSuffix; - return version; - } - } - THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << deviceString; } - void GNAPlugin::DumpXNNToFile() const { // TODO: output precision as well as pointer might be incorrect, LSTM for sure // gna looks automatically set layer 0 as output and adjust it's pointer / precision/ size respectively - if (config.dumpXNNPath.empty()) { - return; - } - - const auto versionInt = GetDeviceVersionFromString(config.dumpXNNGeneration); - - if (!gnadevice) { - THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network"; - } - std::ofstream dumpStream(config.dumpXNNPath, std::ios::out | std::ios::binary); -#if GNA_LIB_VER == 1 - if (versionInt != 
0x10E) - THROW_GNA_EXCEPTION << "Wrong GNA version for embedded model dump: " << config.dumpXNNGeneration; - auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices); - dump.header.rw_region_size = gnamem->getRWBytes(); - dump.header.input_scaling_factor = inputsDesc->inputScaleFactors.front(); - dump.header.output_scaling_factor = outputsDesc.front().scale_factor; - dumpStream.write(reinterpret_cast(&dump.header), sizeof(intel_gna_model_header)); - dumpStream.write(reinterpret_cast(dump.model.get()), dump.header.model_size); -#else - auto const modelId = gnadevice->createModel(std::get<0>(gnaModels.front())->obj); - if (versionInt == Gna2DeviceVersionEmbedded1_0) { - auto dump = gnadevice->dumpXnn(modelId); - dump.header.RwRegionSize = gnamem->getRWBytes(); - dump.header.InputScalingFactor = inputsDesc->inputScaleFactors.front(); - dump.header.OutputScalingFactor = outputsDesc.front().scale_factor; - dumpStream.write(reinterpret_cast(&dump.header), sizeof(Gna2ModelSueCreekHeader)); - dumpStream.write(reinterpret_cast(dump.model.get()), dump.header.ModelSize); - } else { - static_assert(sizeof(versionInt) >= sizeof(Gna2DeviceVersion), ""); - gnadevice->dumpXnnForDeviceVersion(modelId, dumpStream, - *reinterpret_cast(&versionInt)); + if (!dumpXNNPath.empty()) { + if (!gnadevice) { + THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network"; + } + auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices); + dump.header.rw_region_size = gnamem->getRWBytes(); + dump.header.input_scaling_factor = input_scale_factor; + dump.header.output_scaling_factor = output_scale_factor; + std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary); + dumpStream.write(reinterpret_cast(&dump.header), sizeof(intel_gna_model_header)); + dumpStream.write(reinterpret_cast(dump.model.get()), dump.header.model_size); } - gnadevice->releseModel(modelId); -#endif } void RotateFeatures(uint8_t *ptr_feat, @@ -833,8 +1718,7 @@ void RotateFeatures(uint8_t *ptr_feat, element_size); } } - ie_memcpy(ptr_in, num_feature_vector_elements * element_size, - &temp.front(), num_feature_vector_elements * element_size); + memcpy(ptr_in, &temp.front(), num_feature_vector_elements * element_size); } } else { THROW_GNA_EXCEPTION << "Rotate dimensions (" << num_rotate_rows << "," << num_rotate_columns @@ -842,182 +1726,128 @@ void RotateFeatures(uint8_t *ptr_feat, } } -uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) { -#if GNA_LIB_VER == 2 - auto& nnets = gnaRequestConfigToRequestIdMap; -#endif +uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) { + return QueueInference(*input.begin()->second.get(), result); + + /*if (!syncPoints.empty()) { + syncPoints.back().second = result; + }*/ +} + +uint32_t GNAPlugin::QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result) { + auto inputLayout = input.layout(); + if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) { + THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " << input.layout(); + } + if (inputLayout == NCHW) { + inputLayout = NC; + } + auto is2D = input.layout() == Layout::NC || input.layout() == Layout ::CN; + auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) { return std::get<1>(item) == -1; }); if (freeNnet 
== nnets.end()) { - if (!graphCompiler.memory_connection.empty()) { - Wait(0); - freeNnet = nnets.begin(); - } else { - THROW_IE_EXCEPTION << as_status << REQUEST_BUSY - << "GNA executable network has max of " - << static_cast(gnaFlags->gna_lib_async_threads_num) - << " parallel infer requests, please sync one of already running"; - } + THROW_IE_EXCEPTION << as_status << REQUEST_BUSY + << "GNA executable network has max of " << static_cast(gna_lib_async_threads_num) + << " parallel infer requests, please sync one of already running"; } + auto nnet = std::get<0>(*freeNnet).get(); auto idx = static_cast(std::distance(std::begin(nnets), freeNnet)); - int inputNum = 0; - for (auto &input : inputs) { - auto inputLayout = input.second->getTensorDesc().getLayout(); - if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) { - THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " - << input.second->getTensorDesc().getLayout(); - } - if (inputLayout == NCHW) { - inputLayout = NC; - } - auto is2D = input.second->getTensorDesc().getLayout() == Layout::NC || input.second->getTensorDesc().getLayout() == Layout::CN; - - if (!inputsDesc->ptr_inputs_global_id.count(input.first)) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not set"; - } - - if (inputsDesc->getPtrInputsGlobal(input.first)[idx] == nullptr) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #" - << idx << " not set"; - } + if (ptr_inputs_global[idx] == nullptr) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : global input pointer not set"; + } - if (inputsDesc->getOrientation(input.first) == kDnnUnknownOrientation) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set"; - } + if (orientation_in == kDnnUnknownOrientation) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : input orientation not set"; + } - for (auto& outputDesc : outputsDesc) { - if (outputDesc.orientation == kDnnUnknownOrientation) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : output orientation not set"; - } - } + if (orientation_out == kDnnUnknownOrientation) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : output orientation not set"; + } - auto dims = input.second->getTensorDesc().getDims(); - - ImportFrames(inputsDesc->getPtrInputsGlobal(input.first)[idx], - input.second->cbuffer().as(), - input.second->getTensorDesc().getPrecision(), - gnaFlags->sw_fp32 ? 
1.0f : inputsDesc->getScaleFactor(inputNum), - inputsDesc->getOrientation(input.first), - dims[0], - is2D ? dims[dims.size() - 2] : dims[0], - is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 2] * dims[dims.size() - 3], - is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 2] * dims[dims.size() - 3]); - - bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1; - if (((inputLayout == Layout::NC || inputLayout == Layout::NCHW) - != (inputsDesc->getOrientation(input.first) == kDnnInterleavedOrientation)) - && !isOneChannel) { - RotateFeatures(reinterpret_cast(inputsDesc->getPtrInputsGlobal(input.first)[idx]), - gnadevice ? 2 : 4, - // TODO: only works for cnn4a and google command so far - dims[0], - is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 3], // num_feature_vectors looks batch should be there - num_rotate_rows, - num_rotate_columns); - } - ++inputNum; + ImportFrames(ptr_inputs_global[idx], + input.cbuffer().as(), + input.precision(), + orientation_in, + input.dims()[input.dims().size() - 1], + is2D ? input.dims()[1] : input.dims()[input.dims().size() - 1], + is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2], + is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2]); + + if ((inputLayout == Layout::NC || inputLayout == Layout::NCHW) != (orientation_in == kDnnInterleavedOrientation)) { + RotateFeatures(reinterpret_cast(ptr_inputs_global[idx]), + gnadevice ? 2 : 4, + // TODO: only works for cnn4a and google command so far + input.dims()[input.dims().size() - 1], + is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2], // num_feature_vectors looks batch should be there + num_rotate_rows, + num_rotate_columns); } if (!gnadevice) { - dnn->Propagate(); - if (freeNnet != nnets.end()) { - std::get<1>(*freeNnet) = 1; - } + dnn.Propagate(); + std::get<1>(*freeNnet) = 1; } else { -#if GNA_LIB_VER == 1 - auto nnet = std::get<0>(*freeNnet).get(); std::get<1>(*freeNnet) = gnadevice->propagate(&nnet->obj, ptr_active_indices, num_active_indices); -#else - const auto reqConfigId = std::get<0>(*freeNnet); - if (ptr_active_indices != nullptr && num_active_indices > 0 && activeLayerIndex != 0xffffffff) - gnadevice->setUpActiveList(reqConfigId, activeLayerIndex, ptr_active_indices, num_active_indices); - std::get<1>(*freeNnet) = gnadevice->propagate(reqConfigId); -#endif - } - -#ifdef PLOT - dnn->BeginNewWrite(dnn_dump_write_index); - if (dnn->num_components() != 0) { - dnn->WriteDnnText("Net_.txt", kDnnFloat); - } - dnn_dump_write_index++; -#endif - if (freeNnet != nnets.end()) { - // TODO: GNA2: Substitute properly when using GNA 2.0 Library setting and CPU - std::get<2>(*freeNnet) = result; } + std::get<2>(*freeNnet) = result; return idx; } -void GNAPlugin::Wait(uint32_t request_idx) { -#if GNA_LIB_VER == 2 - auto& nnets = gnaRequestConfigToRequestIdMap; -#endif - if (nnets.size() <= request_idx) return; // TODO: GNA2: check whether necessary +void GNAPlugin::Wait(uint32_t idx) { // already synced TODO: might be copy required ??? 
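(Editor's note, not part of the patch: the RotateFeatures call used in QueueInference above converts between the row-major, deinterleaved frame layout of the Inference Engine blob and the interleaved orientation the GNA primitives expect. A minimal sketch of that per-frame transpose is shown below; rotate_frame is an illustrative name, not the plugin's implementation.)

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Transpose one frame of features from row-major (rows x cols) to
// column-major (cols x rows) through a temporary copy, element_size bytes
// per element - the same effect RotateFeatures has on the input buffer.
static void rotate_frame(uint8_t *ptr, uint32_t rows, uint32_t cols, uint32_t element_size) {
    std::vector<uint8_t> temp(static_cast<size_t>(rows) * cols * element_size);
    for (uint32_t r = 0; r < rows; ++r) {
        for (uint32_t c = 0; c < cols; ++c) {
            std::memcpy(&temp[(static_cast<size_t>(c) * rows + r) * element_size],
                        ptr + (static_cast<size_t>(r) * cols + c) * element_size,
                        element_size);
        }
    }
    std::memcpy(ptr, temp.data(), temp.size());
}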
- if (std::get<1>(nnets[request_idx]) == -1) return; + if (std::get<1>(nnets[idx]) == -1) return; if (gnadevice) { - gnadevice->wait(std::get<1>(nnets[request_idx])); + gnadevice->wait(std::get<1>(nnets[idx])); } - std::get<1>(nnets[request_idx]) = -1; - auto &request = std::get<2>(nnets[request_idx]); + std::get<1>(nnets[idx]) = -1; + auto & output = *std::get<2>(nnets[idx]).begin()->second; #ifdef PLOT - if (dnn->num_components() != 0) { - dnn->WriteInputAndOutputText(); + dnn.BeginNewWrite(); + if (dnn.num_components() != 0) { + dnn.WriteDnnText("Net_.txt", kDnnFloat); + dnn.WriteInputAndOutputText(); } -#if GNA_LIB_VER == 1 - dnn->WriteInputAndOutputTextGNA(&std::get<0>(nnets[request_idx])->obj); -#else - dnn->WriteInputAndOutputTextGNA(std::get<0>(gnaModels[request_idx])->obj); + dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj); #endif -#endif - int output_idx = 0; - for (auto && outputBlobIt : request) { - auto & outputBlob = outputBlobIt.second; - auto & outputDesc = outputsDesc[output_idx]; - if (outputBlob->getTensorDesc().getLayout() == Layout::NC) { - // TODO: rotate can be incorporated with exporting - used only in unit tests so far - // TODO: restore: + + if (output.layout() == Layout::NC) { + // TODO: rotate can be incorporated with exporting - used only in unit tests so far + // TODO: restore: // if (orientation_out != kDnnInterleavedOrientation) { -// if (inputs.size() != 1) { -// THROW_GNA_EXCEPTION << "Invalid number of inputs for for deinterleave " << inputs.size() -// << ", only 1 supported"; -// } -// auto dims = inputs.begin()->second->dims(); // RotateFeatures(reinterpret_cast(ptr_outputs_global), // gnadevice ? 2 : 4, -// dims[dims.size() - 1], -// dims[0], // num_feature_vectors looks batch should be there -// dims[0], -// dims[dims.size() - 1]); +// input.dims()[input.dims().size() - 1], +// input.dims()[0], // num_feature_vectors looks batch should be there +// input.dims()[0], +// input.dims()[input.dims().size() - 1]); // } - auto& exportOutputDims = outputBlob->getTensorDesc().getDims(); - ExportScores(outputBlob->buffer(), - outputDesc.ptrs[request_idx], - outputDesc.orientation, - exportOutputDims[0], - exportOutputDims[exportOutputDims.size() - 2], - exportOutputDims[exportOutputDims.size() - 1], - exportOutputDims[exportOutputDims.size() - 1], - exportOutputDims[exportOutputDims.size() - 1], - outputDesc.num_bytes_per_element, - sizeof(float)); - } else if (outputBlob->getTensorDesc().getLayout() != Layout::CN) { - THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. But was " - << outputBlob->getTensorDesc().getLayout(); - } - if (gnadevice) { + ExportScores(output.buffer(), + ptr_outputs_global[idx], + orientation_out, + output.dims()[output.dims().size() - 1], + output.dims()[1], + output.dims()[0], + output.dims()[0], + output.dims()[0], + // TODO: create better getter consider multiple outputs case + gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[std::get<0>(nnets[idx])->obj.nLayers - 1].nBytesPerOutput : sizeof(float), + sizeof(float)); + } else if (output.layout() != Layout::CN) { + THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. 
But was " << output.layout(); + } + + if (gnadevice) { #ifdef PLOT FILE *f = nullptr; static int num_infers = 0; @@ -1026,93 +1856,79 @@ void GNAPlugin::Wait(uint32_t request_idx) { } num_infers++; if (f) { - auto dims = outputBlob->getTensorDesc().getDims(); - for (int i = 0; i < dims[dims.size() - 2]; i++) { - for (int j = 0; j < dims[dims.size() - 1]; j++) { - fprintf(f, "%d ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + for (int i = 0; i < output.dims()[1]; i++) { + for (int j = 0; j < output.dims()[0]; j++) { + fprintf(f, "%d ", output.cbuffer().as()[output.dims()[0] * i + j]); } fprintf(f, "\n"); - } - fprintf(f, "\n\n"); } + fprintf(f, "\n\n"); + } #endif - ConvertToFloat(outputBlob->buffer(), - outputBlob->buffer(), - outputBlob->getTensorDesc().getDims()[outputBlob->getTensorDesc().getDims().size() - 1], - outputBlob->getTensorDesc().getDims()[outputBlob->getTensorDesc().getDims().size() - 2], - outputDesc.scale_factor); + ConvertToFloat(output.buffer(), + output.buffer(), + output.dims()[0], + output.dims()[1], + output_scale_factor); #ifdef PLOT if (f) { - auto dims = outputBlob->getTensorDesc().getDims(); - for (int i = 0; i < dims[dims.size() - 2]; i++) { - for (int j = 0; j < dims[dims.size() - 1]; j++) { - fprintf(f, "%.2f ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + for (int i = 0; i < output.dims()[1]; i++) { + for (int j = 0; j < output.dims()[0]; j++) { + fprintf(f, "%.2f ", output.cbuffer().as()[output.dims()[0] * i + j]); } fprintf(f, "\n"); - } - fclose(f); } -#endif + fclose(f); } - output_idx++; +#endif } } -void GNAPlugin::Reset() { - graphCompiler.Reset(); -} void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) { - BlobMap bmInput; - BlobMap bmOutput; - if (inputsDataMap.size() != 1) { - THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << inputsDataMap.size() << " inputs"; - } + BlobMap result; + result["output"] = std::shared_ptr(&output, [](Blob*){}); + Wait(QueueInference(input, result)); +} - IE_ASSERT(!inputsDataMap.empty()); - bmInput[inputsDataMap.begin()->first] = std::shared_ptr(const_cast(&input), [](Blob*){}); - IE_ASSERT(!outputsDataMap.empty()); - bmOutput[outputsDataMap.begin()->first] = std::shared_ptr(&output, [](Blob*){}); - Infer(bmInput, bmOutput); +void GNAPlugin::Reset() { + for (auto && memLayer : memory_connection) { + std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size); + } + for (auto && concatLayer : concat_connection) { + std::memset(concatLayer.second.gna_ptr, 0, concatLayer.second.reserved_size); + } } -void GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) { - Wait(QueueInference(input, result)); +void GNAPlugin::Infer(const BlobMap &inputs, BlobMap &result) { + auto &input = *inputs.begin()->second.get(); + auto &output = *result.begin()->second.get(); + Infer(input, output); } -Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) { +Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) { // need to have intermediate blob for interleave conversion InferenceEngine::Blob::Ptr outputBlob; - auto outputDims = outputsDataMap[name]->getTensorDesc().getDims(); - outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, outputDims.size() == 2 ? 
NC : NCHW)); + outputBlob = make_blob_with_precision(precision, NC, outputDims); outputBlob->allocate(); return outputBlob; } -Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Precision precision) { +Blob::Ptr GNAPlugin::GetInputBlob(InferenceEngine::Precision precision) { InferenceEngine::Blob::Ptr inputBlob; // need to have intermediate blob for interleave conversion // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not - auto inputDims = inputsDataMap[name]->getTensorDesc().getDims(); - inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, inputDims.size() == 2 ? NC : NCHW)); + inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? NC : NCHW, inputDims); inputBlob->allocate(); return inputBlob; } std::vector GNAPlugin::QueryState() { - if (graphCompiler.memory_connection.empty()) { + if (memory_connection.empty()) { return {}; } - return {std::make_shared(shared_from_this())}; -} - -std::string GNAPlugin::GetName() const noexcept { - return _pluginName; -} - -void GNAPlugin::SetName(const std::string & pluginName) noexcept { - _pluginName = pluginName; + return {std::make_shared(shared_from_this())}; } InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::string &modelFileName) { @@ -1124,128 +1940,96 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str auto header = GNAModelSerial::ReadHeader(inputStream); - InitGNADevice(); + gnadevice.reset(new GNADeviceHelper(gna_proc_type, + gna_lib_async_threads_num, + gna_openmp_multithreading)); + gnamem.reset(new gna_memory_type(make_polymorph(*gnadevice.get()), PAGE_SIZE_BYTES)); - graphCompiler.setGNAMemoryPtr(gnamem); void *basePtr = nullptr; gnamem->reserve_ptr(&basePtr, header.gnaMemSize); gnamem->commit(); -#if GNA_LIB_VER == 2 - gnaModels.push_back(std::make_tuple(make_shared>(header.layersCount))); -#else - nnets.emplace_back(make_shared>(header.layersCount), -1, InferenceEngine::BlobMap()); + + nnets.push_back(std::make_tuple(make_shared>(header.layersCount), -1, InferenceEngine::BlobMap())); std::get<0>(nnets.back())->obj.nGroup = header.nGroup; -#endif GNAModelSerial::MemoryType mt; -#if GNA_LIB_VER == 2 - auto serial = GNAModelSerial(&std::get<0>(gnaModels.back())->obj, mt); -#else auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt); -#endif serial.Import(basePtr, header.gnaMemSize, inputStream); - inputsDesc->getPtrInputsGlobal("input").push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.input.descriptor_offset)); - // TODO: import of multioutput network not supported - outputsDesc.resize(1); - auto &outputDesc = outputsDesc.front(); - outputDesc.ptrs.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.output.descriptor_offset)); + ptr_inputs_global.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.input.descriptor_offset)); + ptr_outputs_global.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.output.descriptor_offset)); -#if GNA_LIB_VER == 2 - auto getOrientation = [](Gna2Operation & gnaOperation) { - return gnaOperation.Type == Gna2OperationTypeConvolution ? - kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; - }; -#else auto getOrientation = [](intel_nnet_layer_t & layer) { return layer.nLayerKind == INTEL_CONVOLUTIONAL ? 
kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; }; -#endif -#if GNA_LIB_VER == 2 - inputsDesc->orientation_in["input"] = getOrientation(std::get<0>(gnaModels.back())->obj.Operations[0]); - outputDesc.orientation = getOrientation(std::get<0>(gnaModels.back())->obj.Operations[std::get<0>(gnaModels.back())->obj.NumberOfOperations - 1]); -#else - inputsDesc->orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]); - outputDesc.orientation = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers - 1]); -#endif - outputDesc.num_bytes_per_element = header.output.element_size; + orientation_in = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]); + orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]); + + num_bytes_per_output = header.output.element_size; - auto outputDims = SizeVector({header.nGroup, header.output.elements_count / header.nGroup}); - auto inputDims = SizeVector({header.nGroup, header.input.elements_count / header.nGroup}); + + outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup}); + inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup}); inputsDataMap["input"] = std::make_shared(); inputsDataMap["input"]->setInputData(make_shared("input", - TensorDesc( - Precision::FP32, - inputDims, - Layout::NC))); + inputDims, + Precision::FP32, + Layout::NC)); outputsDataMap["output"] = make_shared("output", - TensorDesc( - Precision::FP32, - outputDims, - Layout::NC)); + outputDims, + Precision::FP32, + Layout::NC); - outputDesc.scale_factor = header.output.scaleFactor; - inputsDesc->inputScaleFactors.push_back(header.input.scaleFactor); + output_scale_factor = header.output.scaleFactor; + input_scale_factor = header.input.scaleFactor; num_rotate_rows = header.nRotateRows; num_rotate_columns = header.nRotateColumns; for (auto && memory : mt) { - GNAMemoryLayer memoryLayer(nullptr, nullptr, gnaFlags->sw_fp32 ? 4 : 2); + GNAMemoryLayer memoryLayer(nullptr, nullptr); memoryLayer.gna_ptr = memory.first; memoryLayer.reserved_size = memory.second; - graphCompiler.memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer)); + memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer)); } DumpXNNToFile(); #ifdef PLOT - dnn->WriteGraphWizModel("gna-blob-imported.dot"); -#endif -#if GNA_LIB_VER == 2 - createRequestConfigsForGnaModels(); + dnn.WriteGraphWizModel("graph.dot"); + // ExportGnaNetworkAndrzej("layers/loaded_from_aot_file", &nnet->obj); #endif + return nullptr; } void GNAPlugin::Export(const std::string &fileName) { - if (inputsDesc->ptr_inputs_global_id.empty() || outputsDesc.empty()) { + if (ptr_inputs_global.empty() || ptr_outputs_global.empty()) { THROW_GNA_EXCEPTION << " network not loaded"; } - if (inputsDesc->ptr_inputs_global_id.size() != 1) { - THROW_GNA_EXCEPTION << " exporting network with multiple inputs not supported"; - } - std::fstream outStream(fileName, ios_base::out | ios_base::binary); // TODO: nnet group parameter looks only used in application - so can we move this line into load network. 
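(Editor's note, not part of the patch: both the import path above and the export path below carry the input and output scale factors in the serialized model header, and Wait() uses the output scale factor in ConvertToFloat to map the int32 GNA scores back to float. A minimal sketch of that dequantization step follows; dequantize_scores is an illustrative name, not the plugin's ConvertToFloat.)

#include <cstddef>
#include <cstdint>

// Map quantized int32 scores back to float by dividing by the scale factor
// that was applied when the network was quantized.
static void dequantize_scores(float *dst, const int32_t *src, size_t count, float output_scale_factor) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = static_cast<float>(src[i]) / output_scale_factor;
    }
}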
- IE_ASSERT(!inputsDataMap.empty()); - auto inputDims = inputsDataMap.begin()->second->getTensorDesc().getDims(); if (inputDims.size() == 2) { -#if GNA_LIB_VER == 1 - std::get<0>(nnets.front())->obj.nGroup = inputDims[0]; -#endif + std::get<0>(nnets.front())->obj.nGroup = inputDims[1]; } -#if GNA_LIB_VER == 2 - auto serial = GNAModelSerial(&std::get<0>(gnaModels.front())->obj, -#else + auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj, -#endif - {inputsDesc->inputScaleFactors.front(), - inputsDesc->ptr_inputs_global_storage.front()[0], + {input_scale_factor, + ptr_inputs_global[0], 2, - static_cast(InferenceEngine::details::product(inputsDataMap.begin()->second->getTensorDesc().getDims()))}, - {outputsDesc.front().scale_factor, - outputsDesc.front().ptrs.front(), - outputsDesc.front().num_bytes_per_element, - static_cast(InferenceEngine::details::product(outputsDataMap.begin()->second->getTensorDesc().getDims()))}) - .SetInputRotation(dnn->num_rotate_rows, dnn->num_rotate_columns); - - for (auto && memoryConnection : graphCompiler.memory_connection) { + static_cast(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))}, + {output_scale_factor, + ptr_outputs_global[0], + num_bytes_per_output, + static_cast(InferenceEngine::details::product(outputsDataMap.begin()->second->getDims()))}) + .SetInputRotation(dnn.num_rotate_rows, dnn.num_rotate_columns); + + for (auto && memoryConnection : memory_connection) { serial.AddState(memoryConnection.second.gna_ptr, memoryConnection.second.reserved_size); } @@ -1253,46 +2037,238 @@ void GNAPlugin::Export(const std::string &fileName) { } void GNAPlugin::GetPerformanceCounts(std::map &perfMap) { - if (gnaFlags->performance_counting) { + if (performance_counting) { gnadevice->getGnaPerfCounters(perfMap); } } void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {} +void GNAPlugin::SetConfig(const std::map &config) {} + +intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) { + if (current->insData.empty()) return nullptr; -void GNAPlugin::SetConfig(const std::map &config_map) { - config.UpdateFromMap(config_map); - UpdateFieldsFromConfig(); + auto prev_layer = current->insData.front().lock()->creatorLayer.lock(); + + return findDnnLayer(prev_layer); } +void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, void *ptr_inputs, size_t num_data_bytes_out) { + gnalog() << "Connecting output " << layer->name << " ...\n"; + // in case of Memory Layer it's input allocated in meminput layer + if (layer->outData.size() == 1) { + for (auto &&outLayer : layer->outData.front()->getInputTo()) { + auto& nextLayer = outLayer.second; + auto nextMemoryLayerIt = + std::find_if(begin(memory_connection), end(memory_connection), + [&](MemoryConnection::value_type &comp) { + return comp.second.getOutput()->name + == nextLayer->name; + }); + if (nextMemoryLayerIt != memory_connection.end()) { + auto &nextMemoryLayer = nextMemoryLayerIt->second; + // memory layer not yet initialized + if (nextMemoryLayer.reserved_size == 0) { + gnamem->reserve_ptr(&nextMemoryLayer.gna_ptr, ALIGN64(num_data_bytes_out)); + gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0); + + nextMemoryLayer.reserved_offset = 0; + nextMemoryLayer.reserved_size = ALIGN64(num_data_bytes_out); + } else { + IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out)); + // same offsets + gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, nextMemoryLayer.reserved_offset); + } + return; + } + } + + 
// if one of next layers is concat... + for (auto &&outLayer : layer->outData.front()->getInputTo()) { + auto nextLayer = outLayer.second; + if ( LayerInfo(nextLayer).isConcat() ) { + auto& name = layer->name; + // we look for this concat layer pointer in extra concat map + auto concatLayerInfo = concat_connection.find( + nextLayer->name); + + if (concatLayerInfo != concat_connection.end()) { + auto &concatLayerInfoItem = concatLayerInfo->second; + + // find this input in vector sum all outputs in primitive + auto it = std::find_if(concatLayerInfoItem.concatInputLayers.begin(), + concatLayerInfoItem.concatInputLayers.end(), + [&name](GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) { + return item.name == name; + }); + // reserve full size for concat + if (!concatLayerInfoItem.output_allocation_flag) { + // check if this concat is being included by other one + // by going thru each concat and checking inputs + auto included = + std::find_if(concat_connection.begin(), + concat_connection.end(), + [&concatLayerInfo] + (const std::pair &concatItem) -> bool { + auto it = std::find_if(concatItem.second.concatInputLayers.begin(), + concatItem.second.concatInputLayers.end(), + [&concatLayerInfo] + (const GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) -> bool { + return item.name == concatLayerInfo->first; + }); + return it != concatItem.second.concatInputLayers.end(); + }); + if (included == concat_connection.end()) { + gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size)); + } + concatLayerInfo->second.output_allocation_flag = true; + } + gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, it->offset); + } else { + // error + } + return; + } + } + } -void GNAPlugin::UpdateFieldsFromConfig() { - inputsDesc->inputScaleFactors = config.inputScaleFactors; - *gnaFlags = config.gnaFlags; + intel_dnn_component_t * unused_input = nullptr; + if (compact_mode) { + unused_input = find_first_unused_input(layer); + if (unused_input != nullptr) { + gnamem->bind_ptr(ptr, &unused_input->ptr_inputs, 0, ALIGN64(num_data_bytes_out)); + } + } + // cannot reuse suitable input + if (unused_input == nullptr) { + gnamem->reserve_ptr(ptr, ALIGN64(num_data_bytes_out)); + } } -void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network, - const std::map& config, - InferenceEngine::QueryNetworkResult& res) const { - std::unordered_set allLayers; - InferenceEngine::InputsDataMap inputs; +intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr __layer) { + auto component = std::find_if(begin(dnnComponentsForLayer), + end(dnnComponentsForLayer), + [&](DnnComponentsForLayer::value_type &comp) { + return comp.first == __layer->name; + }); + // check for generic prev layer + if (component != dnnComponentsForLayer.end()) { + return &component->second; + } - network.getInputsInfo(inputs); - std::vector sortedLayers = CNNNetSortTopologically(network); + return nullptr; +} - if (inputs.empty()) { - THROW_GNA_EXCEPTION << "Network is empty (GNA)\n"; +GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, size_t offset, int idx) { + // selecting particular input layers + auto prevLayer = CNNNetPrevLayer(layer, idx); + + gnalog() << "Connecting input " << layer->name << " to " << prevLayer->name << " ...\n"; + + // real input not a memory input + if (LayerInfo(prevLayer).isInput()) { + if (0 == bytes_alllocated_for_input) { + gnamem->push_value(&ptr_inputs_global.front(), static_cast(0), 
num_data_bytes_in, 64); + bytes_alllocated_for_input = num_data_bytes_in; + } + if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input, 64)) { + THROW_IE_EXCEPTION << "Layer: " << layer->name << " Cannot bind pointer to already allocated input, due to size_allocated=" + << bytes_alllocated_for_input << ", and size_requested=" << num_data_bytes_in; + } + gnamem->bind_ptr(ptr, &ptr_inputs_global.front(), offset); + return prevLayer; } - auto const & secondLayers = inputs.begin()->second->getInputData()->getInputTo(); - if (secondLayers.empty()) { - THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n"; + LayerInfo layerInfoObj(prevLayer); + LayerInfo thisLayerInfoObj(layer); + // connecting to split/slice splitiing layers + if (layerInfoObj.isSplit() || layerInfoObj.isSlice()) { + auto& splittingLayer = prevLayer; + auto& splitName = splittingLayer->name; + auto& name = layer->name; + + // we look for this concat layer pointer in extra concat map + auto splitLayerInfo = split_connection.find(splitName); + + if (splitLayerInfo != split_connection.end()) { + auto &splitLayerInfoItem = splitLayerInfo->second; + // find this input in vector sum all outputs in primitive + auto it = std::find_if(splitLayerInfoItem.splitOutputLayers.begin(), + splitLayerInfoItem.splitOutputLayers.end(), + [&name](GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo &item) { + return item.name == name; + }); + + if (it != splitLayerInfoItem.splitOutputLayers.end()) { + gnalog() << "Connecting split/slice input \n"; + auto res = connectInput(splittingLayer, ptr, + splitLayerInfoItem.reserved_size, it->offset, 0); + gnalog() << "Connected \n"; + return res; + } + } + THROW_GNA_EXCEPTION << "Split/Slice layer: " << splitName + << " is not included in extra map. 
Something wrong happened"; + } else if (layerInfoObj.isConcat()) { + auto concatLayerInfo = concat_connection.find( + prevLayer->name); + if (concatLayerInfo != concat_connection.end()) { + auto & concatLayerInfoItem = concatLayerInfo->second; + // dnnLayer that is input for concat output layer + gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset); + // return layer over concat + return CNNNetPrevLayer(prevLayer); + } + } else if (layerInfoObj.isCrop()) { + auto cropLayerInfo = crop_connection.find( + prevLayer->name); + if (cropLayerInfo != crop_connection.end()) { + auto & cropLayerInfoItem = cropLayerInfo->second; + gnamem->bind_ptr(ptr, &cropLayerInfoItem.gna_ptr, offset); + return CNNNetPrevLayer(prevLayer); + } } + auto prevDnnLayer = findDnnLayer(prevLayer); - InferenceEngine::details::UnorderedDFS(allLayers, - secondLayers.begin()->second, - [&](CNNLayerPtr const& layer) { - if (LayerTypeFromStr(layer->type) != LayerType::NO_TYPE) { - res.supportedLayersMap.insert({ layer->name, GetName() }); - } - }, false); + // check for generic prev layer + if (prevDnnLayer != nullptr) { + gnamem->bind_ptr(ptr, &prevDnnLayer->ptr_outputs, offset); + return prevLayer; + } + + auto prevMemoryLayer = + std::find_if(begin(memory_connection), end(memory_connection), [&](MemoryConnection::value_type &comp) { + return comp.second.getInput()->name == prevLayer->name; + }); + if (prevMemoryLayer != memory_connection.end()) { + // dnnLayer that is input for memory output layer + auto& memoryLayer = prevMemoryLayer->second; + if (memoryLayer.reserved_size == 0) { + gnamem->reserve_ptr(&memoryLayer.gna_ptr, ALIGN64(num_data_bytes_in)); + gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset); + + memoryLayer.reserved_offset = offset; + memoryLayer.reserved_size = ALIGN64(num_data_bytes_in); + } else { + IE_ASSERT(memoryLayer.reserved_size == ALIGN64(num_data_bytes_in)); + // same offsets + gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, memoryLayer.reserved_offset); + } + + return prevLayer; } + + // several layers are to be skipped right now + if (LayerInfo(prevLayer).isReshape()) { + gnalog() << "Skipping reshape layer: " << prevLayer->name << "\n"; + return connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0); + } + + if (LayerInfo(prevLayer).isPermute()) { + gnalog() << "Skipping permute layer: " << prevLayer->name << "\n"; + return {connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0).input, true, prevLayer}; + } + + + THROW_GNA_EXCEPTION << "Cannot connect input for: " << layer->name; +} + diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp index 64a47467431288..53365d7a659e71 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin.hpp @@ -1,153 +1,170 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once +#include "cpp_interfaces/base/ie_plugin_base.hpp" +#include "dnn.h" +#include "gna_memory.hpp" +#include "gna_device.hpp" #include -#include #include #include #include #include #include #include +#include +#include #include -#include -#include "descriptions/gna_flags.hpp" -#include "descriptions/gna_input_desc.hpp" -#include "descriptions/gna_output_desc.hpp" -#include "backend/am_intel_dnn.hpp" -#include "gna_data_types.hpp" -#include "gna_graph_compiler.hpp" -#include "gna_plugin_policy.hpp" -#include "gna_plugin_log.hpp" -#include "gna_plugin_config.hpp" - -#if GNA_LIB_VER == 2 
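(Editor's note, not part of the patch: the connectInput / connectOutput logic above never allocates memory directly; it queues reserve_ptr and bind_ptr requests against the GNAMemory arena declared in this header, and addresses only become real when gnamem->commit() runs in LoadNetwork. A minimal sketch of that two-phase scheme is below; TinyArena and MemRequest are hypothetical names, not the plugin's GNAMemory.)

#include <cstddef>
#include <cstdint>
#include <vector>

struct MemRequest {
    void **where;   // location that will receive the final address
    size_t size;    // bytes requested
    size_t offset;  // offset inside the future arena
};

class TinyArena {
    std::vector<MemRequest> requests_;
    std::vector<uint8_t> arena_;
    size_t total_ = 0;

 public:
    // phase 1: only record the request, no memory exists yet
    void reserve_ptr(void **where, size_t size) {
        requests_.push_back({where, size, total_});
        total_ += size;
    }
    // phase 2: allocate the flat region once and patch every recorded pointer
    void commit() {
        arena_.resize(total_);
        for (auto &r : requests_) {
            if (r.where != nullptr) {
                *r.where = arena_.data() + r.offset;
            }
        }
    }
    size_t getTotalBytes() const { return total_; }
};

(The real gnamem additionally supports bind_ptr, which lets a pointer alias an already reserved region at an offset; that is how concat inputs and memory layers in the code above end up sharing storage.)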
-#include -#endif +#include +#include +#include +#include "gna_allocator.hpp" +#include "gna_api_wrapper.hpp" namespace GNAPluginNS { -class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this { - protected: - std::string _pluginName = "GNA"; - Config config; - std::shared_ptr dnn; - std::shared_ptr gnaFlags; - std::shared_ptr gnamem; - std::shared_ptr inputsDesc; +void ConvertToInt16(int16_t *ptr_dst, + const float *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); +void ConvertToFloat(float *ptr_dst, + int32_t *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); + +int16_t ConvertFloatToInt16(float src); - GNAPluginNS::GNAGraphCompiler graphCompiler; +class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this { + protected: + AmIntelDnn dnn; + using dnn_ptr = std::shared_ptr>; /** * @brief - copy of nnet structure and indicator that related infer request not yet synced */ -#if GNA_LIB_VER == 1 std::vector> nnets; -#else - static constexpr uint32_t FAKE_REQUEST_CONFIG_ID = 0xffffffff; - std::vector> gnaModels; - std::vector> gnaRequestConfigToRequestIdMap; -#endif - -#if GNA_LIB_VER == 2 - uint32_t activeLayerIndex = 0xffffffff; -#endif + + intel_dnn_orientation_t orientation_in = kDnnUnknownOrientation; + intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation; + double input_scale_factor = 1.0; + double output_scale_factor = 1.0; uint32_t num_rotate_rows = 0; uint32_t num_rotate_columns = 0; - uint32_t *ptr_active_indices = nullptr; - uint32_t num_active_indices = 0; - uint32_t num_group_in = 0; - uint32_t dnn_dump_write_index = 0; - - // index matches iterating order of cnnnetwork outputs info - std::vector outputsDesc = std::vector(); - - intel_dnn_number_type_t output_type = kDnnInt; - - GNAPluginNS::Policy policy; -#if GNA_LIB_VER == 2 - void createRequestConfigsForGnaModels(); -#endif - static int GetDeviceVersionFromString(const std::string deviceString); + uint32_t num_feature_maps = 1; + uint32_t num_memory_bytes; - std::shared_ptr gnadevice; - /** - * @brief size of RW segment without extra memory for parallel execution - */ - uint32_t rwSegmentSize = 0; + std::vector ptr_inputs_global; + std::vector ptr_outputs_global; - InferenceEngine::InputsDataMap inputsDataMap; - InferenceEngine::OutputsDataMap outputsDataMap; + int16_t *ptr_int_inputs = NULL; + int32_t *ptr_int_outputs = NULL; + uint32_t *ptr_active_indices = NULL; + uint32_t num_active_indices = 0; + uint32_t num_group_in = 0; + uint32_t num_bytes_weight; + uint32_t num_bytes_per_output = 0; + + bool use_dynamic_quantization = false; + bool compact_mode = true; + bool exclusive_async_requests = false; + bool uniformPwlDesign = false; + uint8_t gna_lib_async_threads_num = 1; + bool gna_openmp_multithreading = false; + // precision of GNA hardware model + InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; + + bool performance_counting = false; + int bytes_alllocated_for_input = 0; + intel_dnn_number_type_t output_type = kDnnInt; + std::string utterance_name; + + // internal types + enum LayerType { + Input, + Convolution, + ReLU, + LeakyReLU, + Sigmoid, + TanH, + Activation, + Pooling, + FullyConnected, + InnerProduct, + Reshape, + Split, + Slice, + Eltwise, + ScaleShift, + Clamp, + Concat, + Copy, + Permute, + Memory, + Power, + Crop, + NO_TYPE + }; public: explicit GNAPlugin(const std::map& configMap); /** * 
@brief construct from aot rather then from cnn network */ - GNAPlugin(); - - std::string GetName() const noexcept override; - void SetName(const std::string & pluginName) noexcept override; + GNAPlugin() = default; - void LoadNetwork(InferenceEngine::ICNNNetwork &network); + void LoadNetwork(InferenceEngine::ICNNNetwork &network) override; + using InferenceEngine::IInferencePluginInternal::Infer; - void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result); - void GetPerformanceCounts(std::map &perfMap); + void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override; + void GetPerformanceCounts(std::map &perfMap) override; void AddExtension(InferenceEngine::IExtensionPtr extension) override; - void SetConfig(const std::map &config) override; void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork, - const InferenceEngine::ICNNNetwork &network, - const std::map &config_map) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::ExecutableNetwork LoadNetwork(const InferenceEngine::ICNNNetwork &network, - const std::map &config_map, - InferenceEngine::RemoteContext::Ptr context) override { THROW_GNA_EXCEPTION << "Not implemented"; } - void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result); - void SetCore(InferenceEngine::ICore*) noexcept override {} - InferenceEngine::ICore* GetCore() const noexcept override {return nullptr;} + InferenceEngine::ICNNNetwork &network, + const std::map &config) override { THROW_GNA_EXCEPTION << "Not implemented"; } + void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override; + void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}; void Reset(); + /** + * @deprecated Use the version with config parameter + */ + void QueryNetwork(const InferenceEngine::ICNNNetwork &network, + InferenceEngine::QueryNetworkResult &res) const override { } void QueryNetwork(const InferenceEngine::ICNNNetwork &network, const std::map& config, - InferenceEngine::QueryNetworkResult &res) const override; + InferenceEngine::QueryNetworkResult &res) const override { } uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result); void Wait(uint32_t idx = 0); - InferenceEngine::Parameter GetConfig(const std::string& name, - const std::map & options) const override; - InferenceEngine::Parameter GetMetric(const std::string& name, - const std::map & options) const override; - InferenceEngine::RemoteContext::Ptr CreateContext(const InferenceEngine::ParamMap& params) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::RemoteContext::Ptr GetDefaultContext() override { THROW_GNA_EXCEPTION << "Not implemented"; } - - void Wait(uint32_t sync, InferenceEngine::Blob &result) { THROW_GNA_EXCEPTION << "Not implemented"; } + uint32_t QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result); + /** + * + * @param sync - points to gna sync point + * @param idx - points to + * @param result + */ + void Wait(uint32_t sync, InferenceEngine::Blob &result); void Export(const std::string &fileName); - InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName, - const std::map &config) override { - THROW_GNA_EXCEPTION << "Not implemented"; - } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, - const InferenceEngine::RemoteContext::Ptr& context, - const std::map &config) override { - THROW_GNA_EXCEPTION << 
"Not implemented"; - } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, - const std::map &config) override { - THROW_GNA_EXCEPTION << "Not implemented"; - } - + InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName + , const std::map &config) override { THROW_GNA_EXCEPTION << "Not implemented"; } InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName); + + bool IsExclusiveAsyncRequests() { return exclusive_async_requests; } + /** * utility to provide input and output blobs externally to be used by InferenceEngine request API clients */ - InferenceEngine::Blob::Ptr GetInputBlob(const std::string& name, InferenceEngine::Precision precision); - InferenceEngine::Blob::Ptr GetOutputBlob(const std::string& name, InferenceEngine::Precision precision); + InferenceEngine::Blob::Ptr GetInputBlob(InferenceEngine::Precision precision); + InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision); /** * helpers to provide inputs info on AOT network */ @@ -159,28 +176,223 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: */ std::vector QueryState(); - /** - * test-wise API - */ - void SetPolicy(GNAPluginNS::Policy p) {policy = p;} + protected: + uint32_t num_cnn_rows_out = 0; + bool done = false; + std::string dumpXNNPath; + intel_gna_proc_t gna_proc_type = static_cast(GNA_SOFTWARE & GNA_HARDWARE); - /** - * QueryMetrics API - */ + void DumpXNNToFile() const; + void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr); + void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false); + void DiagonalPrimitive(InferenceEngine::CNNLayerPtr); + void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr); + void PermutePrimitive(InferenceEngine::CNNLayerPtr); + void PoolingPrimitive(InferenceEngine::CNNLayerPtr); + void PowerPrimitive(InferenceEngine::CNNLayerPtr); + void ConcatPrimitive(InferenceEngine::CNNLayerPtr); + void CropPrimitive(InferenceEngine::CNNLayerPtr); + void EltwisePrimitive(InferenceEngine::CNNLayerPtr); + void SplitPrimitive(InferenceEngine::CNNLayerPtr); + void SlicePrimitive(InferenceEngine::CNNLayerPtr); + void PWLPrimitive(InferenceEngine::CNNLayerPtr); + void CopyPrimitive(InferenceEngine::CNNLayerPtr); + bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage); + LayerType LayerTypeFromStr(std::string const &str); + /** + * maps tpe of connection to input and output layers also stores gna_pointer for alloc request + */ + class GNAMemoryLayer { + InferenceEngine::CNNLayerPtr inputLayer; + InferenceEngine::CNNLayerPtr outputLayer; + public: + GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) : + inputLayer(inLayer), outputLayer(outLayer) { + } + + InferenceEngine::CNNLayerPtr getInput() { return inputLayer; } + InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; } + + /** + * pointer to gna memory request + */ + void *gna_ptr = nullptr; + /** + * gna memory of this size is reserved + */ + size_t reserved_size = 0; + /** + * gna memory of this offset from gna_ptr + */ + size_t reserved_offset = 0; + }; + + class GNAConcatLayer { + InferenceEngine::CNNLayerPtr concatLayer; + + public: + explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) : + concatLayer(layer) + {} + + InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; } + /** + * pointer to gna memory request + */ + void *gna_ptr = nullptr; + /** + * gna memory of this size is 
reserved for concat + */ + size_t reserved_size = 0; + bool output_allocation_flag = false; + /** + * gna memory of this offset from gna_ptr + */ + struct ConcatConnectedLayerInfo { + ConcatConnectedLayerInfo(const std::string& n, + size_t o) : + name(n), + offset(o) {} + std::string name = ""; + size_t offset = 0; + }; + + std::vector concatInputLayers; + }; + + // Split, Slice + class GNASplitLayer { + InferenceEngine::CNNLayerPtr splitLayer; + + public: + explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) : + splitLayer(layer), + splitInputLayer() + {} + + InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; } + /** + * gna memory of this size is reserved for concat + */ + size_t reserved_size = 0; + bool output_allocation_flag = false; + /** + * gna memory of this offset from gna_ptr + */ + struct SplitConnectedLayerInfo { + SplitConnectedLayerInfo() {} + SplitConnectedLayerInfo(std::string& n, + size_t o, + size_t p) : + name(n), + offset(o), + pure_size(p) {} + + SplitConnectedLayerInfo& operator= + (SplitConnectedLayerInfo const& layerInfo) { + this->name = layerInfo.name; + this->offset = layerInfo.offset; + this->pure_size = layerInfo.pure_size; + return *this; + } + std::string name = ""; + size_t offset = 0; + size_t pure_size = 0; + }; + SplitConnectedLayerInfo splitInputLayer; + std::vector splitOutputLayers; + }; + + class GNACropLayer { + InferenceEngine::CNNLayerPtr cropLayer; + + public: + explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) : + cropLayer(layer) + {} + + InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; } + /** + * pointer to gna croped memory beginning + */ + void *gna_ptr = nullptr; + }; + using MemoryConnection = std::list>; + using ConcatConnection = std::map; + using SplitConnection = std::map; + using CropConnection = std::map; + // layers with extra storage for connections and additional + // non trivial processing + MemoryConnection memory_connection; + ConcatConnection concat_connection; + SplitConnection split_connection; + CropConnection crop_connection; + void fillMemoryConnections(std::map> &memoryPairs); + + void fillConcatConnections(InferenceEngine::CNNLayerPtr layer); + void fillSplitConnections(InferenceEngine::CNNLayerPtr layer); + /** + * maps layer name to dnn.component, in topological sort prev nodes will be initialized + */ + using DnnComponentsForLayer = std::list>; + std::list> dnnComponentsForLayer; - InferenceEngine::Parameter GetAvailableDevices() const; + /** + * @brief returns corresponding dnn layer for topology layer + * @param __layer + * @return + */ + intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer); - protected: - void Init(); + using allocator_type = PolymorphAllocator; + using gna_memory_type = GNAMemory; - void InitGNADevice(); + std::unique_ptr gnadevice; + /** + * @brief size of RW segment without extra memory for parallel execution + */ + uint32_t rwSegmentSize = 0; + std::unique_ptr gnamem; - void DumpXNNToFile() const; + /** + * Connects either memory output, or generic output to a layer + * @param layer - layer pointer + * @param ptr - pointer to pointer where to store output layer information + * @param sz - sizeof output blob + * @param ptr_inputs - sizeof output blob + */ + void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz); + /** + * Connects certain input to this layer + * @param layer - layer that we connect input to + * @param pVoid - pointer that holds current layer pointer in gna_mem request + * 
@param num_data_bytes_in - size + * @param offset - num bytes to advance in buffer + * @param idx - index of input port that we are connecting + * @return layer used as input + */ + struct ConnectionDetails { + InferenceEngine::CNNLayerPtr input; + bool needTransposeWeights = false; + InferenceEngine::CNNLayerPtr permute; + ConnectionDetails(InferenceEngine::CNNLayerPtr input, + bool bTranspose = false, + InferenceEngine::CNNLayerPtr permute = nullptr) + : input(input) + , needTransposeWeights(bTranspose) + , permute(permute) { + } + }; + ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer, + void *pVoid, + size_t num_data_bytes_in, + size_t offset = 0, + int idx = 0); void ImportFrames(void *ptr_dst, const void *ptr_src, InferenceEngine::Precision input_precision, - float scaleFactor, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, @@ -188,7 +400,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_vector_stride); void ExportScores(void *ptr_dst, - const void *ptr_src, + void *ptr_src, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, @@ -198,6 +410,19 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_bytes_per_element_input, uint32_t num_bytes_per_element); + friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst, + const float *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); + friend void GNAPluginNS::ConvertToFloat(float *ptr_dst, + int32_t *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); + + friend int16_t GNAPluginNS::ConvertFloatToInt16(float src); + template void copyInputData(T *dst, const U *src, @@ -205,17 +430,59 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride, - intel_dnn_orientation_t orientation, - float scaleFactor); + intel_dnn_orientation_t orientation); template void copyInputDataWithSplit(T *const dst, const U *src, const GNASplitLayer& splitInfo, - size_t precision_size, - int idx = 0); + size_t precision_size); + /** + * @brief GNA affine layers are always have activation atatched, while IR not + * @param net - copied net ready for quantisation + */ + void insertIdentityLayer(std::vector &layers); - void UpdateFieldsFromConfig(); -}; + /** + * @brief GNA convolution layers have deinterleaved oriantations, while affine one doesn't + * so between convolution and affine layers permute layers need to be inserted, + * or removed if they are present in topology + * @param layers + */ + void applyOrientations(std::vector &layers); + + /** + * brief @search for specific patter in the graph (6 layers are replaced by single one) + * @param layers + */ + void substitutePRelu(std::vector &layers); + + std::vector getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer); + + /** + * diagonal layer insertion required in cases where activation followed by split layers, or any other + * topology changing layers + */ + void insertDiagonalLayer(std::vector & layers); + + /** + * @brief MaxPool can be reordered with activation, on GNA there is a strategy to have conv->maxpool->activation + * it means maxpool receives 4 bytes, and produces 4 bytes + */ + void reorderMaxPool(std::vector & layers); + + /** + * copy layer insertion required in cases where input layer does not have output memory + */ + 
void insertCopyLayer(std::vector & layers); + + intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current); + + InferenceEngine::SizeVector inputDims; + InferenceEngine::InputsDataMap inputsDataMap; + + InferenceEngine::SizeVector outputDims; + InferenceEngine::OutputsDataMap outputsDataMap; +}; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp index 4bc24bd5c465f3..f82e4434e31dc5 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.hpp @@ -1,48 +1,67 @@ -// Copyright (C) 2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once - -#if GNA_LIB_VER == 1 -#include -#else -#include -#include -#endif -#include "ie_precision.hpp" -#include "descriptions/gna_flags.hpp" #include -#include +#include +#include +#include +#include "ie_common.h" +#include "gna_plugin_log.hpp" namespace GNAPluginNS { -struct Config { - Config() { - AdjustKeyMapValues(); +using CNNNetworkPtr = std::shared_ptr; + +struct Endpoint { + InferenceEngine::TargetDevice device; + InferenceEngine::Precision networkPrec; + std::function convert; + + Endpoint(InferenceEngine::TargetDevice device, + InferenceEngine::Precision networkPrec, + std::function converter = [](InferenceEngine::ICNNNetwork &network) { + return CNNNetworkPtr(&network, [](InferenceEngine::ICNNNetwork *nodelete) {}); + }) : device(device), networkPrec(networkPrec), convert(converter) { + } +}; + +class Config { + public: + using Desc = std::vector; + Desc supported; + InferenceEngine::TargetDevice _defaultDevice = InferenceEngine::TargetDevice::eDefault; + + public: + explicit Config(std::vector &&config) + : supported(std::move(config)) { } - void UpdateFromMap(const std::map& configMap); - void AdjustKeyMapValues(); - std::string GetParameter(const std::string& name) const; - std::vector GetSupportedKeys() const; - // precision of GNA hardware model - InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; + /** + * @brief default device value is plugin dependent, so it should be also set, to allow fallback + */ + void setDefaultDevice(InferenceEngine::TargetDevice d) { + _defaultDevice = d; + } - std::string dumpXNNPath; - std::string dumpXNNGeneration; + inline Endpoint find_configuration(InferenceEngine::ICNNNetwork &network) { + auto device = network.getTargetDevice(); + auto targetDevice = device == InferenceEngine::TargetDevice::eDefault ? 
_defaultDevice : device; -#if GNA_LIB_VER == 1 - intel_gna_proc_t gna_proc_type = static_cast(GNA_SOFTWARE & GNA_HARDWARE); -#else - Gna2AccelerationMode pluginGna2AccMode = Gna2AccelerationModeSoftware; - Gna2DeviceVersion pluginGna2DeviceConsistent = Gna2DeviceVersion1_0; -#endif + auto res = std::find_if(std::begin(supported), std::end(supported), [&](Endpoint &e) { + return e.networkPrec == network.getPrecision() && ( + e.device == device || + e.device == targetDevice); + }); - std::vector inputScaleFactors; - GNAFlags gnaFlags; + if (res == std::end(supported)) { + THROW_GNA_EXCEPTION << "\"The plugin doesn't support target device: " + << InferenceEngine::TargetDeviceInfo::name(network.getTargetDevice()) + << ".\nSupported target device: " << InferenceEngine::TargetDeviceInfo::name(InferenceEngine::TargetDevice::eGNA); + } - std::map key_config_map; + return *res; + } }; - } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp index e6c4cc3ad9e2ba..d2312741f3f073 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,21 +11,9 @@ using namespace InferenceEngine; using namespace std; using namespace GNAPluginNS; -IE_SUPPRESS_DEPRECATED_START - -static const Version gnaPluginDescription = { - {2, 1}, - CI_BUILD_NUMBER -#if GNA_LIB_VER == 2 - "_with_GNA_LIB_VER==2" -#endif - , - "GNAPlugin" -}; - INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept { try { - plugin = make_ie_compatible_plugin(gnaPluginDescription, make_shared()); + plugin = make_ie_compatible_plugin({1, 5, "GNAPlugin", "GNAPlugin"}, make_shared()); return OK; } catch (std::exception &ex) { diff --git a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp index 0f9ec354f8374d..3c2dcf02ab825d 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,71 +10,19 @@ #include #include #include "gna_executable_network.hpp" -#include "gna_plugin_config.hpp" namespace GNAPluginNS { class GNAPluginInternal : public InferenceEngine::InferencePluginInternal { -private: - Config defaultConfig; - std::weak_ptr plgPtr; - std::shared_ptr GetCurrentPlugin() const { - auto ptr = plgPtr.lock(); - if (ptr == nullptr) { - return std::make_shared(); - } else { - return ptr; - } - } - -public: - InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl( - const InferenceEngine::ICNNNetwork &network, - const std::map &config) override { - Config updated_config(defaultConfig); - updated_config.UpdateFromMap(config); - auto plg = std::make_shared(updated_config.key_config_map); - plgPtr = plg; - return std::make_shared(*cloneNet(network), plg); - } - - void SetConfig(const std::map &config) override { - defaultConfig.UpdateFromMap(config); - } - - InferenceEngine::IExecutableNetwork::Ptr ImportNetwork( - const std::string &modelFileName, - const std::map &config) override { - Config updated_config(defaultConfig); - updated_config.UpdateFromMap(config); - auto plg = 
std::make_shared(updated_config.key_config_map); - plgPtr = plg; - return make_executable_network(std::make_shared(modelFileName, plg)); - } - - using InferenceEngine::InferencePluginInternal::ImportNetwork; - - std::string GetName() const noexcept override { - return GetCurrentPlugin()->GetName(); - } - - void QueryNetwork(const InferenceEngine::ICNNNetwork& network, - const std::map& config, - InferenceEngine::QueryNetworkResult& res) const override { - auto plg = GetCurrentPlugin(); - try { - plg->SetConfig(config); - } catch (InferenceEngine::details::InferenceEngineException) {} - plg->QueryNetwork(network, config, res); - } - - InferenceEngine::Parameter GetMetric(const std::string& name, - const std::map & options) const override { - return GetCurrentPlugin()->GetMetric(name, options); - } - - InferenceEngine::Parameter GetConfig(const std::string& name, const std::map & options) const override { - return defaultConfig.GetParameter(name); + public: + InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, + const std::map &config) override { + return std::make_shared(network, config); + } + void SetConfig(const std::map &config) override {} + InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName, + const std::map &config) override { + return make_executable_network(std::make_shared(modelFileName, config)); } }; diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp index b3d5dc249ed9e6..08f45ad78dac27 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_log.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_log.hpp @@ -1,15 +1,13 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include #include
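// Editorial annotation, not part of the original patch: the GNA_DEBUG switch just below
// chooses between real console streams and the no-op GnaLog sink defined further down,
// so gnalog()/gnawarn() tracing compiles away unless debugging is enabled. Hedged usage
// sketch, mirroring how the graph passes in gna_plugin_passes.cpp call it:
//
//   gnalog() << "Inserted " << activationName << " between: " << prev->name << " and " << l->name << "\n";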
// #define GNA_DEBUG -#ifdef GNA_DEBUG -#include +#ifdef GNA_DEBUG /** * @brief used for creating graphviz charts, and layers dump */ @@ -18,18 +16,6 @@ # define gnawarn() std::cerr #else -#ifdef VERBOSE -#define VERBOSE_LEVEL (1) -#else -#define VERBOSE_LEVEL (0) -#endif - -#ifdef PLOT -#define PLOT_LEVEL (1) -#else -#define PLOT_LEVEL (0) -#endif - class GnaLog { public : template @@ -50,27 +36,19 @@ inline GnaLog & gnawarn() { return gnalog(); } -#endif - /** * @brief gna_plugin exception unification */ #ifdef __PRETTY_FUNCTION__ #undef __PRETTY_FUNCTION__ #endif -#ifdef _WIN32 +#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) # define __PRETTY_FUNCTION__ __FUNCSIG__ #else # define __PRETTY_FUNCTION__ __FUNCTION__ #endif +#endif -#define GNA_LAYER_ASSERT(layer, expr)\ -if (!(expr)) { \ - THROW_GNA_LAYER_EXCEPTION(layer) << ": " << #expr; \ -} #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": " -#define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer) -#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" " - diff --git a/inference-engine/src/gna_plugin/gna_plugin_passes.cpp b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp new file mode 100644 index 00000000000000..79d42d24036be9 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp @@ -0,0 +1,338 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include "gna_plugin.hpp" +#include "gna_layer_info.hpp" + + +using namespace InferenceEngine; +using namespace std; +using namespace GNAPluginNS; + +void GNAPlugin::insertDiagonalLayer(std::vector & layers) { + int numOfDiagLayers = 0; + for (auto & l : layers) { + if (l->insData.empty()) continue; + auto prevLayer = CNNNetPrevLayer(l); + if (LayerInfo(l).isActivation()) { + if (LayerInfo(prevLayer).has32BOutput()) + continue; + } else { + auto eltwise = dynamic_cast(l.get()); + if (!eltwise) { + continue; + } + // in case of eltwise sum one of input would be 4 bytes one - 2 + // in case of eltwise mull one of input would be 2 bytes one - 2 + // for e sum if we have 4-4 inputs we will handle that by inserting identity activation + // for e sum if we have 4-2 - OK + // for e sum if we have 2-2 inputs we need to insert diagonal -- handling here + // for e mul if we have 2-2 - OK + // for e mul if we have 2-4 - inputs we need to insert identity to put 4 bytes input into weights + // for e mul if we have 4-4 - inputs we need to insert 2 identities to put both 4 bytes input into weights + + if (eltwise->_operation != EltwiseLayer::Sum) + continue; + + auto prevLayer1 = CNNNetPrevLayer(l, 1); + if (!LayerInfo(prevLayer).has16BOutput() || !LayerInfo(prevLayer1).has16BOutput()) + continue; + } + +#ifdef PLOT + std::cout << "Inserted Diagonal Layer between: " << prevLayer->name << " and " << l->name << "\n" << std::flush; +#endif + // actual insertion + auto diagName = std::string("SyntheticScaleShift_") + std::to_string(numOfDiagLayers++); + auto diagLayer = make_shared(LayerParams({diagName, "ScaleShift", Precision::FP32})); + + // TODO: diagonal size + std::vector arrayOf1(l->outData[0]->dims[0], 1.f); + diagLayer->_weights = make_shared_blob(l->outData[0]->precision, Layout::C, arrayOf1);; + auto newDims = l->outData[0]->dims; + auto dataPtr = std::make_shared(diagName, + newDims, + l->outData[0]->precision, + l->outData[0]->layout); + + auto diagonalWithQuant = 
InferenceEngine::injectData(diagLayer); + + dataPtr->creatorLayer = diagonalWithQuant; + diagonalWithQuant->outData.push_back(dataPtr); + CNNNetworkInsertLayer(prevLayer, l, diagonalWithQuant); + } +} + +void GNAPlugin::reorderMaxPool(std::vector & layers) { + // detecting following pattern + // conv->relu->maxpooling + // changing it to conv->mxpooling->relu + for (auto & l : layers) { + auto pool = LayerInfo(l); + if (!pool.isMaxPooling()) continue; + + // checking prev layer type + auto activation = LayerInfo(CNNNetPrevLayer(l)); + if (!activation.isActivation()) continue; + + // if activation came from convolution + auto convolution = LayerInfo(CNNNetPrevLayer(static_cast(activation))); + if (!convolution.isConvolution()) continue; + + gnalog() << "MaxPooling: " << pool << ", reordered with activation: " << activation << "\n"; + + CNNNetSwapLayers(activation, pool); + } +} + +std::vector GNAPlugin::getCandidatesForIdentityInsertion(const CNNLayerPtr l) { + vector prevLayers; + + // skipping memory inputs and true inputs layers + if (l->insData.empty()) return {}; + + auto eltwise = dynamic_cast(l.get()); + auto concat = dynamic_cast(l.get()); + + // eltwise + if (eltwise != nullptr) { + // eltwise layer has 2 inputs, so depends on situation identity should or should not be inserted + + // for sum if we have 4-4 inputs we will handle that by inserting identity activation case (1) + // for sum if we have 4-2 - OK + // for sum if we have 2-2 inputs we need to insert diagonal + + // for mul if we have 2-2 - OK + // for mul if we have 2-4 - inputs we need to insert identity activation to make 2 bytes input + // for mul if we have 4-4 - inputs we need to insert 2 identities activations to put 2 bytes input and weights + auto prev0 = CNNNetPrevLayer(l, 0); + auto prev1 = CNNNetPrevLayer(l, 1); + switch (eltwise->_operation) { + case EltwiseLayer::Sum: + if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) { + return prevLayers; + } + // TODO: wether there - are possibility to select what layer to quantize + prevLayers.push_back(prev0); + break; + case EltwiseLayer::Prod: + if (LayerInfo(prev0).has16BOutput() && LayerInfo(prev1).has16BOutput()) { + return prevLayers; + } + + if (LayerInfo(prev0).has32BOutput()) { + prevLayers.push_back(prev0); + } + + if (LayerInfo(prev1).has32BOutput()) { + prevLayers.push_back(prev1); + } + + break; + default : + THROW_GNA_EXCEPTION << "Eltwise Layer of type: " << eltwise->_operation << " not supported"; + } + } else if (concat != nullptr) { + for (int i = 0; CNNNetHasPrevLayer(l.get(), i); ++i) { + auto prev = CNNNetPrevLayer(l, i); + if (LayerInfo(prev).has32BOutput()) { + prevLayers.push_back(prev); + } + } + } else { // not eltwise or concat + // other layers has 1 inputs - situation is easier + // ex. activation or pooling - no need to insert identity activation. + if (LayerInfo(l).has32BInput()) + return prevLayers; + + auto prevLayer = CNNNetPrevLayer(l); + if (!LayerInfo(prevLayer).has32BOutput()) + return prevLayers; + + prevLayers.push_back(prevLayer); + } + return prevLayers; +} + +void GNAPlugin::substitutePRelu(std::vector &layers) { + auto getScale = [](CNNLayer* layer) { + auto powerCandidate = LayerInfo(layer); + if (!powerCandidate.isPower()) return 0.0f; + auto power = powerCandidate.as(); + + return power->power == 1 && power->offset == 0.0f ? 
power->scale : 0.0f; + }; + + auto isScale = [getScale](CNNLayer* layer) { + return getScale(layer) != 0.0f; + }; + + auto isNegate = [getScale](CNNLayer* layer) { + return getScale(layer) == -1.0f; + }; + + auto getNext = [](CNNLayer* layer) { + CNNLayer* next = nullptr; + if (layer == nullptr) return next; + if (layer->outData.size() != 1) return next; + return layer->outData[0]->inputTo.begin()->second.get(); + }; + + // TODO: unit tests for bad cases + for (auto & l : layers) { + // assume l is starting layer, that is followed by eltwise_sum(relu, negate/relu/scale/negate) + if (l->outData.size() != 1) continue; + auto &outputLayers = l->outData[0]->inputTo; + if (outputLayers.size() != 2) continue; + + // one of followed layers need to be generic relu + auto first = LayerInfo(outputLayers.begin()->second); + auto second = LayerInfo((++outputLayers.begin())->second); + + auto relu1 = outputLayers.begin()->second; + auto neg1 = (++outputLayers.begin())->second; + if (second.isRelu()) { + swap(first, second); + swap(relu1, neg1); + } + if (!first.isRelu()) continue; + // now we have relu as first layer, lets check second + // negate + if (!isNegate(neg1.get())) continue; + + // relu + auto relu2 = getNext(second); + if (!LayerInfo(relu2).isRelu()) continue; + + // scale + auto scale = getNext(relu2); + if (!isScale(scale)) continue; + + // negate2 + auto negate = getNext(scale); + if (!isNegate(negate)) continue; + + // sum + auto sum = getNext(negate); + if (!LayerInfo(sum).isEltwiseSum()) continue; + if (sum->insData.size() != 2) continue; + + auto s1 = sum->insData[0].lock()->creatorLayer.lock().get(); + auto s2 = sum->insData[1].lock()->creatorLayer.lock().get(); + + if (s1 != static_cast(first) && + s2 != static_cast(first)) { + continue; + } + + // hurray we found parametric relu group - dont know what to do with it though + gnalog() << "PRelu with negative slope of " << -LayerInfo(scale).as()->scale << " found" << std::endl; + + // removing all layers references except of relu layer + outputLayers.clear(); + outputLayers[relu1->name] = relu1; + // pointing relu to output of eltwise_summ + relu1->outData = sum->outData; + // changing creator layer + relu1->outData[0]->creatorLayer = relu1; + // pointing back to relu if any + if (!relu1->outData[0]->inputTo.empty()) { + auto summOutputLayer = relu1->outData[0]->inputTo.begin()->second; + summOutputLayer->insData.clear(); + summOutputLayer->insData.push_back(relu1->outData[0]); + } + + // changing negative slope + first.as()->negative_slope = LayerInfo(scale).as()->scale; + } +} + +void GNAPlugin::applyOrientations(std::vector & layers) { +} + +void GNAPlugin::insertIdentityLayer(std::vector &layers) { + int numOfIdentityLayers = 0; + for (auto & l : layers) { + for (auto && prev : getCandidatesForIdentityInsertion(l)) { + // actual insertion + auto activationName = std::string("identity_") + std::to_string(numOfIdentityLayers++); + + gnalog() << "Inserted "<< activationName << " between: " << prev->name << " and " << l->name << "\n" << std::flush; + + CNNLayerPtr activationLayer = + make_shared(LayerParams({activationName, "identity", Precision::FP32})); + auto inputData = l->insData[0].lock(); + auto newDims = inputData->dims; + std::reverse(begin(newDims), end(newDims)); + + auto dataPtr = std::make_shared("FullyConnected", + TensorDesc(inputData->precision, + newDims, + inputData->layout)); + + auto activationLayerWithQuant = InferenceEngine::injectData(activationLayer); + dataPtr->creatorLayer = activationLayerWithQuant; + 
activationLayerWithQuant->outData.push_back(dataPtr); + // wether 1 identity or all outputs TODO possible grouping here, need to implement special groupped inserter + bool notAll = false; + for (auto && nextData : prev->outData) { + for (auto && nextLayer : nextData->inputTo) { + if (nextLayer.second.get() == l.get()) + continue; + if (getCandidatesForIdentityInsertion(nextLayer.second).empty()) { + notAll = true; + } + } + } + + CNNNetworkInsertLayer(prev, notAll ? l : CNNLayerPtr(nullptr), activationLayerWithQuant); + } + } +} + +void GNAPlugin::insertCopyLayer(std::vector & layers) { + int numCopyLayers = 0; + for (auto & l : layers) { + if (l->insData.empty()) continue; + auto prevLayer = CNNNetPrevLayer(l); + if ((LayerInfo(l).isMemory() && LayerInfo(prevLayer).isConcat()) || + (LayerInfo(l).isConcat() && LayerInfo(prevLayer).isCrop())) { + if (LayerInfo(prevLayer).isCrop()) { + auto cropLayer = dynamic_cast (prevLayer.get()); + size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); + if (ALIGN(cropOffset, 8) != cropOffset) { + // The crop will be replced by affine. + // Copy layer insertion is not required + continue; + } + } + std::string copyName = std::string("copy_") + std::to_string(numCopyLayers++); + gnalog() << "Inserted "<< copyName << " between: " << l->name << " and " << prevLayer->name << "\n" << std::flush; + + CNNLayerPtr copyLayer = + make_shared(LayerParams({copyName, "Copy", Precision::FP32})); + + auto inputData = l->insData[0].lock(); + auto newDims = inputData->dims; + + std::reverse(begin(newDims), end(newDims)); + + auto dataPtr = std::make_shared(copyName, + TensorDesc(inputData->precision, + newDims, + inputData->layout)); + + auto copyWithQuant = InferenceEngine::injectData(copyLayer); + dataPtr->creatorLayer = copyWithQuant; + copyWithQuant->outData.push_back(dataPtr); + CNNNetworkInsertLayer(prevLayer, l, copyWithQuant); + } + } +} diff --git a/inference-engine/src/gna_plugin/lstm.cpp b/inference-engine/src/gna_plugin/lstm.cpp new file mode 100644 index 00000000000000..53906e64325597 --- /dev/null +++ b/inference-engine/src/gna_plugin/lstm.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm.hpp" + +const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS] = { + "combined input transform", + "combined recurrent transform", + "input gate", + "forget gate", + "cell gate input part 1", + "cell gate input part 2", + "cell gate output part 1", + "cell gate output part 2", + "output gate", + "hidden gated output", + "projected output" +}; + +const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS] = { + "combined input transform", + "deinterleave", + "interleave 1", + "interleave 2", + "interleave 3", + "interleave 4", + "combined recurrent transform - 1", + "input gate - 1", + "forget gate - 1", + "cell gate input part 1 - 1", + "cell gate input part 2 - 1", + "cell gate output part 1 - 1", + "cell gate output part 2 - 1", + "output gate - 1", + "hidden gated output - 1", + "projected output - 1", + "combined recurrent transform - 2", + "input gate - 2", + "forget gate - 2", + "cell gate input part 1 - 2", + "cell gate input part 2 - 2", + "cell gate output part 1 - 2", + "cell gate output part 2 - 2", + "output gate - 2", + "hidden gated output - 2", + "projected output - 2", + "combined recurrent transform - 3", + "input gate - 3", + "forget gate - 3", + "cell gate input part 1 - 3", + "cell gate input part 2 - 3", + "cell gate output part 1 - 3", + "cell 
gate output part 2 - 3", + "output gate - 3", + "hidden gated output - 3", + "projected output - 3", + "combined recurrent transform - 4", + "input gate - 4", + "forget gate - 4", + "cell gate input part 1 - 4", + "cell gate input part 2 - 4", + "cell gate output part 1 - 4", + "cell gate output part 2 - 4", + "output gate - 4", + "hidden gated output - 4", + "projected output - 4", + "interleave" +}; \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/lstm.hpp b/inference-engine/src/gna_plugin/lstm.hpp new file mode 100644 index 00000000000000..6ce8f10940e186 --- /dev/null +++ b/inference-engine/src/gna_plugin/lstm.hpp @@ -0,0 +1,209 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#define LSTM_GIFO_X_C (component_index) +#define LSTM_GIFO_R_C (component_index+1) +#define LSTM_INPUT_GATE_C (component_index+2) +#define LSTM_INPUT_SIGMOID_C (component_index+3) +#define LSTM_FORGET_GATE_C (component_index+4) +#define LSTM_FORGET_SIGMOID_C (component_index+5) +#define LSTM_CELL_INPUT1_C (component_index+6) +#define LSTM_CELL_INPUT1_TANH_C (component_index+7) +#define LSTM_CELL_INPUT2_C (component_index+8) +#define LSTM_CELL_OUTPUT1_C (component_index+9) +#define LSTM_CELL_TANH_C (component_index+10) +#define LSTM_CELL_OUTPUT2_C (component_index+11) +#define LSTM_CELL_CLIPPING_C (component_index+12) +#define LSTM_OUTPUT_GATE_C (component_index+13) +#define LSTM_OUTPUT_SIGMOID_C (component_index+14) +#define LSTM_HIDDEN_C (component_index+15) +#define LSTM_HIDDEN_IDENTITY_C (component_index+16) +#define LSTM_PROJECTED_C (component_index+17) +#define LSTM_PROJECTED_IDENTITY_C (component_index+18) +#define NUM_LSTM_COMPONENTS 19 + +#define BILSTM_GIFO_X_FW_C (component_index) +#define BILSTM_GIFO_R_FW_C (component_index+1) +#define BILSTM_INPUT_GATE_FW_C (component_index+2) +#define BILSTM_INPUT_SIGMOID_FW_C (component_index+3) +#define BILSTM_FORGET_GATE_FW_C (component_index+4) +#define BILSTM_FORGET_SIGMOID_FW_C (component_index+5) +#define BILSTM_CELL_INPUT1_FW_C (component_index+6) +#define BILSTM_CELL_INPUT1_TANH_FW_C (component_index+7) +#define BILSTM_CELL_INPUT2_FW_C (component_index+8) +#define BILSTM_CELL_GATE_FW_C (component_index+9) +#define BILSTM_CELL_OUTPUT1_FW_C (component_index+10) +#define BILSTM_CELL_TANH_FW_C (component_index+11) +#define BILSTM_CELL_COPY_FW_C (component_index+12) +#define BILSTM_OUTPUT_GATE_FW_C (component_index+13) +#define BILSTM_OUTPUT_SIGMOID_FW_C (component_index+14) +#define BILSTM_HIDDEN_FW_C (component_index+15) +#define BILSTM_HIDDEN_IDENTITY_FW_C (component_index+16) +#define BILSTM_GIFO_X_BW_C (component_index+17) +#define BILSTM_GIFO_R_BW_C (component_index+18) +#define BILSTM_INPUT_GATE_BW_C (component_index+19) +#define BILSTM_INPUT_SIGMOID_BW_C (component_index+20) +#define BILSTM_FORGET_GATE_BW_C (component_index+21) +#define BILSTM_FORGET_SIGMOID_BW_C (component_index+22) +#define BILSTM_CELL_INPUT1_BW_C (component_index+23) +#define BILSTM_CELL_INPUT1_TANH_BW_C (component_index+24) +#define BILSTM_CELL_INPUT2_BW_C (component_index+25) +#define BILSTM_CELL_GATE_BW_C (component_index+26) +#define BILSTM_CELL_OUTPUT1_BW_C (component_index+27) +#define BILSTM_CELL_TANH_BW_C (component_index+28) +#define BILSTM_CELL_COPY_BW_C (component_index+29) +#define BILSTM_OUTPUT_GATE_BW_C (component_index+30) +#define BILSTM_OUTPUT_SIGMOID_BW_C (component_index+31) +#define BILSTM_HIDDEN_BW_C (component_index+32) +#define BILSTM_HIDDEN_IDENTITY_BW_C (component_index+33) 
+#define NUM_BILSTM_COMPONENTS 34 + +#include "gna-api.h" + +#define ACTIVATION_SCALE_IG 1024.0f +#define ACTIVATION_SCALE_CI1 1024.0f +#define ACTIVATION_SCALE_CO1 2048.0f +#define ACTIVATION_SCALE_OG 2048.0f +#define ACTIVATION_SCALE_HID 2048.0f +#define MAX_WEIGHT_IFO_GATE 1024.0f +#define NUM_WEIGHT_BYTES_IN 2 +#define NUM_WEIGHT_BYTES_PROJ 2 + +typedef struct { + float min; + float max; + float sum; + float sum_squared; + uint32_t num_saturations; + uint32_t num_elements; +} intel_buffer_stats_t; + +typedef struct { + intel_nnet_layer_t in; // combined input transform + intel_nnet_layer_t rec; // combined recurrent transform + intel_nnet_layer_t ig; // input gate + intel_nnet_layer_t fg; // forget gate + intel_nnet_layer_t ci1; // cell gate input part 1 + intel_nnet_layer_t ci2; // cell gate input part 2 + intel_nnet_layer_t co1; // cell gate output part 1 + intel_nnet_layer_t co2; // cell gate output part 2 + intel_nnet_layer_t og; // output gate + intel_nnet_layer_t hid; // hidden gated output + intel_nnet_layer_t proj; // projected output +} intel_lstm_projected_layer_t; + +typedef struct { + intel_affine_layer_t *in; // combined input transform + intel_affine_layer_t *rec; // combined recurrent transform + intel_affine_layer_t *ig; // input gate + intel_affine_layer_t *fg; // forget gate + intel_affine_layer_t *ci1; // cell gate input part 1 + intel_affine_layer_t *ci2; // cell gate input part 2 + intel_affine_layer_t *co1; // cell gate output part 1 + intel_affine_layer_t *co2; // cell gate output part 2 + intel_affine_layer_t *og; // output gate + intel_affine_layer_t *hid; // hidden gated output + intel_affine_layer_t *proj; // projected output +} intel_lstm_projected_transform_t; + +typedef struct { + intel_buffer_stats_t in; // combined input transform + intel_buffer_stats_t rec; // combined recurrent transform + intel_buffer_stats_t ig; // input gate + intel_buffer_stats_t fg; // forget gate + intel_buffer_stats_t ci1; // cell gate input part 1 + intel_buffer_stats_t ci2; // cell gate input part 2 + intel_buffer_stats_t co1; // cell gate output part 1 + intel_buffer_stats_t co2; // cell gate output part 2 + intel_buffer_stats_t og; // output gate + intel_buffer_stats_t hid; // hidden gated output + intel_buffer_stats_t proj; // projected output +} intel_lstm_projected_stats_t; + +typedef struct { + intel_nnet_layer_t rec; // combined recurrent transform + intel_nnet_layer_t ig; // input gate + intel_nnet_layer_t fg; // forget gate + intel_nnet_layer_t ci1; // cell gate input part 1 + intel_nnet_layer_t ci2; // cell gate input part 2 + intel_nnet_layer_t co1; // cell gate output part 1 + intel_nnet_layer_t co2; // cell gate output part 2 + intel_nnet_layer_t og; // output gate + intel_nnet_layer_t hid; // hidden gated output + intel_nnet_layer_t proj; // projected output +} intel_lstm_partial_layer_t; + +typedef struct { + intel_affine_layer_t *rec; // combined recurrent transform + intel_affine_layer_t *ig; // input gate + intel_affine_layer_t *fg; // forget gate + intel_affine_layer_t *ci1; // cell gate input part 1 + intel_affine_layer_t *ci2; // cell gate input part 2 + intel_affine_layer_t *co1; // cell gate output part 1 + intel_affine_layer_t *co2; // cell gate output part 2 + intel_affine_layer_t *og; // output gate + intel_affine_layer_t *hid; // hidden gated output + intel_affine_layer_t *proj; // projected output +} intel_lstm_partial_transform_t; + +typedef struct { + intel_buffer_stats_t rec; // combined recurrent transform + intel_buffer_stats_t ig; // input 
gate + intel_buffer_stats_t fg; // forget gate + intel_buffer_stats_t ci1; // cell gate input part 1 + intel_buffer_stats_t ci2; // cell gate input part 2 + intel_buffer_stats_t co1; // cell gate output part 1 + intel_buffer_stats_t co2; // cell gate output part 2 + intel_buffer_stats_t og; // output gate + intel_buffer_stats_t hid; // hidden gated output + intel_buffer_stats_t proj; // projected output +} intel_lstm_partial_stats_t; + +typedef struct { + intel_nnet_layer_t in; // combined input transform + intel_nnet_layer_t dintl; // interleave x8 + intel_nnet_layer_t intl1; // deinterleave x2 + intel_nnet_layer_t intl2; // deinterleave x2 + intel_nnet_layer_t intl3; // deinterleave x2 + intel_nnet_layer_t intl4; // deinterleave x2 + intel_lstm_partial_layer_t part[4]; // unrolled part + intel_nnet_layer_t intl; // interleave x4 +} intel_lstm_projected_layer_g4_t; + +typedef struct { + intel_affine_layer_t *in; // combined input transform + intel_lstm_partial_transform_t part[4]; // unrolled part +} intel_lstm_projected_transform_g4_t; + +typedef struct { + intel_buffer_stats_t in; // combined input transform + intel_lstm_partial_stats_t part[4]; // unrolled part +} intel_lstm_projected_stats_g4_t; + +#define NUM_LSTM_LAYERS 11 +#define NUM_LSTM_G4_LAYERS 47 + +extern const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS]; +extern const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS]; +/* +void GetLstmBufferStats(intel_lstm_projected_layer_t *ptr_layer, std::vector &stats); +void UpdateLstmBufferStats(std::vector &accum, std::vector stats); +void ClearLstmBufferStats(std::vector &stats); +void PrintLstmBufferStats(std::string preamble, std::vector stats); +uint32_t NumBytesLstmMacroLayer(uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells, uint32_t num_group_size, uint32_t layer_num, bool is_compact); +void InitLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells); +void InitLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells); +void AllocateLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact); +void AllocateLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact); +void ConnectLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform); +void ConnectLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform); +void QuantizeLstmMacroLayerG1(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_transform_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j); +void QuantizeLstmMacroLayerG4(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_transform_g4_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j); +void ReQuantizeLstmMacroLayerG1(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t 
j); +void ReQuantizeLstmMacroLayerG4(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_layer_g4_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j); +void IntegrityCheckLstmMacroLayer(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, gna_scale_factor_t *scale, uint32_t j); + +*/ \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/polymorh_allocator.hpp b/inference-engine/src/gna_plugin/polymorh_allocator.hpp new file mode 100644 index 00000000000000..d50d8a3a7e5245 --- /dev/null +++ b/inference-engine/src/gna_plugin/polymorh_allocator.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +/** + * @brief c++17 concept simulation + */ + +template +class IPolymorhAllocator { + public: + virtual T *allocate(std::size_t n) = 0; + virtual void deallocate(T *p, std::size_t n) = 0; +}; + +template +class allocator_polymorph; + +template +class PolymorphAllocator { + std::shared_ptr> _impl; + public: + explicit PolymorphAllocator(const std::shared_ptr> &impl) : _impl(impl) {} + + T *allocate(std::size_t n) { + return _impl->allocate(n); + } + + void deallocate(T *p, std::size_t n) { + _impl->deallocate(p, n); + } +}; + +/** + * transform any allocator into polymorph type + * @tparam origin + */ + +template +class polymorph_adapter : public IPolymorhAllocator { + origin _impl; + using T = typename origin::value_type; + + public: + template + explicit polymorph_adapter(Args &&... args) + :_impl(std::forward(args)...) { + } + T *allocate(std::size_t n) override { + return _impl.allocate(n); + } + void deallocate(T *p, std::size_t n) override { + _impl.deallocate(p, n); + } +}; + +template +inline PolymorphAllocator make_polymorph(Args &&... 
args) { + auto sp = std::make_shared>(std::forward(args)...); + auto ipoly = std::static_pointer_cast>(sp); + + return PolymorphAllocator(ipoly); +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/pwl.h b/inference-engine/src/gna_plugin/pwl.h new file mode 100644 index 00000000000000..fd45903fcb4a73 --- /dev/null +++ b/inference-engine/src/gna_plugin/pwl.h @@ -0,0 +1,70 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "dnn.h" +#include + +#define SIGMOID_NUM_SEGMENTS 65 +#define SIGMOID_DOMAIN 10.0f // portion of input to be approximated (-10,10) +#define TANH_NUM_SEGMENTS 65 +#define TANH_DOMAIN 5.0f // portion of input to be approximated (-5,5) +#define RELU_NUM_SEGMENTS 2 +#define LEAKYRELU_SLOPE 0.01 +#define IDENTITY_NUM_SEGMENTS 3 +#define IDENTITY_DOMAIN 10.0f +#define PWL_MAX_ERR_PERCENT 1.0f +#define PWL_MAX_ITERATIONS 2000 +#define PWL_MAX_NUM_SEGMENTS 128 +#define PWL_DESIGN_THRESHOLD 0.1f +#define PWL_DESIGN_SAMPLES 500 +#define ACTIVATION_SCALE_FACTOR 2048.0f +#define IDENTITY_SCALE_FACTOR 2049.0f +#define XBASEMASK 0xFFFFFFFC // only top 30 bits are used +#define KALDI_LSTM_CLIP_LOWER (-50.0) +#define KALDI_LSTM_CLIP_UPPER (50.0) + +typedef struct { + double t; + double alpha; + double beta; + double m; + double b; +} pwl_t; + +typedef struct { + double slope; + uint64_t slope_scale = 0; + uint32_t slope_scale_index; +} pwl_gna_slope_scale_t; + +double first_deriv_tanh(const double x); +double sigmoid(const double x); +double first_deriv_sigmoid(const double x); +double relu(const double x); +double leaky_relu(const double x); + +double clipping(const double x, const double lbound, const double ubound); +void PwlApply16(intel_dnn_component_t *component, const uint32_t num_subset_size); +void PwlApply16(intel_dnn_component_t *component, + const uint32_t num_row_start, + const uint32_t num_row_end, + const uint32_t num_col_start, + const uint32_t num_col_end); +void PwlApply32(intel_dnn_component_t *component, const uint32_t num_subset_size); +void PwlApply32(intel_dnn_component_t *component, + const uint32_t num_row_start, + const uint32_t num_row_end, + const uint32_t num_col_start, + const uint32_t num_col_end); +void PwlDesign16(const DnnActivation activation_type, + intel_pwl_segment_t *ptr_segment, + const uint32_t num_segments, + const float scale_in, + const float scale_out); +void PwlDesignOpt16(const DnnActivation activation_type, + std::vector &ptr_segment, + const float scale_in, + const float scale_out); diff --git a/inference-engine/src/gna_plugin/pwl_design.cpp b/inference-engine/src/gna_plugin/pwl_design.cpp new file mode 100644 index 00000000000000..1f325bac7fe4b9 --- /dev/null +++ b/inference-engine/src/gna_plugin/pwl_design.cpp @@ -0,0 +1,681 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "pwl.h" +#include "gna_plugin_log.hpp" +#include +#include +#include + +#define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) +#define FLOAT_TO_INT32(a) static_cast(((a) < 0)?((a)-0.5):((a)+0.5)) +#ifdef _NO_MKL_ +#include +#include
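// Editorial annotation, not part of the original patch: when _NO_MKL_ is defined, the SCOPY,
// SSCAL and TANH macros below are plain scalar stand-ins for MKL's scopy/sscal/vsTanh, so the
// PWL design code builds without an MKL dependency. The FLOAT_TO_INT16/FLOAT_TO_INT32 macros
// above round half away from zero; a minimal sketch of that rounding rule (hypothetical helper,
// for illustration only, not used by the plugin):
static inline int16_t float_to_int16_round(float a) {
    // same convention as FLOAT_TO_INT16: 2.5f -> 3, -2.5f -> -3
    return static_cast<int16_t>(a < 0.0f ? a - 0.5f : a + 0.5f);
}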
+#define SCOPY(num, in, inci, out, inco) for (int i_ = 0; i_ < *(num); i_++) *(out + i_ * *(inco)) = *(in + i_ * *(inci)); +#define SSCAL(num, scale, inout, inco) for (int i_ = 0; i_ < *(num); i_++) *(inout + i_ * *(inco)) = *(scale) * *(inout + i_ * *(inco)); +#define TANH(num, in, out) for (int i_ = 0; i_ < num; i_++) *(out+i_) = tanh(*(in+i_)) +#else +#include +#define SCOPY(num, in, incx, out, incy) scopy(num, in, incx, out, incy) +#define SSCAL(num, scale, inout, incx) sscal(num, scale, inout, incx) +#define TANH(num, in, out) vsTanh(num, in, out) +#endif + +double first_deriv_tanh(const double x) { return(1.0 - tanh(x) * tanh(x)); } + +double sigmoid(const double x) { return(0.5 * (1.0 + tanh(x / 2))); } +double first_deriv_sigmoid(const double x) { return(sigmoid(x) * (1.0 - sigmoid(x))); } +double relu(const double x) { if (x < 0) { return(0.0); } else { return(x); } } +double leaky_relu(const double x) { if (x < 0.0) { return(LEAKYRELU_SLOPE*x); } else { return(x); } } +double clipping(const double x, const double lbound, const double ubound) { return((x < lbound)?lbound:((x > ubound)?ubound:x)); } + +double pivot_search(std::vector& result, double(*f)(const double), + double(*first_deriv_f)(const double), + const uint32_t N, + const double alpha_0, + const double alpha_N, + const double threshold, + const bool negative) { + std::vector> t(N + 1); + std::vector> alpha(N + 1); + std::vector> epsilon(N + 1); + std::vector> d(N + 1); + bool same_epsilon = false; + double Delta; + double epsilon_final = 0.0; + double max_epsilon = 0.0; + double max_epsilon_prev; + double min_epsilon; + double sgn = (negative) ? -1.0 : 1.0; + int j; + + if ( f == nullptr || + first_deriv_f == nullptr || + threshold < 0) { + return epsilon_final; + } + // Figure 4: Box #1 + j = 0; + Delta = 1.0; + + for (int i = 0; i < N; i++) { + t[i].push_back(alpha_0 + (static_cast((i + 1)) / static_cast((N + 1))) * (alpha_N - alpha_0)); + } + + while (true) { + // Figure 4: Box #2 + alpha[0].resize(j + 1); + alpha[0][j] = alpha_0; + for (int i = 1; i < N; i++) { + alpha[i].resize(j + 1); + alpha[i][j] = (f(t[i - 1][j]) - f(t[i][j]) + first_deriv_f(t[i][j]) * t[i][j] - first_deriv_f(t[i - 1][j]) * t[i - 1][j]) + / (first_deriv_f(t[i][j]) - first_deriv_f(t[i - 1][j])); + } + alpha[N].resize(j + 1); + alpha[N][j] = alpha_N; + + // Figure 4: Box #3 + for (int i = 0; i < N; i++) { + epsilon[i].resize(j + 1); + epsilon[i][j] = sgn * (first_deriv_f(t[i][j]) * (alpha[i][j] - t[i][j]) + f(t[i][j]) - f(alpha[i][j])); + } + epsilon[N].resize(j + 1); + epsilon[N][j] = sgn * (first_deriv_f(t[N - 1][j]) * (alpha[N][j] - t[N - 1][j]) + f(t[N - 1][j]) - f(alpha[N][j])); + + // Figure 4: Test for completion + max_epsilon_prev = max_epsilon; + max_epsilon = fabs(epsilon[0][j]); + min_epsilon = fabs(epsilon[0][j]); + for (int i = 1; i < N + 1; i++) { + if (fabs(epsilon[i][j]) > max_epsilon) max_epsilon = fabs(epsilon[i][j]); + if (fabs(epsilon[i][j]) < min_epsilon) min_epsilon = fabs(epsilon[i][j]); + } + if ((j == PWL_MAX_ITERATIONS) || (max_epsilon - min_epsilon < threshold * min_epsilon)) { + pwl_t value; + result.resize(0); + epsilon_final = (max_epsilon + min_epsilon) / 4.0; // Andrzej's modification + for (int i = 0; i < N; i++) { + double val, val_next; + value.t = t[i][j]; + value.alpha = alpha[i][j]; + val = sgn * first_deriv_f(value.t) * (value.alpha - value.t) + sgn * f(value.t) - epsilon_final; + val_next = sgn * first_deriv_f(value.t) * (alpha[i + 1][j] - value.t) + sgn * f(value.t) - epsilon_final; + value.beta = val; + 
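// Editorial annotation, not part of the original patch: val and val_next are the tangent of
// sgn*f at the pivot t_i, shifted down by epsilon_final and evaluated at the segment ends
// alpha_i and alpha_i+1, so the m and b computed next are simply that segment's slope and
// intercept; the emitted pwl_t entries together form the equal-error piecewise-linear fit
// that the "Figure 4" boxes above construct.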
value.m = (val_next - val) / (alpha[i + 1][j] - value.alpha); + value.b = (val - value.m * value.alpha); + result.push_back(value); + } + value.t = value.m = value.b = 0.0; + value.alpha = alpha[N][j]; + value.beta = sgn * first_deriv_f(t[N - 1][j]) * (alpha[N][j] - t[N - 1][j]) + sgn * f(t[N - 1][j]) - epsilon_final; + result.push_back(value); + if (j == PWL_MAX_ITERATIONS) { + std::cerr << "Error: failed to converge in pivot_search!" << std::endl; + } + return(epsilon_final); + } + + if (j > 0) { + if (max_epsilon > max_epsilon_prev) { + j = j - 1; + Delta = Delta / 2; + } else if (max_epsilon == max_epsilon_prev) { + if (!same_epsilon) { + same_epsilon = true; + } else { + j = j - 1; + Delta = Delta / 2; + same_epsilon = false; + } + } + } + + // Figure 4: Box #4 + for (int i = 0; i < N; i++) { + d[i].resize(j + 1); + d[i][j] = Delta * (epsilon[i + 1][j] - epsilon[i][j]) / + ((epsilon[i + 1][j] / (alpha[i + 1][j] - t[i][j])) + (epsilon[i][j] / (t[i][j] - alpha[i][j]))); + } + + // Figure 4: Box #5 + for (int i = 0; i < N; i++) { + t[i].resize(j + 2); + t[i][j + 1] = t[i][j] + d[i][j]; + } + t[N].resize(j + 2); + + j = j + 1; + } +} + +double calculate_error_pct(const DnnActivationType fun, + const double l_bound, + const double u_bound, + const double offset, + const int samples) { + double delta = (u_bound - l_bound) / (samples + 1); + double min_val = 0.0; + double max_val = 0.0; + + if ( delta < 0 ) { + return 0.0; + } + + switch (fun) { + case kActSigmoid: min_val = max_val = sigmoid(l_bound); break; + case kActTanh: min_val = max_val = tanh(l_bound); break; + } + + for (int i = 0; i < samples; i++) { + double arg = l_bound + i * delta; + double val = 0.0; + switch (fun) { + case kActSigmoid: val = sigmoid(arg); break; + case kActTanh: val = tanh(arg); break; + } + if (val > max_val) max_val = val; + if (val < min_val) min_val = val; + } + + return(100.0 * fabs(offset) / (max_val - min_val)); +} + +bool split_search(const DnnActivationType fun, + const double l_bound, + const double u_bound) { + bool is_split = false; + if (l_bound > u_bound) { + return is_split; + } + + switch (fun) { + case kActSigmoid: + case kActTanh: + if ((l_bound < 0.0) && (u_bound > 0.0)) { + is_split = true; + } + break; + default: + is_split = false; + } + return(is_split); +} + +inline std::vector negative_pwl(const std::vector& pwl) { + std::vector new_pwl; + new_pwl = pwl; + for (uint32_t i = 0; i < pwl.size(); i++) { + new_pwl[i].m = -pwl[i].m; + new_pwl[i].b = -pwl[i].b; + new_pwl[i].beta = -pwl[i].beta; + } + + return(new_pwl); +} + +std::vector pwl_search(const DnnActivationType fun, + const double l_bound, + const double u_bound, + const double threshold, + const double allowed_err_pct, + const int samples, + double& err_pct) { + std::vector pwl; + double err = 0.0; + int n_segments = 1; + + if (l_bound > u_bound || + threshold < 0) { + return pwl; + } + + if (split_search(fun, l_bound, u_bound)) { + std::vector pwl2; + double err_pct1 = 0.0, err_pct2 = 0.0; + + pwl = pwl_search(fun, l_bound, 0.0, threshold, allowed_err_pct, samples, err_pct1); + pwl = negative_pwl(pwl); + pwl2 = pwl_search(fun, 0.0, u_bound, threshold, allowed_err_pct, samples, err_pct2); + + // merge + pwl.pop_back(); // remove final alpha and beta from first half + pwl.insert(pwl.end(), pwl2.begin(), pwl2.end()); // concatenate the two halves + err_pct = (err_pct1 + err_pct2) / 2; // this is not quite correct but should give an indication + + } else { + if (fun == kActIdentity) { + pwl.resize(2); + pwl[0].alpha = pwl[0].t = 
pwl[0].beta = -std::numeric_limits::infinity(); + pwl[0].m = 1.0; + pwl[0].b = 0.0; + pwl[1].alpha = std::numeric_limits::infinity(); + pwl[1].beta = std::numeric_limits::infinity(); + + } else if (fun == kActKaldiLstmClipping) { + pwl.resize(4); + pwl[0].alpha = pwl[0].t = pwl[0].beta = -std::numeric_limits::infinity(); + pwl[0].m = 0.0; + pwl[0].b = pwl[0].beta = KALDI_LSTM_CLIP_LOWER; + pwl[1].alpha = pwl[0].t = pwl[1].beta = KALDI_LSTM_CLIP_LOWER; + pwl[1].m = 1.0; + pwl[1].b = 0.0; + pwl[2].alpha = pwl[0].t = pwl[1].beta = KALDI_LSTM_CLIP_UPPER; + pwl[2].m = 0.0; + pwl[2].b = KALDI_LSTM_CLIP_UPPER; + pwl[3].alpha = pwl[3].beta = std::numeric_limits::infinity(); + + } else { + bool negative = false; + + switch (fun) { + case kActSigmoid: + if (u_bound == 0) negative = true; // make left half convex + err = pivot_search(pwl, sigmoid, first_deriv_sigmoid, n_segments, l_bound, u_bound, threshold, negative); + break; + case kActTanh: + if (u_bound == 0) negative = true; // make left half convex + err = pivot_search(pwl, tanh, first_deriv_tanh, n_segments, l_bound, u_bound, threshold, negative); + break; + } + err_pct = calculate_error_pct(fun, l_bound, u_bound, err, samples); + + while ((n_segments < PWL_MAX_ITERATIONS) && (allowed_err_pct < err_pct)) { + n_segments += 1; + switch (fun) { + case kActSigmoid: + err = pivot_search(pwl, sigmoid, first_deriv_sigmoid, n_segments, l_bound, u_bound, threshold, negative); + break; + case kActTanh: + err = pivot_search(pwl, tanh, first_deriv_tanh, n_segments, l_bound, u_bound, threshold, negative); + break; + } + err_pct = calculate_error_pct(fun, l_bound, u_bound, err, samples); + } + + if (n_segments >= PWL_MAX_ITERATIONS) { + std::cerr << "Error: failed to converge in pwl_search!" << std::endl; + } + } + } + return(pwl); +} + +pwl_gna_slope_scale_t gna_slope(const double slope, + const double in_scale, + const double out_scale) { + pwl_gna_slope_scale_t s; + s.slope = slope* out_scale / in_scale; + + for (s.slope_scale_index = 3; s.slope_scale_index > 0; --s.slope_scale_index) { + s.slope_scale = static_cast(1) << (8 * (1 + s.slope_scale_index)); + if (((s.slope * s.slope_scale) <= std::numeric_limits::max()) && + ((s.slope * s.slope_scale) >= std::numeric_limits::min())) + break; + } + s.slope_scale = static_cast(1) << (8 * (1 + s.slope_scale_index)); + + return(s); +} + +void make_gna_pwl(const DnnActivation fun, + const std::vector& pwl, + const double l_bound, + const double u_bound, + const double in_scale, + const double out_scale, + std::vector &gna_pwl) { + pwl_gna_slope_scale_t s; + uint32_t pwl_size = static_cast(pwl.size()); + switch (fun) { + case kActSigmoid: + case kActTanh: { + auto n_segments = static_cast (pwl_size) + 1; + gna_pwl.resize(n_segments); + // insert extra segment for x values < l_bound + gna_pwl[0].xBase = static_cast (INT32_MIN & XBASEMASK); // zero out the 2 lsb + if (fun == kActSigmoid) { + gnalog() << "=========================== Sigmoid Segments ===========================\n"; + gna_pwl[0].yBase = gna_pwl[1].yBase = 0; + gna_pwl[1].xBase = (static_cast (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK; + } else { + gnalog() << "=========================== Tanh Segments ===========================\n"; + gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast(-1.0 * out_scale); + gna_pwl[1].xBase = (static_cast (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK; + } + gna_pwl[0].slope = 0; + + gnalog() << (gna_pwl[0].xBase) / in_scale + << " " << (gna_pwl[0].yBase) / out_scale + << " " << 0.0 + << "\n"; + + s = 
gna_slope(pwl[0].m, in_scale, out_scale); + gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index; + + gnalog() << (gna_pwl[1].xBase/in_scale) + << " " << (gna_pwl[1].yBase) / out_scale + << " " << pwl[0].m + << "\n"; + + for (uint32_t i = 1; i < pwl_size - 1; ++i) { + s = gna_slope(pwl[i].m, in_scale, out_scale); + gna_pwl[i + 1].xBase = (static_cast (in_scale * pwl[i].alpha)) & XBASEMASK; + gna_pwl[i + 1].yBase = FLOAT_TO_INT16(pwl[i].beta * out_scale); + gna_pwl[i + 1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[i + 1].xBase = gna_pwl[i + 1].xBase | s.slope_scale_index; + + gnalog() << (pwl[i].alpha) + << " " << pwl[i].beta + << " " << pwl[i].m + << "\n"; + } + // insert extra segment for xvalues > u_bound + gna_pwl[n_segments - 1].xBase = + ((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK; + gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(1.0 * out_scale); + gna_pwl[n_segments - 1].slope = 0; + + gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale) + << " " << 1.0 + << " " << 0.0 + << "\n"; + break; + } + case kActRelu: + case kActLeakyRelu: { + auto n_segments = 2; + gna_pwl.resize(n_segments); + + gnalog() << "=========================== ReLU Segments ===========================\n"; + int32_t x_lower = INT32_MIN; + int16_t y_lower = INT16_MIN; + if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); + gna_pwl[0].yBase = y_lower * fun.negative_slope; + s = gna_slope(fun.negative_slope, in_scale, out_scale); + gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb + gna_pwl[0].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + + gnalog() << gna_pwl[0].xBase / in_scale + << " " << gna_pwl[0].yBase / out_scale + << " " << (gna_pwl[0].slope * in_scale) / (out_scale*s.slope_scale) + << "\n"; + gna_pwl[1].xBase = 0; + gna_pwl[1].yBase = 0; + s = gna_slope(1.0, in_scale, out_scale); + gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index; + gnalog() << 0.0 + << " " << 0.0 + << " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale) + << "\n"; + break; + } + case kActIdentity: + case kActKaldiLstmClipping: { + int32_t x_lower = INT32_MIN; + int32_t x_upper = INT32_MAX; + int16_t y_lower = INT16_MIN; + int16_t y_upper = INT16_MAX; + auto n_segments = 2; + if (fun == kActKaldiLstmClipping) { + gnalog() << "=========================== Clipping Segments ===========================\n"; + if (x_lower < l_bound * in_scale) { + if (y_lower < l_bound * out_scale) { + x_lower = FLOAT_TO_INT32(l_bound * in_scale); + y_lower = FLOAT_TO_INT16(l_bound * out_scale); + } else { + x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + } + } + if (x_upper > u_bound * in_scale) { + if (y_upper > u_bound * out_scale) { + x_upper = FLOAT_TO_INT32(u_bound * in_scale); + y_upper = FLOAT_TO_INT16(u_bound * out_scale); + } else { + x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); + } + } + } else { + gnalog() << "=========================== Identity Segments ===========================\n"; + if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); + if (y_lower < x_lower * 
out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); + if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale); + } + gna_pwl.resize(n_segments); + gna_pwl[0].xBase = INT32_MIN & XBASEMASK; // zero out the 2 lsb + gna_pwl[0].yBase = y_lower; + gna_pwl[0].slope = 0; + gnalog() << gna_pwl[0].xBase / in_scale + << " " << gna_pwl[0].yBase / out_scale + << " " << 0 + << "\n"; + gna_pwl[1].xBase = x_lower & XBASEMASK; // zero out the 2 lsb + gna_pwl[1].yBase = y_lower; + s = gna_slope(1.0, in_scale, out_scale); + gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index; + gnalog() << gna_pwl[1].xBase / in_scale + << " " << gna_pwl[1].yBase / out_scale + << " " << 1.0 + << "\n"; + if (INT32_MAX > x_upper) { // need a right segment + gna_pwl.push_back({ + static_cast(x_upper & XBASEMASK), // zero out the 2 lsb + y_upper, + 0 }); + + gnalog() << gna_pwl[n_segments].xBase / in_scale + << " " << gna_pwl[n_segments].yBase / out_scale + << " " << 0 + << "\n"; + n_segments += 1; + } + break; + } + default: + gnalog() << "Unexpected function activation!\n"; + std::cerr << "Unexpected function activation!\n"; + } +} + +void PwlDesignOpt16(const DnnActivation activation_type, + std::vector &ptr_segment, + const float scale_in, + const float scale_out) { + std::vector pwl; + double err_pct = 0.0; + switch (activation_type) { + case kActSigmoid: + pwl = pwl_search(kActSigmoid, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, scale_in, scale_out, ptr_segment); + break; + case kActTanh: + pwl = pwl_search(kActTanh, -TANH_DOMAIN, TANH_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, -TANH_DOMAIN, TANH_DOMAIN, scale_in, scale_out, ptr_segment); + break; + case kActRelu: + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + break; + case kActLeakyRelu: + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + break; + case kActIdentity: + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + break; + case kActKaldiLstmClipping: + make_gna_pwl(activation_type, pwl, KALDI_LSTM_CLIP_LOWER, KALDI_LSTM_CLIP_UPPER, scale_in, scale_out, ptr_segment); + break; + default: + break; + } +} + +void PwlDesign16(const DnnActivation activation_type, + intel_pwl_segment_t *ptr_segment, + const uint32_t num_segments, + const float scale_in, + const float scale_out) { + switch (activation_type) { + case kActSigmoid: + { + gnalog() << "=========================== Sigmoid Segments===========================\n"; + uint32_t num_segment_size = 0; + int32_t offset = 0; + ptr_segment[0].xBase = static_cast(INT32_MIN & XBASEMASK); // zero out the 2 lsb + num_segment_size = static_cast(SIGMOID_DOMAIN * scale_in / ((num_segments-2) / 2) + 0.5); + offset = -static_cast(num_segment_size * (num_segments-2) / 2); + for (uint32_t i = 1; i < num_segments; i++) { + ptr_segment[i].xBase = static_cast(offset & XBASEMASK); // zero out the 2 lsb + offset += num_segment_size; + } + for (uint32_t i = 0; i < num_segments; i++) { + int32_t xbase = static_cast(ptr_segment[i].xBase & XBASEMASK); + int32_t xbasenext = (i < num_segments-1) ? 
static_cast(ptr_segment[i+1].xBase & XBASEMASK) : INT32_MAX; + float floatarg = static_cast(xbase / (2 * scale_in)); + float floatargnext = static_cast(xbasenext / (2 * scale_in)); + float floatval, floatvalnext, slope; + TANH(1, &floatarg, &floatval); + floatval = 0.5f * (1.0f + floatval); + TANH(1, &floatargnext, &floatvalnext); + floatvalnext = 0.5f * (1.0f + floatvalnext); + slope = scale_out*(floatvalnext - floatval) / static_cast(xbasenext - xbase); + { + // find best scale factor + uint64_t slope_scale; + uint32_t slope_scale_index; + for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) { + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + if (((slope * slope_scale) <= 32767.0) && ((slope * slope_scale) >= -32768.0)) + break; + } + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + ptr_segment[i].slope = FLOAT_TO_INT16(slope * slope_scale); + + ptr_segment[i].xBase = ptr_segment[i].xBase | slope_scale_index; + } + ptr_segment[i].yBase = FLOAT_TO_INT16(floatval * scale_out); + gnalog() << (static_cast((ptr_segment[i].xBase & XBASEMASK))/scale_out) + << " " + << (static_cast((ptr_segment[i].yBase))/scale_out) + << " " + << (slope/scale_out) + << "\n"; + } + } + break; + case kActTanh: + { + gnalog() << "=========================== Tanh Segments===========================\n"; + uint32_t num_segment_size = 0; + int32_t offset = 0; + ptr_segment[0].xBase = static_cast(INT32_MIN & XBASEMASK); // zero out the 2 lsb + num_segment_size = static_cast(TANH_DOMAIN * scale_in / ((num_segments-2) / 2) + 0.5); + offset = -static_cast(num_segment_size * (num_segments-2) / 2); + for (uint32_t i = 1; i < num_segments; i++) { + ptr_segment[i].xBase = static_cast(offset & XBASEMASK); // zero out the 2 lsb + offset += num_segment_size; + } + for (uint32_t i = 0; i < num_segments; i++) { + int32_t xbase = static_cast(ptr_segment[i].xBase & XBASEMASK); + int32_t xbasenext = (i < num_segments-1) ? + static_cast(ptr_segment[i+1].xBase & XBASEMASK) : + INT32_MAX; + float floatarg = static_cast(xbase / scale_in); + float floatargnext = static_cast(xbasenext / scale_in); + float floatval, floatvalnext, slope; + TANH(1, &floatarg, &floatval); + TANH(1, &floatargnext, &floatvalnext); + slope = scale_out * (floatvalnext - floatval) / + static_cast(xbasenext - xbase); + { + // find best scale factor + uint64_t slope_scale; + uint32_t slope_scale_index; + for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) { + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + if (((slope * slope_scale) <= 32767.0) && ((slope * slope_scale) >= -32768.0)) + break; + } + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + ptr_segment[i].slope = FLOAT_TO_INT16(slope * slope_scale); + ptr_segment[i].xBase = ptr_segment[i].xBase | slope_scale_index; + } + ptr_segment[i].yBase = FLOAT_TO_INT16(floatval * scale_out); + gnalog() << (static_cast((ptr_segment[i].xBase & XBASEMASK))/scale_out) + << " " + << (static_cast((ptr_segment[i].yBase))/scale_out) + << " " + << (slope/scale_out) + << "\n"; + } + } + break; + case kActRelu: + std::cerr << "Rectilinear activation function design not yet implemented!" 
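// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Reference semantics for reading a table built as above: each segment is
// approximately y = yBase + slope * (x - xBase) / 2^(8*(1+index)), with the index
// recovered from the 2 LSBs of xBase. This is only a host-side mental model (the
// real evaluation happens in the GNA library/hardware); PwlSeg and eval_pwl are
// names invented for the example.
#include <cstdint>
#include <vector>

struct PwlSeg { int32_t xBase; int16_t yBase; int16_t slope; };

inline int32_t eval_pwl(const std::vector<PwlSeg>& segs, int32_t x) {
    // Segments are sorted by xBase; pick the last one whose base is <= x.
    std::size_t i = 0;
    while (i + 1 < segs.size() && (segs[i + 1].xBase & ~3) <= x) ++i;
    const int32_t  xb    = segs[i].xBase & ~3;                 // clear scale-index bits
    const uint32_t idx   = static_cast<uint32_t>(segs[i].xBase) & 3u;
    const int64_t  scale = 1ll << (8 * (1 + idx));             // slope was pre-multiplied by this
    return segs[i].yBase +
           static_cast<int32_t>((static_cast<int64_t>(x) - xb) * segs[i].slope / scale);
}
// --------------------------------------------------------------------------------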
<< std::endl; + throw -1; + break; + case kActIdentity: + case kActKaldiLstmClipping: // clipping of IDENTITY is more aggressive than Kaldi + { + float slope = 0.0; + int64_t x_lower_limit = static_cast((INT16_MIN / scale_out) * scale_in - 0.5); + int64_t x_upper_limit = static_cast((INT16_MAX / scale_out) * scale_in + 0.5); + int16_t y_lower_limit = INT16_MIN; + int16_t y_upper_limit = INT16_MAX; + if (activation_type == kActKaldiLstmClipping) + gnalog() << "=========================== Clipping Segments ===========================\n"; + else + gnalog() << "=========================== Identity Segments ===========================\n"; + if (x_lower_limit < INT32_MIN) { + std::cerr << "Warning: saturation in PwlDesign16! " << x_lower_limit << " < INT32_MIN"<< std::endl; + x_lower_limit = INT32_MIN; + y_lower_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MIN) - 0.5); + } + if (x_upper_limit > INT32_MAX) { + std::cerr << "Warning: saturation in PwlDesign16! " << x_upper_limit << " > INT32_MAX"<< std::endl; + x_upper_limit = INT32_MAX; + y_upper_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MAX) + 0.5); + } + slope = + static_cast(static_cast(y_upper_limit) - static_cast(y_lower_limit)) / + static_cast(static_cast(x_upper_limit) - static_cast(x_lower_limit)); + ptr_segment[0].xBase = static_cast(INT32_MIN & XBASEMASK); // zero out the 2 lsb + ptr_segment[0].yBase = y_lower_limit; + ptr_segment[0].slope = 0; + + gnalog() << ptr_segment[0].xBase / scale_in + << " " << ptr_segment[0].yBase / scale_out + << " " << 0 + << "\n"; + + ptr_segment[1].xBase = static_cast(x_lower_limit & XBASEMASK); + ptr_segment[1].yBase = y_lower_limit; + { + // find best scale factor + uint64_t slope_scale = 0; + uint32_t slope_scale_index = 0; + for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) { + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + if (((slope * slope_scale) <= std::numeric_limits::max()) && + ((slope * slope_scale) >= std::numeric_limits::min())) + break; + } + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + ptr_segment[1].slope = FLOAT_TO_INT16(slope * slope_scale); + ptr_segment[1].xBase = ptr_segment[1].xBase | slope_scale_index; + } + ptr_segment[2].xBase = static_cast(x_upper_limit & XBASEMASK); + ptr_segment[2].yBase = y_upper_limit; + ptr_segment[2].slope = 0; + } + break; + default: + fprintf(stderr, "Activation function design for %s not yet implemented!\n", intel_dnn_activation_name[activation_type]); + throw -1; + } +} diff --git a/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp new file mode 100644 index 00000000000000..6c42d9255bec58 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp @@ -0,0 +1,488 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include +#include "ie_layers.h" +#include "quantized_layer_params.hpp" +#include "quantization.h" +#include "details/caseless.hpp" +#include "graph_tools.hpp" +#include "blob_factory.hpp" +#include "precision_ex.hpp" +#include "pwl.h" +#include "gna_layer_info.hpp" + +namespace GNAPluginNS { +namespace details { + +/** + * @brief description of quantisation precision + * @tparam Ip - input precision + * @tparam Wp - weights precision + * @tparam Bp - biases precision + * @tparam Np - network precision - can be auto generated in future + */ +template +struct QuantDescTmpl { 
+ using WeightsPrecision = Wp; + using BiasesPrecision = Bp; + + InferenceEngine::TPrecision _Ip; + InferenceEngine::TPrecision _Op; + InferenceEngine::TPrecision _Wp; + InferenceEngine::TPrecision _Bp; + InferenceEngine::TPrecision _Np; + + QuantDescTmpl() = default; + QuantDescTmpl(InferenceEngine::TPrecision _Ip, + InferenceEngine::TPrecision _Op, + InferenceEngine::TPrecision _Wp, + InferenceEngine::TPrecision _Bp, + InferenceEngine::TPrecision _Np) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp), _Np(_Np) { + } + + InferenceEngine::Precision getInputPrecision() const { + return _Ip; + } + InferenceEngine::Precision getWeightsPrecision() const { + return _Wp; + } + InferenceEngine::Precision getBiasesPrecision() const { + return _Bp; + } + InferenceEngine::Precision getNetPrecision() const { + return _Np; + } + InferenceEngine::Precision getOutputPrecision() const { + return _Op; + } +}; + +#define P_TYPE(X)\ +typename InferenceEngine::PrecisionTrait::value_type + +#define PRECISION_TYPE(A, B, C, D, E)\ + P_TYPE(A), P_TYPE(B), P_TYPE(C), P_TYPE(D), P_TYPE(E) + + +struct QuantI16 : public QuantDescTmpl { + QuantI16() { + _Np = InferenceEngine::Precision::MIXED; + } +}; +struct QuantI8 : public QuantDescTmpl { + QuantI8() { + _Np = InferenceEngine::Precision::MIXED; + } +}; + +template +struct QuantPair { + using MandatoryType = A; + using OptionalType = B; + static A mandatory () { return A();} + static B optional () { return B();} +}; + +/** + * @brief should allocated blob for specific data type, in case of src blob is nullptr + * @tparam T + * @return + */ +template +inline bool shouldAlwaysAllocate() { + return false; +} + +template <> +inline bool shouldAlwaysAllocate() { + return true; +} + + +#undef P_TYPE +#undef PRECISION_TYPE + +/** + * @brief designate actual data quantisation functions trait + */ +template +class Quant { + public: + template + void operator()(Args && ... args) const { } +}; + +template<> +class Quant { + public: + template + void operator()(Args && ... args) const { + QuantizeAffine16(std::forward(args)...); + } +}; + +template<> +class Quant { + public: + template + void operator()(Args && ... 
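// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The Quant<Desc> functor above is a compile-time dispatcher: the quantisation
// descriptor selects the specialisation, which perfectly forwards its arguments to
// the precision-specific routine (QuantizeAffine16 or QuantizeAffine8). The toy
// types and FakeQuantize16/8 below are stand-ins used only to keep this example
// self-contained.
#include <cstdio>
#include <utility>

struct Int16Desc {};
struct Int8Desc  {};

inline void FakeQuantize16(int v) { std::printf("16-bit path: %d\n", v); }
inline void FakeQuantize8 (int v) { std::printf("8-bit path:  %d\n", v); }

template <class Desc>
struct QuantFn;                                  // primary template intentionally undefined

template <>
struct QuantFn<Int16Desc> {
    template <class... Args>
    void operator()(Args&&... args) const { FakeQuantize16(std::forward<Args>(args)...); }
};

template <>
struct QuantFn<Int8Desc> {
    template <class... Args>
    void operator()(Args&&... args) const { FakeQuantize8(std::forward<Args>(args)...); }
};

// Usage: QuantFn<Int16Desc>{}(42);  // resolves to the 16-bit routine at compile time
// --------------------------------------------------------------------------------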
args) const { + QuantizeAffine8(std::forward(args)...); + } +}; + +template +inline void quantizeWeightsBiases(const QuantDesc & quantDesc, + InferenceEngine::WeightableLayer *wl, + const QuantFunc &fnc, + bool isDiagonal = false) { // for diagonal layer number of weights and biases significatly smaller + // for quantized weights + auto intWeights = + make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({wl->_weights->size()})); + intWeights->allocate(); + if (intWeights->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy weights for layer :"<< wl->name << " of size" << intWeights->byteSize(); + } + + + auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) { + if (wl->_biases) { + return wl->_biases->size(); + } + // calculating biases len using weight dims + auto & dims = wl->outData.front()->getDims(); + return dims[1]; + }; + + using BiasesPrecision = typename QuantDesc::BiasesPrecision; + auto biasMaker = [&] () { + InferenceEngine::Blob::Ptr zero; + if (!wl->_biases && !shouldAlwaysAllocate()) { + return zero; + } + auto bias = make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({ + getBiasSizeForLayer(wl) + })); + bias->allocate(); + if (bias->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy bias for layer :"<< wl->name <<"of size" << bias->byteSize(); + } + + memset(bias->buffer(), 0, bias->byteSize()); + + return bias; + }; + auto intBiases = biasMaker(); + + float input_scale_factor = 1.f; + if (InferenceEngine::CNNNetHasPrevLayer(wl)) { + auto quantDataForInputLayer = + InferenceEngine::getInjectedData(*InferenceEngine::CNNNetPrevLayer(wl).get()); + input_scale_factor = quantDataForInputLayer->_dst_quant.scale; + if (std::isnan(input_scale_factor) || + std::isinf(input_scale_factor)) { + THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor; + } + } + if (wl->outData[0]->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported output dims size for " << wl->name <<", should be > 1, but " << wl->outData[0]->getDims().size(); + } + if (wl->insData[0].lock().get()->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported input dims size for " << wl->name << ", should be > 1, but " << wl->insData[0].lock().get()->getDims().size(); + } + uint32_t num_rows = isDiagonal ? 1 : wl->outData[0]->getDims()[1]; + uint32_t num_columns = wl->insData[0].lock().get()->getDims()[1]; + + if (isDiagonal) { + std::swap(num_rows, num_columns); + } + + uint32_t num_rows_padded = num_rows; + uint32_t num_columns_padded = num_columns; + + // TODO: replace this into fixed scale quantizer then + + auto quantData = InferenceEngine::getInjectedData(*wl); + { + fnc(wl->_weights->buffer().as(), + wl->_biases ? wl->_biases->buffer().as() : nullptr, + intWeights->buffer(), + intBiases ? 
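// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Geometry choice made by quantizeWeightsBiases above: for a dense affine layer
// rows = output channels and columns = input channels, while a "diagonal"
// (element-wise) layer has one weight per channel, modelled as a single row that
// is then swapped. WeightGeom/affine_geometry are names invented for the example.
#include <cstdint>
#include <utility>

struct WeightGeom { uint32_t rows; uint32_t cols; };

inline WeightGeom affine_geometry(uint32_t out_ch, uint32_t in_ch, bool is_diagonal) {
    uint32_t rows = is_diagonal ? 1u : out_ch;
    uint32_t cols = in_ch;
    if (is_diagonal) std::swap(rows, cols);      // mirrors the swap in quantizeWeightsBiases
    return { rows, cols };
}
// --------------------------------------------------------------------------------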
intBiases->buffer() : static_cast(nullptr), + input_scale_factor, + &quantData->_weights_quant.scale, + &quantData->_dst_quant.scale, + num_rows, + num_columns, + num_rows_padded, + num_columns_padded); + } + wl->_weights = intWeights; + wl->_biases = intBiases; + + /** + * correcting precision for outdata + */ + wl->precision = quantDesc.getWeightsPrecision(); + for (auto &&outData : wl->outData) { + outData->setPrecision(quantDesc.getOutputPrecision()); + } +} + + +template +inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc, + InferenceEngine::WeightableLayer *conv, + const QuantFunc &fnc) { + // for quantized weights + auto intWeights = make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({conv->_weights->size()})); + intWeights->allocate(); + if (intWeights->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy weights for layer :"<< conv->name << " of size" << intWeights->byteSize(); + } + + + auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) { + if (wl->_biases) { + return wl->_biases->size(); + } + // calculating biases len using weight dims + auto & dims = wl->outData.front()->getDims(); + return dims[1]; + }; + + using BiasesPrecision = typename QuantDesc::BiasesPrecision; + auto biasMaker = [&] () { + InferenceEngine::Blob::Ptr zero; + if (!conv->_biases && !shouldAlwaysAllocate()) { + return zero; + } + auto bias = make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({ + getBiasSizeForLayer(conv) + })); + bias->allocate(); + if (bias->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy bias for layer :"<< conv->name <<"of size" << bias->byteSize(); + } + memset(bias->buffer(), 0, bias->byteSize()); + + return bias; + }; + auto intBiases = biasMaker(); + + float input_scale_factor = 1.f; + if (InferenceEngine::CNNNetHasPrevLayer(conv)) { + auto quantDataForInputLayer = + InferenceEngine::getInjectedData(*InferenceEngine::CNNNetPrevLayer(conv).get()); + input_scale_factor = quantDataForInputLayer->_dst_quant.scale; + if (std::isnan(input_scale_factor) || + std::isinf(input_scale_factor)) { + THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor; + } + } + if (conv->outData[0]->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported output dims size for " << conv->name <<", should be > 1, but " << conv->outData[0]->getDims().size(); + } + if (conv->insData[0].lock().get()->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported input dims size for " << conv->name << ", should be > 1, but " << conv->insData[0].lock().get()->getDims().size(); + } + auto inputData = conv->insData[0].lock(); + + uint32_t num_rows = getBiasSizeForLayer(conv); + uint32_t num_columns = conv->_weights->size() / num_rows; + + uint32_t num_rows_padded = num_rows; + uint32_t num_columns_padded = num_columns; + + // TODO: replace this into fixed scale quantizer then + + auto quantData = InferenceEngine::getInjectedData(*conv); + { + fnc(conv->_weights->buffer().as(), + conv->_biases ? conv->_biases->buffer().as() : nullptr, + intWeights->buffer(), + intBiases ? 
intBiases->buffer() : static_cast(nullptr), + input_scale_factor, + &quantData->_weights_quant.scale, + &quantData->_dst_quant.scale, + num_rows, + num_columns, + num_rows_padded, + num_columns_padded); + } + conv->_weights = intWeights; + conv->_biases = intBiases; + + /** + * correcting precision for outdata + */ + conv->precision = quantDesc.getWeightsPrecision(); + for (auto &&outData : conv->outData) { + outData->setPrecision(quantDesc.getOutputPrecision()); + } +} + + +class DataQuantizerBase { + public: + explicit DataQuantizerBase(float scaleFactor) : scaleFactor(scaleFactor) { + } + protected: + float scaleFactor = 1.0; +}; +/** + * Helper class to use partial specialisation of Layer type + * @tparam Desc + * @tparam Layer + */ +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(Layer cnnLayer) const { + return false; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + + bool operator()(InferenceEngine::CNNLayer *cnnLayer) const { + for (auto &&outData : cnnLayer->outData) { + outData->setPrecision(Desc::mandatory().getOutputPrecision()); + } + // set scale factor for input layers + auto quantData = InferenceEngine::getInjectedData(*cnnLayer); + if (cnnLayer->insData.empty()) { + for (auto &&outData : cnnLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + } else { + if (LayerInfo(*cnnLayer).isActivation() || + LayerInfo(*cnnLayer).isCopy()) { + // precision of activation layers is always equal input precision + for (auto &&outData : cnnLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + } + } + cnnLayer->precision = Desc::mandatory().getInputPrecision(); + + return true; + } +}; + + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::SplitLayer *splitLayer) const { + base::operator()(splitLayer); + // split layer doesnt change it's data at all + for (auto &&outData : splitLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::ConcatLayer *concatLayer) const { + base::operator()(concatLayer); + for (auto &&outData : concatLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::CropLayer *cropLayer) const { + base::operator()(cropLayer); + for (auto &&outData : cropLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::ReshapeLayer *reshapeLayer) const { + base::operator()(reshapeLayer); + // reshape layer doesnt change it's data at all + for (auto &&outData : reshapeLayer->outData) { + 
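// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The DataQuantizer machinery above uses partial specialisation as a per-layer-type
// dispatch table: the generic functor reports "not handled", while specialisations
// for concrete layer types implement the precision fix-ups. The toy layer types and
// Handler template below only demonstrate the pattern.
#include <iostream>

struct AnyLayer   {};
struct SplitLayer {};

template <class Desc, class Layer>
struct Handler {
    bool operator()(Layer*) const { return false; }         // default: layer not handled
};

template <class Desc>
struct Handler<Desc, SplitLayer> {
    bool operator()(SplitLayer*) const {                     // split: precision pass-through
        std::cout << "split layer keeps input precision\n";
        return true;
    }
};

// Usage: SplitLayer s; bool handled = Handler<int, SplitLayer>{}(&s);  // -> true
// --------------------------------------------------------------------------------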
outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(InferenceEngine::WeightableLayer *wl) const { + quantizeWeightsBiases(Desc::mandatory(), wl, Quant()); + return true; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(InferenceEngine::WeightableLayer *wl) const { + quantizeWeightsBiasesConv(Desc::optional(), wl, Quant()); + return true; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(InferenceEngine::ScaleShiftLayer *wl) const { + quantizeWeightsBiases(Desc::optional(), wl, Quant(), true); + return true; + } +}; + +} // namespace details + +template +class LayersQuantizer : public details::DataQuantizerBase { + public: + explicit LayersQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + template + bool operator()(T input) const { + return details::DataQuantizer(scaleFactor)(input); + } +}; + +using QuantI16 = details::QuantPair; +using QuantI8 = details::QuantPair; + +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp new file mode 100644 index 00000000000000..797c87c9c71818 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#pragma once +#include +#include "gna_plugin_config.hpp" +#include "layer_transform.hpp" +#include "graph_tools.hpp" +#include "details/ie_cnn_network_tools.h" +#include "layer_quantizer.hpp" +#include "scale_factor_calc.hpp" + +namespace GNAPluginNS { +/** + * Quantize entire cnn - network + * @tparam T - type trait for weights and biases + */ +template +class ModelQuantizer { + public: + CNNNetworkPtr quantize(InferenceEngine::ICNNNetwork &model, float scaleFactor) const { + return quantize(model, [](InferenceEngine::CNNNetPtr &){}, scaleFactor); + } + + template + CNNNetworkPtr quantize(InferenceEngine::ICNNNetwork &model, const PreQuantisationCb &cb, float scaleFactor) const { + auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { + return InferenceEngine::injectData(lp); + }; + auto copiedNet = InferenceEngine::CNNNetCopy(model, visitor); + + // TODO: probably not the best way of using dynamic cast in order to transform Precision + // one of solution is to create not copyNet overloads, that accepts 2 functors, one for layer copy + // and another one for net copy + auto rawNet = dynamic_cast(copiedNet.get()); + rawNet->setPrecision(T::mandatory().getNetPrecision()); + + // allow client code to access copied topology, to avoid copies if user would like to chain quantisation with + // another preprocessing + cb(copiedNet); + + LayersQuantizer lc(scaleFactor); + auto sortedNewNet = InferenceEngine::details::CNNNetSortTopologically(*copiedNet.get()); + gnalog() << "Sorted layers: " << std::endl; + for (auto &&layer : sortedNewNet) { + gnalog() << layer->name << std::endl; + } + + // weights scale is a hint, not all weightable layer preserve it in all possible precisions + propagateScaleFactor(sortedNewNet, 
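// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Simplified model of the propagate-and-restart strategy used by
// propagateScaleFactor / ScaleFactorCalculator: walk the topologically sorted
// layers, and whenever a downstream layer has to change an upstream output scale,
// restart the walk from that upstream layer. ToyLayer, visit and propagate are
// invented names; the real code works on CNNLayerPtr and transformLayer.
#include <cstddef>
#include <vector>

struct ToyLayer { float out_scale = 1.f; };

// visit() returns the index to restart from, or -1 when the layer is already consistent.
inline void propagate(std::vector<ToyLayer>& layers,
                      int (*visit)(std::vector<ToyLayer>&, std::size_t)) {
    std::size_t i = 0;
    while (i < layers.size()) {
        const int restart = visit(layers, i);
        i = (restart >= 0) ? static_cast<std::size_t>(restart) : i + 1;
    }
}
// --------------------------------------------------------------------------------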
T::mandatory().getWeightsPrecision().size(), scaleFactor); + + // sorted order gives possibility for propagate quantisation along depended layers + for (auto &&layer : sortedNewNet) { + transformLayer(layer, lc); + } + + return copiedNet; + } + + private : + void propagateScaleFactor(std::vector & net, int weightsBytesSize, float scaleFactor) const { + ScaleFactorCalculator sf(net, weightsBytesSize, scaleFactor); + + while (!sf.allLayersProcessed()) { + for (auto &&layer : sf.getStartLayers()) { + transformLayer(layer, sf); + // transforming until we reached cases where output scale updated due to situation in downstream layer + if (sf.needToRestart()) { + break; + } + } + } + } +}; +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/quantization/precision_ex.hpp b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp new file mode 100644 index 00000000000000..798345e9821545 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_precision.hpp" + +namespace InferenceEngine { + +/** + * @brief reverse trait for getting some precision from it's underlined memory type + * this might not work for certain precisions : for Q78, U16 + * @tparam T + */ +template +struct precision_from_media { + static const Precision::ePrecision type = Precision::CUSTOM; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::FP32; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::FP16; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::I16; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::U8; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::I8; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::I32; +}; + +/** + * @brief container for storing both precision and it's underlined media type + * @tparam TMedia + */ +template +class TPrecision : public Precision { + public: + typedef TMedia MediaType; + TPrecision() : Precision(precision_from_media::type) {} + explicit TPrecision(const Precision & that) : Precision(that) {} + TPrecision & operator = (const Precision & that) { + Precision::operator=(that); + return *this; + } + explicit TPrecision(const Precision::ePrecision value) : Precision(value) {} +}; + +template TPrecision createTPrecision() { + TPrecision cnt(InferenceEngine::Precision::fromType()); + return cnt; +} + +template +TPrecision::value_type> createTPrecision() { + TPrecision::value_type> cnt(T); + return cnt; +} + + +// special case for Mixed, or undefined precisions +template <> +class TPrecision : public Precision { + public: + typedef void MediaType; + TPrecision() = default; + explicit TPrecision(const Precision & that) : Precision(that) {} + TPrecision & operator = (const Precision & that) { + Precision::operator=(that); + return *this; + } + explicit TPrecision(const Precision::ePrecision value) : Precision(value) {} +}; + + +} // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/quantization/quantization.cpp b/inference-engine/src/gna_plugin/quantization/quantization.cpp new file mode 100644 index 00000000000000..457bff9afed336 --- /dev/null +++ 
b/inference-engine/src/gna_plugin/quantization/quantization.cpp @@ -0,0 +1,699 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "quantization.h" + +void QuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float *ptr_weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + uint32_t num_saturate = 0; + + if (*ptr_weight_scale_factor == 1.0) { + // scale factor for weights is not calculated yet + float mean_weight = 0.0; + float mean_weight_squared = 0.0; + float max_weight = -1e20f; + float var_weight; + float mean_plus_2stdev; + + for (uint32_t i = 0; i < num_rows; i++) { + for (uint32_t j = 0; j < num_columns; j++) { + float weight = ptr_float_weights[i * num_columns + j]; + mean_weight += weight; + mean_weight_squared += weight * weight; + if (fabs(weight) > max_weight) { + max_weight = fabs(weight); + } + } + } + + mean_weight /= static_cast(num_rows * num_columns); + mean_weight_squared /= static_cast(num_rows * num_columns); + var_weight = mean_weight_squared - mean_weight * mean_weight; + mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight)); + + *ptr_weight_scale_factor = static_cast(MAX_VAL_2B_WEIGHT) / max_weight; + *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor; + } + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (value > 32767.0) { + *ptr_weight_16 = 32767; + num_saturate++; + } else if (value < -32768.0) { + *ptr_weight_16 = -32768; + num_saturate++; + } else { + *ptr_weight_16 = (int16_t) value; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_16 = 0; + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_16 = 0; + } + } + + // case for element wise layer + if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) { + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
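// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The scalar rule QuantizeAffine16 applies to every weight above: scale, round half
// away from zero, then saturate to int16. When the weight scale is not supplied,
// it is derived as MAX_VAL_2B_WEIGHT (16384) divided by the largest weight
// magnitude. quantize_q15 is a name invented for this example.
#include <cstdint>

inline int16_t quantize_q15(float w, float scale, uint32_t* saturations = nullptr) {
    const float v = w * scale + (w > 0 ? 0.5f : -0.5f);      // round half away from zero
    if (v > 32767.f)  { if (saturations) ++*saturations; return 32767;  }
    if (v < -32768.f) { if (saturations) ++*saturations; return -32768; }
    return static_cast<int16_t>(v);
}

// Example: with scale = 16384 / max|w|, the largest-magnitude weight maps to +/-16384.
// --------------------------------------------------------------------------------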
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j] = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j] = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j] = (int32_t) value; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + ptr_int_biases[j] = 0; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine16()\n", + num_saturate, + num_rows * num_columns + num_rows); + } +} + +void FixedQuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + uint32_t num_saturate = 0; + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (value > 32767.0) { + *ptr_weight_16 = 32767; + num_saturate++; + } else if (value < -32768.0) { + *ptr_weight_16 = -32768; + num_saturate++; + } else { + *ptr_weight_16 = (int16_t) value; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_16 = 0; + } + } + + *ptr_output_scale_factor = input_scale_factor * weight_scale_factor; + + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j] = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j] = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j] = (int32_t) value; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + ptr_int_biases[j] = 0; + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in FixedQuantizeAffine16()\n", + num_saturate, + num_rows * num_columns + num_rows); + } +} + +float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) { + float *ptr_float_feat = reinterpret_cast(ptr_float_memory); + float max = 0.0; + float scale_factor; + + for (size_t i = 0; i < num_elements; i++) { + if (fabs(ptr_float_feat[i]) > max) { + max = fabs(ptr_float_feat[i]); + } + } + + if (max == 0) { + scale_factor = 1.0; + } else { + scale_factor = target_max / max; + } + + return (scale_factor); +} + +float ScaleFactorForQuantization(std::vector> &input_vectors, float target_max) { + float max = 0.0; + float scale_factor; + uint32_t num_vectors = (uint32_t) input_vectors.size(); + + for (uint32_t i = 0; i < num_vectors; i++) { + float *ptr_float_feat = input_vectors[i].data(); + uint32_t num_elements = (uint32_t) input_vectors[i].size(); + for (uint32_t j = 0; i < num_elements; i++) { + if (fabs(ptr_float_feat[j]) > max) { + max = fabs(ptr_float_feat[j]); + } + } + } + + if (max == 0) { + scale_factor = 1.0; + } else { + scale_factor = target_max / max; + } + + return (scale_factor); +} + +float ScaleFactorForQuantization(std::vector> &input_vectors, + int index, + int num_group_size, + float target_max) { + float max = 0.0; + float scale_factor; + uint32_t start_index = (uint32_t) index; + uint32_t end_index = + (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index + + num_group_size); + + for (uint32_t i = start_index; i < end_index; i++) { + float *ptr_float_feat = input_vectors[i].data(); + uint32_t num_elements = (uint32_t) input_vectors[i].size(); + for (uint32_t j = 0; j < num_elements; j++) { + if (fabs(ptr_float_feat[j]) > max) { + max = fabs(ptr_float_feat[j]); + } + } + } + + if (max == 0) { + scale_factor = 1.0; + } else { + scale_factor = target_max / max; + } + + return (scale_factor); +} + +void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor) { + float *ptr_float_feat = reinterpret_cast(ptr_float_memory); + uint32_t num_saturate = 0; + + int16_t *ptr_int_feat = reinterpret_cast(ptr_int_memory); + for (uint32_t i = 0; i < num_elements; i++) { + float rounding_value = (ptr_float_feat[i] > 0) ? 
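// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The rule shared by all ScaleFactorForQuantization overloads above: the scale
// factor maps the largest observed magnitude onto target_max (e.g. 16384 for int16
// features), and an all-zero input falls back to 1.0. scale_for is an invented
// name for a flat-array version of the same computation.
#include <algorithm>
#include <cmath>
#include <cstddef>

inline float scale_for(const float* data, std::size_t n, float target_max) {
    float max_abs = 0.f;
    for (std::size_t i = 0; i < n; ++i)
        max_abs = std::max(max_abs, std::fabs(data[i]));
    return (max_abs == 0.f) ? 1.f : target_max / max_abs;
}
// --------------------------------------------------------------------------------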
0.5f : -0.5f; + float value = ptr_float_feat[i] * scale_factor + rounding_value; + if (value > 32767.0) { + ptr_int_feat[i] = 32767; + num_saturate++; + } else if (value < -32768.0) { + ptr_int_feat[i] = -32768; + num_saturate++; + } else { + ptr_int_feat[i] = (int16_t) value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations during QuantizeVector16()\n", num_saturate, num_elements); + } +} + +void QuantizeVector16(std::vector> &input_vectors, + int16_t *ptr_int_memory, + uint32_t index, + uint32_t num_group_size, + float scale_factor) { + int16_t *ptr_int_feat = reinterpret_cast (ptr_int_memory); + uint32_t num_saturate = 0; + uint32_t num_elements = (uint32_t) input_vectors[0].size(); // assume all vector are same size + uint32_t start_index = (uint32_t) index; + uint32_t end_index = + (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index + + num_group_size); + + if (end_index - start_index < num_group_size) { + memset(ptr_int_feat, 0, num_elements * num_group_size * sizeof(int16_t)); // for zero padding partial group + } + for (uint32_t j = start_index; j < end_index; j++) { + for (uint32_t i = 0; i < num_elements; i++) { + float *ptr_float_feat = input_vectors[j].data(); + float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f; + float value = ptr_float_feat[i] * scale_factor + rounding_value; + if (value > 32767.0) { + ptr_int_feat[i * num_group_size + j - start_index] = 32767; + num_saturate++; + } else if (value < -32768.0) { + ptr_int_feat[i * num_group_size + j - start_index] = -32768; + num_saturate++; + } else { + ptr_int_feat[i * num_group_size + j - start_index] = (int16_t) value; + } + } + } + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations during QuantizeVector16()\n", + num_saturate, + num_elements * num_group_size); + } +} + +void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor) { + uint32_t num_saturate = 0; + + int16_t *ptr_int_feat = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < num_elements; i++) { + float float_value = ptr_int_feat[i] / prev_scale_factor; + float rounding_value = (float_value > 0) ? 0.5f : -0.5f; + float value = float_value * scale_factor + rounding_value; + if (value > 32767.0) { + ptr_int_feat[i] = 32767; + num_saturate++; + } else if (value < -32768.0) { + ptr_int_feat[i] = -32768; + num_saturate++; + } else { + ptr_int_feat[i] = (int16_t) value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations during ReQuantizeVector16()\n", num_saturate, num_elements); + } +} + +void QuantizeBias16(float *ptr_float_biases, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows) { + uint32_t num_saturate = 0; + + *ptr_output_scale_factor = input_scale_factor * weight_scale_factor; + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
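// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// ReQuantizeVector16 above rescales values that are already stored as int16: divide
// by the previous scale to get back to real units, then quantise again under the
// new scale with the same rounding and saturation rules. requantize_q15 is an
// invented single-value version of that loop.
#include <cstdint>

inline int16_t requantize_q15(int16_t v, float prev_scale, float new_scale) {
    const float f      = v / prev_scale;                               // back to real units
    const float scaled = f * new_scale + (f > 0 ? 0.5f : -0.5f);       // round half away from zero
    if (scaled > 32767.f)  return 32767;
    if (scaled < -32768.f) return -32768;
    return static_cast<int16_t>(scaled);
}
// --------------------------------------------------------------------------------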
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j] = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j] = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j] = (int32_t) value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeBias16()\n", num_saturate, num_rows); + } +} + +void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector &float_vector, float scale_factor) { + int16_t *int16_vector = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < float_vector.size(); i++) { + float_vector[i] = int16_vector[i] / scale_factor; + } +} + +void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector &float_vector, float scale_factor) { + int32_t *int32_vector = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < float_vector.size(); i++) { + float_vector[i] = int32_vector[i] / scale_factor; + } +} + +void DeQuantizeVector32(int32_t *ptr_int_memory, + std::vector &float_vector, + uint32_t index, + uint32_t num_group_size, + float scale_factor) { + int32_t *int32_vector = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < float_vector.size(); i++) { + float_vector[i] = int32_vector[i * num_group_size + index] / scale_factor; + } +} +bool IntegrityCheckAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float weight_scale_factor, + float output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + bool model_ok = true; + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + int16_t int_value; + if (value > 32767.0) { + int_value = 32767; + } else if (value < -32768.0) { + int_value = -32768; + } else { + int_value = (int16_t) value; + } + if (int_value != *ptr_weight_16) { + model_ok = false; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
0.5f : -0.5f; + float value = ptr_float_biases[j] * output_scale_factor + rounding_value; + int32_t int_value; + if (value > 2147483647.0) { + int_value = 2147483647L; + } else if (value < -2147483648.0) { + int_value = -2147483648LL; + } else { + int_value = (int32_t) value; + } + if (int_value != ptr_int_biases[j]) { + model_ok = false; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + if (ptr_int_biases[j] != 0) { + model_ok = false; + } + } + + return (model_ok); +} + +bool IntegrityCheckAffineWeights16(float *ptr_float_weights, + int16_t *ptr_int_weights, + float weight_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + bool model_ok = true; + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + int16_t int_value; + if (value > 32767.0) { + int_value = 32767; + } else if (value < -32768.0) { + int_value = -32768; + } else { + int_value = (int16_t) value; + } + if (int_value != *ptr_weight_16) { + model_ok = false; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + + return (model_ok); +} + + +void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, + int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float input_scale_factor, float *ptr_weight_scale_factor, + float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns, + uint32_t num_rows_padded, uint32_t num_columns_padded) { + uint32_t num_saturate = 0; + + if (*ptr_weight_scale_factor == 1.0) { + // scale factor for weights is not calculated yet + float mean_weight = 0.0; + float mean_weight_squared = 0.0; + float max_weight = -1e20f; + float var_weight; + float mean_plus_2stdev; + + for (uint32_t i = 0; i < num_rows; i++) { + for (uint32_t j = 0; j < num_columns; j++) { + float weight = ptr_float_weights[i*num_columns + j]; + mean_weight += weight; + mean_weight_squared += weight * weight; + if (fabs(weight) > max_weight) { + max_weight = fabs(weight); + } + } + } + + mean_weight /= static_cast(num_rows * num_columns); + mean_weight_squared /= static_cast(num_rows * num_columns); + var_weight = mean_weight_squared - mean_weight * mean_weight; + mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight)); + + *ptr_weight_scale_factor = static_cast(MAX_VAL_1B_WEIGHT) / max_weight; + + // For 8 bit weights quantize as follows: + // 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier + // 2. find maximum scaled weight for each row + // 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range + // 4. 
quantize and store scaled row + *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor; // increase dynamic range by max multiplier + *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor; + } + float valueAcc = 0.0; + for (uint32_t row = 0; row < num_rows; row++) { + float scaled_row_max = 0; + float rounding_value, value; + for (uint32_t col = 0; col < num_columns; col++) { + value = ptr_float_weights[row*num_columns + col] * *ptr_weight_scale_factor; + valueAcc += value; + if (fabs(value) > scaled_row_max) { + scaled_row_max = fabs(value); + } + } + + value = scaled_row_max / static_cast(MAX_VAL_1B_WEIGHT); + ptr_int_biases[row].multiplier = (uint8_t) (value + 0.5); + for (uint32_t col = 0; col < num_columns; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + + + value = ptr_float_weights[row*num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value; + if (value > 127.0) { + *ptr_weight_8 = 127; + num_saturate++; + } else if (value < -128.0) { + *ptr_weight_8 = -128; + num_saturate++; + } else { + *ptr_weight_8 = (int8_t)value; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + *ptr_weight_8 = 0; + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + *ptr_weight_8 = 0; + } + ptr_int_biases[row].multiplier = 0; + } + + // bias value of the bas will be only used when input bias provided + if (ptr_float_biases != nullptr) { + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j].bias = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j].bias = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j].bias = (int32_t) value; + } + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows); + } +} + + +void QuantizeBias8(float *ptr_float_biases, + intel_compound_bias_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, uint32_t num_rows) { + uint32_t num_saturate = 0; + + *ptr_output_scale_factor = input_scale_factor * weight_scale_factor; + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
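// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The per-row 8-bit scheme used by QuantizeAffine8 above: the global weight scale
// is first inflated by MAX_OUT_MULTIPLIER, then each row receives an output
// multiplier that brings its scaled maximum back into int8 range, and the row is
// quantised with scale / multiplier. RowQuant stands in for the multiplier field of
// intel_compound_bias_t; the zero-multiplier guard is an addition for this sketch.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

struct RowQuant { uint8_t multiplier; std::vector<int8_t> weights; };

inline RowQuant quantize_row_q7(const float* row, std::size_t n, float weight_scale) {
    float row_max = 0.f;
    for (std::size_t i = 0; i < n; ++i)
        row_max = std::max(row_max, std::fabs(row[i] * weight_scale));
    RowQuant out;
    out.multiplier = static_cast<uint8_t>(row_max / 127.f + 0.5f);   // MAX_VAL_1B_WEIGHT = 127
    if (out.multiplier == 0) out.multiplier = 1;                     // guard for near-zero rows
    out.weights.resize(n);
    for (std::size_t i = 0; i < n; ++i) {
        float v = row[i] * (weight_scale / out.multiplier) + (row[i] > 0 ? 0.5f : -0.5f);
        out.weights[i] = static_cast<int8_t>(std::max(-128.f, std::min(127.f, v)));
    }
    return out;
}
// --------------------------------------------------------------------------------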
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j].bias = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j].bias = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j].bias = (int32_t)value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeBias8()\n", num_saturate, num_rows); + } +} + +bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns, + uint32_t num_rows_padded, uint32_t num_columns_padded) { + bool model_ok = true; + + for (uint32_t row = 0; row < num_rows; row++) { + float scaled_row_max = 0; + float rounding_value, value; + for (uint32_t col = 0; col < num_columns; col++) { + value = ptr_float_weights[row*num_columns + col] * weight_scale_factor; + if (fabs(value) > scaled_row_max) { + scaled_row_max = fabs(value); + } + } + value = scaled_row_max / static_cast(MAX_VAL_1B_WEIGHT); + if (ptr_int_biases[row].multiplier != (uint8_t)(value + 0.5)) { + model_ok = false; + } + for (uint32_t col = 0; col < num_columns; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + int8_t int_value; + rounding_value = (ptr_float_weights[row*num_columns + col] > 0) ? 0.5f : -0.5f; + value = ptr_float_weights[row*num_columns + col] * (weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value; + if (value > 127.0) { + int_value = 127; + } else if (value < -128.0) { + int_value = -128; + } else { + int_value = (int8_t)value; + } + if (int_value != *ptr_weight_8) { + model_ok = false; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + if (*ptr_weight_8 != 0) { + model_ok = false; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + if (*ptr_weight_8 != 0) { + model_ok = false; + } + } + if (ptr_int_biases[row].multiplier != 0) { + model_ok = false; + } + } + + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f; + float value = ptr_float_biases[j] * output_scale_factor + rounding_value; + int32_t int_value; + if (value > 2147483647.0) { + int_value = 2147483647L; + } else if (value < -2147483648.0) { + int_value = -2147483648LL; + } else { + int_value = (int32_t)value; + } + if (int_value != ptr_int_biases[j].bias) { + model_ok = false; + } + } + + return(model_ok); +} + diff --git a/inference-engine/src/gna_plugin/quantization/quantization.h b/inference-engine/src/gna_plugin/quantization/quantization.h new file mode 100644 index 00000000000000..bd1ff7b07146fe --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/quantization.h @@ -0,0 +1,100 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#define MAX_OUT_MULTIPLIER 230 +#define MAX_VAL_1B_WEIGHT 127 +#define MAX_VAL_2B_WEIGHT 16384 +#define MAX_VAL_2B_FEAT 16384 +#ifdef DEBUG +#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__)) +#else +#define QUANTWARNING(...) 
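// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Small worked example of how the constants declared above combine (the input
// magnitudes are made up): features scaled so max|x| -> MAX_VAL_2B_FEAT and weights
// scaled so max|w| -> MAX_VAL_2B_WEIGHT give an int32 accumulator/bias scale equal
// to the product of the two, which is exactly what the bias routines compute.
#include <cstdio>

inline void scale_example() {
    const float max_feat = 2.0f, max_weight = 0.5f;                 // assumed example values
    const float feat_scale   = 16384.f / max_feat;                  // MAX_VAL_2B_FEAT
    const float weight_scale = 16384.f / max_weight;                // MAX_VAL_2B_WEIGHT
    const float out_scale    = feat_scale * weight_scale;           // scale of int32 sums/biases
    std::printf("feat=%g weight=%g output=%g\n", feat_scale, weight_scale, out_scale);
}
// --------------------------------------------------------------------------------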
+#endif + +void QuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float *ptr_weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +void FixedQuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements); +float ScaleFactorForQuantization(std::vector> &input_vectors, float target_max); +float ScaleFactorForQuantization(std::vector> &input_vectors, + int index, + int num_group_size, + float target_max); +void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor); +void QuantizeVector16(std::vector> &input_vectors, + int16_t *ptr_int_memory, + uint32_t index, + uint32_t num_group_size, + float scale_factor); +void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor); +bool IntegrityCheckAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float weight_scale_factor, + float output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +bool IntegrityCheckAffineWeights16(float *ptr_float_weights, + int16_t *ptr_int_weights, + float weight_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +void QuantizeBias16(float *ptr_float_biases, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows); +void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector &float_vector, float scale_factor); +void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector &float_vector, float scale_factor); +void DeQuantizeVector32(int32_t *ptr_int_memory, + std::vector &float_vector, + uint32_t index, + uint32_t num_group_size, + float scale_factor); + +#include "gna-api.h" + +void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor, + uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded); +void QuantizeBias8(float *ptr_float_biases, intel_compound_bias_t *ptr_int_biases, float input_scale_factor, + float weight_scale_factor, float *ptr_output_scale_factor, uint32_t num_rows); +bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns, + uint32_t num_rows_padded, uint32_t num_columns_padded); + + diff --git a/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp new file mode 100644 index 00000000000000..347102bbb3ac39 --- /dev/null +++ 
b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +namespace GNAPluginNS { + +struct Quantization { + float scale = 1.0f; + float offset = 0.0f; + int shift = 0.0f; +}; + +struct QuantizedLayerParams { + Quantization _src_quant; + Quantization _dst_quant; + Quantization _weights_quant; + Quantization _bias_quant; + float _o_shift = 0.0f; + float _b_shift = 0.0f; +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp new file mode 100644 index 00000000000000..a3ba22c1b00713 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp @@ -0,0 +1,339 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include +#include +#include +#include "gna_layer_info.hpp" +#include "ie_layers.h" +#include "gna_plugin_log.hpp" + +namespace GNAPluginNS { +namespace details { +using namespace InferenceEngine; +struct ScaleFactorUpdateResult { + CNNLayer *restartLayer = nullptr; + ScaleFactorUpdateResult() = default; + explicit ScaleFactorUpdateResult(CNNLayer * restartlayer) : restartLayer(restartlayer) { + } + operator bool() { + return restartLayer == nullptr; + } +}; + +/** + * @brief calculates output scale factor per layer + * @tparam T + */ +template +class ScaleFactorPerLayer { + public: + /** + * @brief calculates weights scale factor for fit dynamic range into target bitsize, + * also calculates output scale factor for the given layer + * @param cnnLayer + * @param weightsSize + * @param inputScaleFactor + * @param result + * @return + */ + bool operator()(T cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + return false; + } +}; + +template<> +class ScaleFactorPerLayer { + private : + const float activation_scale_factor = 2048.f; + const float identity_scale_factor = 2049.0f; + const float k = 5; + const float k_identity = 6; + public : + bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + if ( !cnnLayer ) { + THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n"; + } + LayerInfo layerInfo(*cnnLayer); + // TODO: current approach set input scale factor for true input layer(s) equals to provided factor, + auto quant = getInjectedData(*cnnLayer); + if (InferenceEngine::details::CaselessEq()(cnnLayer->type, "Memory")) { + // for memory output layer need to verify it's input scale factor + if (CNNNetHasPrevLayer(cnnLayer)) { + auto prevLayer = CNNNetPrevLayer(cnnLayer); + auto inputQuant = getInjectedData(prevLayer); + if (inputQuant->_dst_quant.scale != activation_scale_factor) { + gnawarn() << "[WARNING] quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") " + << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : " + << activation_scale_factor << std::endl; + inputQuant->_dst_quant.scale = activation_scale_factor; + // restarting from that activation; + result = ScaleFactorUpdateResult(prevLayer.get()); + return true; + } + } + quant->_src_quant.scale = quant->_dst_quant.scale = activation_scale_factor; + return true; + } + + if (!CNNNetHasPrevLayer(cnnLayer)) { + quant->_dst_quant.scale = inputScaleFactor; + return 
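// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Condensed form of the activation handling in ScaleFactorPerLayer<CNNLayer*>
// above: activations get a fixed output scale (2048, or 2049 for identity), and
// ReLU-family activations halve it when the product with the input scale would
// exceed the int32 range. activation_out_scale is an invented helper name.
#include <cstdint>
#include <limits>

inline float activation_out_scale(bool is_identity, bool is_relu, float in_scale) {
    float out = is_identity ? 2049.f : 2048.f;
    if (is_relu &&
        static_cast<double>(out) * in_scale > std::numeric_limits<int32_t>::max() - 1.0)
        out *= 0.5f;
    return out;
}
// --------------------------------------------------------------------------------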
ScaleFactorUpdateResult(); + } + + // by default layer is pass thru its scale factor + auto inputQuant = getInjectedData(CNNNetPrevLayer(cnnLayer)); + quant->_dst_quant.scale = inputQuant->_dst_quant.scale; + quant->_src_quant.scale = inputQuant->_dst_quant.scale; + + if (layerInfo.isActivation()) { + // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights + // set the initial value + quant->_dst_quant.scale = layerInfo.isIdentity() ? identity_scale_factor:activation_scale_factor; + // if activation is one from relu family, we need to apply heuruistic to avoid activation output overflow + if (layerInfo.isRelu() && + static_cast(quant->_dst_quant.scale * quant->_src_quant.scale) + > std::numeric_limits::max()-1) { + quant->_dst_quant.scale = (quant->_dst_quant.scale * 0.5); + } + } + return true; + } +}; + +template<> +class ScaleFactorPerLayer { + public: + bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + if ( !eltwiseLayer ) { + THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n"; + } + auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0); + auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1); + + auto quantParams0 = InferenceEngine::getInjectedData(in0); + auto quantParams1 = InferenceEngine::getInjectedData(in1); + auto quantData = InferenceEngine::getInjectedData(*eltwiseLayer); + + switch (eltwiseLayer->_operation) { + case InferenceEngine::EltwiseLayer::Prod: { + quantData->_weights_quant.scale = quantParams1->_dst_quant.scale; + quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale; + break; + } + case InferenceEngine::EltwiseLayer::Sum: { + // detect which input will be used as biases + if (LayerInfo(in0).has32BOutput()) { + std::swap(in0, in1); + std::swap(quantParams0, quantParams1); + } + + // this path might result in significant data loss + quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale; + quantData->_dst_quant.scale = quantParams1->_dst_quant.scale; + + // eltwise will always work in int16 + auto maxValue = std::numeric_limits::max() - 1; + if (quantData->_weights_quant.scale > maxValue + 1) { + // rescaling it's activation input + // iterating thru previous layers of eltwise + for (uint8_t i = 0; i < 2; ++i) { + InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i); + // trick to get opposite index (for 0 -> 1 for 1 -> 0) by inversing i. + auto quantParams = + InferenceEngine::getInjectedData(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i)); + + for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) { + auto info = LayerInfo(in); + // we skipping only split layers so far, also need to work on memory layers + // this case for input from port 0 + if (info.isSplit() || info.isSlice()) { + continue; + } else if (info.has16BOutput() && info.isActivation()) { + auto newOutputScale = quantParams->_dst_quant.scale / maxValue; + if (newOutputScale > std::numeric_limits::max() / 2) { + break; + } + auto quantDataForActivation = InferenceEngine::getInjectedData(*in); + gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name + << ". 
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
+ public:
+    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !eltwiseLayer ) {
+            THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
+        }
+        auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
+        auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);
+
+        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
+        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
+        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);
+
+        switch (eltwiseLayer->_operation) {
+            case InferenceEngine::EltwiseLayer::Prod: {
+                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
+                quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
+                break;
+            }
+            case InferenceEngine::EltwiseLayer::Sum: {
+                // detect which input will be used as biases
+                if (LayerInfo(in0).has32BOutput()) {
+                    std::swap(in0, in1);
+                    std::swap(quantParams0, quantParams1);
+                }
+
+                // this path might result in significant data loss
+                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
+                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
+
+                // eltwise will always work in int16
+                auto maxValue = std::numeric_limits<int16_t>::max() - 1;
+                if (quantData->_weights_quant.scale > maxValue + 1) {
+                    // rescaling its activation input
+                    // iterating through the previous layers of the eltwise
+                    for (uint8_t i = 0; i < 2; ++i) {
+                        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
+                        // trick to get the opposite index (0 -> 1, 1 -> 0) by inverting i
+                        auto quantParams =
+                                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));
+
+                        for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
+                            auto info = LayerInfo(in);
+                            // only split layers are skipped so far; memory layers still need to be handled
+                            // this case covers input from port 0
+                            if (info.isSplit() || info.isSlice()) {
+                                continue;
+                            } else if (info.has16BOutput() && info.isActivation()) {
+                                auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
+                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
+                                    break;
+                                }
+                                auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+                                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
+                                          << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
+                                          << ", was " << quantDataForActivation->_dst_quant.scale << "\n" << std::flush;
+                                quantDataForActivation->_dst_quant.scale = newOutputScale;
+                                result = ScaleFactorUpdateResult(in.get());
+                                return true;
+                            } else if (info.has16BOutput()) {
+                                break;
+                            }
+
+                            // if we are here, it means we are on port 1
+                            if (info.isFullyConnected() || info.isConvolutional()) {
+                                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
+                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
+                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
+                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
+                                result = ScaleFactorUpdateResult(in.get());
+                                return true;
+                            }
+                        }
+                    }
+                    // we were unable to rescale the input - results might be bad
+                    gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
+                }
+                break;
+            }
+            default: THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantisation: " << eltwiseLayer->_operation;
+        }
+        return true;
+    }
+};
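The Sum branch aligns the two inputs by folding the ratio of their output scale factors into the eltwise "weights". A quick numeric illustration (values assumed, not taken from the patch):

    // Input 0 arrives with scale 1024, input 1 (the bias-like, 32-bit input) with scale 2048.
    static void EltwiseSumScalesExample() {
        float dst0 = 1024.f, dst1 = 2048.f;
        float weights_scale = dst1 / dst0;   // 2.0: input 0 is effectively multiplied by 2 before the add
        float sum_scale = dst1;              // the sum is produced at input 1's scale
        (void)weights_scale; (void)sum_scale;
    }
    // Only when weights_scale would exceed roughly the int16 maximum does the rescaling
    // walk above go back and lower an upstream activation's output scale instead.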
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::WeightableLayer *> {
+ private:
+    float const _scale_reduction_50 = 0.50;
+    float const _scale_reduction_45 = 0.45;
+    float const _scale_reduction_40 = 0.40;
+    float const _scale_reduction_35 = 0.35;
+
+    uint16_t const _scale_change_req_threshold = 30;
+    uint16_t const _scale_change_threshold_100 = 100;
+    uint16_t const _scale_change_threshold_150 = 150;
+    uint16_t const _scale_change_threshold_200 = 200;
+
+ public:
+    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !wl ) {
+            THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
+        } else if (!wl->_weights) {
+            THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
+        }
+
+        auto prevLayer = CNNNetPrevLayer(wl);
+        auto quantDataForInputLayer =
+                InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
+
+        auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
+        // TODO: pass 8 bits somehow
+        if (quant->_weights_quant.scale == 1.0f) {
+            size_t scaleRange = 0;
+            if (weightsSize == 2) {
+                scaleRange = MAX_VAL_2B_WEIGHT;
+            } else if (weightsSize == 1) {
+                scaleRange = MAX_VAL_1B_WEIGHT;
+            } else {
+                THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
+            }
+            quant->_weights_quant.scale =
+                    ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
+
+            // TODO: find out why ???
+            if (weightsSize == 1) {
+                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
+            }
+        }
+
+        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;
+
+        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+
+        if (weightsSize == 1 &&
+            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
+                                    static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
+            gnawarn() << "Output scale for " << wl->name
+                      << " is too large and is being reduced, otherwise saturation is likely \n";
+            // reduce the weights scale according to an experimental heuristic
+            if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_100) {
+                quant->_weights_quant.scale *= _scale_reduction_50;
+                tmp_dst_quant_scale *= _scale_reduction_50;
+            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_150) {
+                quant->_weights_quant.scale *= _scale_reduction_45;
+                tmp_dst_quant_scale *= _scale_reduction_45;
+            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_200) {
+                quant->_weights_quant.scale *= _scale_reduction_40;
+                tmp_dst_quant_scale *= _scale_reduction_40;
+            } else {
+                quant->_weights_quant.scale *= _scale_reduction_35;
+                tmp_dst_quant_scale *= _scale_reduction_35;
+            }
+        }
+
+        quant->_dst_quant.scale = tmp_dst_quant_scale;
+
+        return true;
+    }
+};
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
+ public:
+    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, inputScaleFactor, result);
+    }
+};
+
+/**
+ * GNA convolutions cannot be quantized in int8; remove when the library starts supporting that
+ */
+template<>
+class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
+};
+
+
+}  // namespace details
+
+/**
+ * @brief the scale factor calculator computes only the output scale factors for a layer;
+ * if scale factor propagation is not possible, it indicates a restart condition
+ */
+class ScaleFactorCalculator {
+    using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
+    Cnt net;
+    mutable Cnt::const_iterator idx;
+    float inputScaleFactor;
+    mutable bool needRestart = false;
+    int weightsBytesSize;
+
+ public:
+    ScaleFactorCalculator(Cnt &net, int weightsBytesSize, float inputScaleFactor)
+            : net(net), inputScaleFactor(inputScaleFactor), weightsBytesSize(weightsBytesSize) {
+        idx = std::begin(this->net);
+    }
+    bool needToRestart() const {
+        return needRestart;
+    }
+    bool allLayersProcessed() const {
+        return idx == std::end(net);
+    }
+    std::vector<InferenceEngine::CNNLayerPtr> getStartLayers() const {
+        return std::vector<InferenceEngine::CNNLayerPtr>(idx, std::end(net));
+    }
+    template<class T>
+    bool operator()(T ptr) const {
+        needRestart = false;
+        details::ScaleFactorUpdateResult result;
+        if (!details::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputScaleFactor, result)) {
+            return false;
+        }
+        if (result) {
+            idx++;
+            return true;
+        }
+
+        idx = std::find_if(net.begin(), net.end(), [&](InferenceEngine::CNNLayerPtr cnnLayer) {
+            if (!result) {
+                return result.restartLayer == cnnLayer.get();
+            }
+            return ptr == cnnLayer.get();
+        });
+        idx++;
+        needRestart = true;
+        return true;
+    }
+};
+
+}  // namespace GNAPluginNS
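ScaleFactorCalculator only advances an iterator and reports restarts; it does not loop by itself. A minimal sketch of how a caller might drive it (assumed usage, simplified to the generic CNNLayer* path; the actual driver in this patch is the model quantizer, which dispatches on the concrete layer type):

    #include <vector>
    #include "scale_factor_calc.hpp"

    static void CalcScaleFactors(std::vector<InferenceEngine::CNNLayerPtr> &sortedNet,
                                 int weightsBytes, float inputScale) {
        GNAPluginNS::ScaleFactorCalculator calc(sortedNet, weightsBytes, inputScale);
        while (!calc.allLayersProcessed()) {
            for (auto &&layer : calc.getStartLayers()) {
                calc(layer.get());           // computes the layer's output scale, may request a restart
                if (calc.needToRestart()) {
                    break;                   // idx was repositioned just after the layer whose scale was rewritten
                }
            }
        }
    }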
diff --git a/inference-engine/src/gna_plugin/util.cpp b/inference-engine/src/gna_plugin/util.cpp
new file mode 100644
index 00000000000000..c10e3175f47456
--- /dev/null
+++ b/inference-engine/src/gna_plugin/util.cpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstring>
+#ifndef _WIN32
+#include <mm_malloc.h>
+#endif
+#include <cstdint>
+#include <malloc.h>
+#include "util.h"
+#include "gna_plugin_log.hpp"
+
+void *AllocateMemory(uint32_t num_memory_bytes, const char *ptr_name) {
+    void *ptr_memory = _mm_malloc(num_memory_bytes, 64);
+    if (ptr_memory == NULL) {
+        THROW_GNA_EXCEPTION << "Memory allocation failed for " << ptr_name;
+    }
+    memset(ptr_memory, 0, num_memory_bytes);
+
+    return (ptr_memory);
+}
+
+void FreeMemory(void *ptr_memory) {
+    if (ptr_memory != NULL) {
+        _mm_free(ptr_memory);
+    }
+    ptr_memory = NULL;
+}
+
+int32_t MemoryOffset(void *ptr_target, void *ptr_base) {
+    uint64_t target = (uint64_t) ptr_target;
+    uint64_t base = (uint64_t) ptr_base;
+    if (target == 0) {  // handle NULL pointers separately
+        return (-1);
+    } else if (target < base) {
+        THROW_GNA_EXCEPTION << "Error: target address value " << target << " is less than base address " << base << " in MemoryOffset()";
+    } else {
+        uint64_t diff = target - base;
+        if (diff > 0x7fffffff) {
+            THROW_GNA_EXCEPTION << "Error: target address value " << target << " too far from base address " << base << " in MemoryOffset()!";
+        }
+        return ((int32_t) diff);
+    }
+}
+
diff --git a/inference-engine/src/gna_plugin/util.h b/inference-engine/src/gna_plugin/util.h
new file mode 100644
index 00000000000000..0838bd2a690e0f
--- /dev/null
+++ b/inference-engine/src/gna_plugin/util.h
@@ -0,0 +1,9 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+void *AllocateMemory(uint32_t num_memory_bytes, const char *ptr_name);
+void FreeMemory(void *ptr_memory);
+int32_t MemoryOffset(void *ptr_target, void *ptr_base);
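A short usage sketch for the three helpers (illustrative; the region size and offset are made up): AllocateMemory returns a zeroed, 64-byte-aligned region, and MemoryOffset reports how far a sub-buffer sits from the region base, e.g. when filling in GNA descriptors.

    #include <cstdint>
    #include "util.h"

    static void UtilExample() {
        // 64-byte aligned, zero-initialized region
        void *base = AllocateMemory(1024, "example_region");
        // a sub-buffer placed 256 bytes into the region
        void *inputs = static_cast<uint8_t *>(base) + 256;
        int32_t off = MemoryOffset(inputs, base);   // returns 256
        (void)off;
        FreeMemory(base);
    }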