From 6664654699548ed382758045adbc541b0d98e923 Mon Sep 17 00:00:00 2001 From: Alexey Suhov Date: Mon, 21 Jan 2019 21:31:31 +0300 Subject: [PATCH] Publishing R5 content (#72) * Publishing R5 content * Updated ade revision * updated readme * add possibility to build CPU plugin with Intel MKL package --- .../src/gna_plugin/CMakeLists.txt | 86 +- inference-engine/src/gna_plugin/dnn.cpp | 2528 ++++++++++++++++ inference-engine/src/gna_plugin/dnn.h | 823 ++++++ .../src/gna_plugin/dnn_memory.cpp | 30 + .../src/gna_plugin/dnn_memory.hpp | 13 + .../src/gna_plugin/dnn_traits.hpp | 90 + inference-engine/src/gna_plugin/floatmath.cpp | 423 +++ inference-engine/src/gna_plugin/floatmath.h | 71 + .../src/gna_plugin/gna_allocator.hpp | 33 + .../src/gna_plugin/gna_api_wrapper.hpp | 79 +- .../src/gna_plugin/gna_device.cpp | 342 +-- .../src/gna_plugin/gna_device.hpp | 128 +- .../src/gna_plugin/gna_executable_network.hpp | 42 +- .../src/gna_plugin/gna_helper.cpp | 185 +- .../src/gna_plugin/gna_infer_request.hpp | 20 +- .../src/gna_plugin/gna_layer_info.hpp | 206 ++ .../src/gna_plugin/gna_mem_requests.hpp | 175 ++ .../src/gna_plugin/gna_memory.hpp | 227 ++ .../src/gna_plugin/gna_memory_state.hpp | 25 + .../src/gna_plugin/gna_model_serial.cpp | 396 +-- .../src/gna_plugin/gna_model_serial.hpp | 77 +- .../src/gna_plugin/gna_plugin.cpp | 2592 ++++++++++++----- .../src/gna_plugin/gna_plugin.hpp | 503 +++- .../src/gna_plugin/gna_plugin_config.hpp | 83 +- .../gna_plugin/gna_plugin_entry_points.cpp | 16 +- .../src/gna_plugin/gna_plugin_internal.hpp | 72 +- .../src/gna_plugin/gna_plugin_log.hpp | 30 +- .../src/gna_plugin/gna_plugin_passes.cpp | 338 +++ inference-engine/src/gna_plugin/lstm.cpp | 69 + inference-engine/src/gna_plugin/lstm.hpp | 209 ++ .../src/gna_plugin/polymorh_allocator.hpp | 68 + inference-engine/src/gna_plugin/pwl.h | 70 + .../src/gna_plugin/pwl_design.cpp | 681 +++++ .../quantization/layer_quantizer.hpp | 488 ++++ .../quantization/model_quantizer.hpp | 78 + .../gna_plugin/quantization/precision_ex.hpp | 95 + .../gna_plugin/quantization/quantization.cpp | 699 +++++ .../gna_plugin/quantization/quantization.h | 100 + .../quantization/quantized_layer_params.hpp | 24 + .../quantization/scale_factor_calc.hpp | 339 +++ inference-engine/src/gna_plugin/util.cpp | 46 + inference-engine/src/gna_plugin/util.h | 9 + 42 files changed, 10552 insertions(+), 2056 deletions(-) create mode 100644 inference-engine/src/gna_plugin/dnn.cpp create mode 100644 inference-engine/src/gna_plugin/dnn.h create mode 100644 inference-engine/src/gna_plugin/dnn_memory.cpp create mode 100644 inference-engine/src/gna_plugin/dnn_memory.hpp create mode 100644 inference-engine/src/gna_plugin/dnn_traits.hpp create mode 100644 inference-engine/src/gna_plugin/floatmath.cpp create mode 100644 inference-engine/src/gna_plugin/floatmath.h create mode 100644 inference-engine/src/gna_plugin/gna_allocator.hpp create mode 100644 inference-engine/src/gna_plugin/gna_layer_info.hpp create mode 100644 inference-engine/src/gna_plugin/gna_mem_requests.hpp create mode 100644 inference-engine/src/gna_plugin/gna_memory.hpp create mode 100644 inference-engine/src/gna_plugin/gna_memory_state.hpp create mode 100644 inference-engine/src/gna_plugin/gna_plugin_passes.cpp create mode 100644 inference-engine/src/gna_plugin/lstm.cpp create mode 100644 inference-engine/src/gna_plugin/lstm.hpp create mode 100644 inference-engine/src/gna_plugin/polymorh_allocator.hpp create mode 100644 inference-engine/src/gna_plugin/pwl.h create mode 100644 
inference-engine/src/gna_plugin/pwl_design.cpp create mode 100644 inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/model_quantizer.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/precision_ex.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/quantization.cpp create mode 100644 inference-engine/src/gna_plugin/quantization/quantization.h create mode 100644 inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp create mode 100644 inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp create mode 100644 inference-engine/src/gna_plugin/util.cpp create mode 100644 inference-engine/src/gna_plugin/util.h diff --git a/inference-engine/src/gna_plugin/CMakeLists.txt b/inference-engine/src/gna_plugin/CMakeLists.txt index aa7045813e8f4e..f6a25b61844784 100644 --- a/inference-engine/src/gna_plugin/CMakeLists.txt +++ b/inference-engine/src/gna_plugin/CMakeLists.txt @@ -1,66 +1,60 @@ -# Copyright (C) 2018-2020 Intel Corporation +# Copyright (C) 2018 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +# set(TARGET_NAME "GNAPlugin") -if(ENABLE_LTO) - ie_enable_lto() -endif() - file(GLOB_RECURSE SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp) + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ) file(GLOB_RECURSE HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.h - ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) + ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp + ) -addVersionDefines(gna_plugin_entry_points.cpp CI_BUILD_NUMBER) +add_definitions(-DIMPLEMENT_INFERENCE_ENGINE_PLUGIN) find_package(libGNA) +include_directories(${libGNA_INCLUDE_DIRS}) -ie_add_plugin(NAME ${TARGET_NAME} - DEVICE_NAME "GNA" - SOURCES ${SOURCES} ${HEADERS}) +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/src/inference_engine + ${CMAKE_CURRENT_SOURCE_DIR} + ${libGNA_INCLUDE_DIRS} +) -if(GNA_LIBRARY_VERSION STREQUAL "GNA2") - SET(GNA_LIBRARY_VERSION_NUMBER 2) -else() - SET(GNA_LIBRARY_VERSION_NUMBER 1) -endif() +add_definitions(-D_NO_MKL_) +add_library(${TARGET_NAME} SHARED ${SOURCES} ${HEADERS}) -#saving rpath to GNA shared library be used by CI -log_rpath_from_dir(GNA ${libGNA_LIBRARIES_BASE_PATH}) +if (LINUX) + find_package(Threads) +endif () -target_link_libraries(${TARGET_NAME} PRIVATE inference_engine inference_engine_lp_transformations ${INTEL_ITT_LIBS} Threads::Threads libGNA) -target_include_directories(${TARGET_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) -target_compile_definitions(${TARGET_NAME} - PRIVATE - _NO_MKL_ - PUBLIC - GNA_LIB_VER=${GNA_LIBRARY_VERSION_NUMBER}) +set_target_properties(${TARGET_NAME} PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}) +#saving rpath to GNA shared library be used by CI +log_rpath_remove_top(GNA FALSE "/gna${libGNA_LIBRARY}" TRUE) + +target_link_libraries(${TARGET_NAME} inference_engine ${INTEL_ITT_LIBS} ${libGNA_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}) -add_library(${TARGET_NAME}_test_static STATIC ${SOURCES} ${HEADERS}) -target_compile_definitions(${TARGET_NAME}_test_static - PRIVATE - _NO_MKL_ - IMPLEMENT_INFERENCE_ENGINE_PLUGIN - PUBLIC - GNA_LIB_VER=${GNA_LIBRARY_VERSION_NUMBER} - INTEGER_LOW_P - USE_STATIC_IE) -target_link_libraries(${TARGET_NAME}_test_static PUBLIC inference_engine_preproc_s inference_engine_lp_transformations libGNA::API) -target_include_directories(${TARGET_NAME}_test_static PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) -set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static) -if(WIN32) - # Correct 'jnl' macro/jit 
issue - target_compile_options(${TARGET_NAME} PRIVATE $<$:/bigobj> ) - target_compile_options(${TARGET_NAME}_test_static PRIVATE $<$:/bigobj> ) -endif() +set(TEST_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/gna_plugin.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gna_plugin_passes.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/quantization/quantization.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/dnn.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gna_device.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/pwl_design.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/floatmath.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/dnn_memory.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/util.cpp" + "${CMAKE_CURRENT_SOURCE_DIR}/gna_model_serial.cpp") -# install +add_library(${TARGET_NAME}_test_static STATIC ${TEST_SOURCES} ${HEADERS}) +target_compile_definitions(${TARGET_NAME}_test_static + PUBLIC -DINTEGER_LOW_P + -DUSE_STATIC_IE) -install(FILES "${GNA_KERNEL_LIBRARY}" - DESTINATION ${IE_CPACK_IE_DIR}/external/gna/lib - COMPONENT gna) +set_target_properties(${TARGET_NAME}_test_static PROPERTIES COMPILE_PDB_NAME ${TARGET_NAME}_test_static) diff --git a/inference-engine/src/gna_plugin/dnn.cpp b/inference-engine/src/gna_plugin/dnn.cpp new file mode 100644 index 00000000000000..8c94f720a1e5bb --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn.cpp @@ -0,0 +1,2528 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +extern bool global_debug; + +#include +#include +#include +#include +#include
+#include +#include + +#ifndef _NO_MKL_ +#include +#endif +#include "dnn.h" +#ifdef INTEGER_REF +#include "convnet.h" +#include "igemv16.h" +#include "igemv8.h" +#include "sgemm.h" +#else +#include "floatmath.h" +#endif +#include "pwl.h" +#include "util.h" +#include "gna_plugin_log.hpp" + +#ifdef WIN32 +# define rand_r(X) rand() +#endif + +/** + * whether to dump weights and biases + */ +#define DUMP_WB +/** + * in light mode only layer names are dumped + * @param filename + * @param number_type + * @return + */ +#define LIGHT_DUMP + +static int & getDumpFolderId() { + static int N = 0; + return N; +} + +static std::string getDumpFolderNameGNA() { + return std::string("./gna_layers/")+std::to_string(getDumpFolderId() - 1)+"/"; +} + +static std::string getDumpFolderName() { + return std::string("./layers/")+std::to_string(getDumpFolderId() - 1)+"/"; +} + +static std::string getRefFolderName() { + return std::string("./ref_layers/")+std::to_string(getDumpFolderId() - 1)+"/"; +} + +void AmIntelDnn::BeginNewWrite() { + getDumpFolderId()++; +} + + +void AmIntelDnn::Init(void *ptr_memory, + uint32_t num_memory_bytes, + intel_dnn_number_type_t number_type, + float scale_factor) { + ptr_dnn_memory_ = ptr_memory; + num_bytes_dnn_memory_ = num_memory_bytes; + number_type_ = number_type; + input_scale_factor_ = scale_factor; + + ptr_active_outputs_ = nullptr; + num_active_outputs_ = 0; + num_left_context = 0; + num_right_context = 0; + do_rotate_input = false; + softmax_type = kSoftmaxNone; + ptr_sumgroup_sizes = nullptr; + num_sumgroup_sizes = 0; + ptr_priors = nullptr; + + + // component.clear(); +} + +void AmIntelDnn::InitActiveList(uint32_t *ptr_active_list) { + ptr_active_outputs_ = ptr_active_list; + if (ptr_active_list == nullptr) { + if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) { + num_active_outputs_ = component[component.size() - 1].num_rows_out; + } else { + num_active_outputs_ = component[component.size() - 1].num_columns_out; + } + } else { + num_active_outputs_ = 0; + } +} + +void AmIntelDnn::AddComponents(uint32_t num_components_to_add) { + component.resize(component.size() + num_components_to_add); + for (uint32_t i = 0; i < num_components_to_add; i++) { + ClearComponent(component.size() - i - 1); + } +} + +void AmIntelDnn::ClearComponent(uint32_t component_index) { + if (component_index > component.size() - 1) { + fprintf(stderr, "Error: attempt to clear non-existent component!\n"); + throw -1; + } + component[component_index].num_rows_in = 0; + component[component_index].num_columns_in = 0; + component[component_index].num_rows_out = 0; + component[component_index].num_columns_out = 0; + component[component_index].num_bytes_per_input = 0; + component[component_index].num_bytes_per_output = 0; + component[component_index].operation = kDnnNullOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnUnknownOrientation; + component[component_index].orientation_out = kDnnUnknownOrientation; + component[component_index].ptr_inputs = nullptr; + component[component_index].ptr_outputs = nullptr; + memset(&component[component_index].op, 0, sizeof(component[component_index].op)); +} + +void AmIntelDnn::ClearState() { + // To support recurrent networks, provide mechanism to clear persistent state + // (e.g., between utterances for speech recognition). For recurrent component, + // this means clearing the feedback buffer. 
For other components, just clear the + // output buffer since any feedback will come from some component's output. + for (uint32_t i = 0; i < component.size(); i++) { + if (component[i].operation == kDnnRecurrentOp) { + memset(component[i].op.recurrent.ptr_feedbacks, + 0, + component[i].op.recurrent.num_vector_delay * component[i].num_columns_out + * component[i].num_bytes_per_input); + } else { + memset(component[i].ptr_outputs, + 0, + component[i].num_bytes_per_output * component[i].num_rows_out * component[i].num_columns_out); + } + } +} + +void AmIntelDnn::InitAffineComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + void *&ptr_weights, + void *&ptr_biases, + bool isDiag, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = isDiag ? kDnnDiagonalOp : kDnnAffineOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = kDnnInterleavedOrientation; + comp.orientation_out = kDnnInterleavedOrientation; + comp.op.affine.num_bytes_per_weight = num_bytes_per_weight; + comp.op.affine.num_bytes_per_bias = num_bytes_per_bias; + comp.op.affine.weight_scale_factor = weight_scale_factor; + comp.output_scale_factor = output_scale_factor; + if (!postInitMem) { + comp.op.affine.ptr_weights = ptr_weights; + comp.op.affine.ptr_biases = ptr_biases; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_weights = &comp.op.affine.ptr_weights; + ptr_biases = &comp.op.affine.ptr_biases; + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } +} + +void AmIntelDnn::InitDiagonalComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases) { + component[component_index].num_rows_in = num_rows_in; + component[component_index].num_columns_in = num_columns; + component[component_index].num_rows_out = num_rows_out; + component[component_index].num_columns_out = num_columns; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnDiagonalOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnInterleavedOrientation; + component[component_index].orientation_out = kDnnInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].op.affine.num_bytes_per_weight = num_bytes_per_weight; + component[component_index].op.affine.num_bytes_per_bias = num_bytes_per_bias; + component[component_index].op.affine.weight_scale_factor = weight_scale_factor; + component[component_index].output_scale_factor = output_scale_factor; + 
component[component_index].op.affine.ptr_weights = ptr_weights; + component[component_index].op.affine.ptr_biases = ptr_biases; +} + +void AmIntelDnn::InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + void *&ptr_filters, + void *&ptr_biases, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns_in; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns_out; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnConvolutional1dOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = kDnnNonInterleavedOrientation; + comp.orientation_out = kDnnNonInterleavedOrientation; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + comp.op.conv1D.num_bytes_per_weight = num_bytes_per_weight; + comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias; + comp.op.conv1D.num_filters = num_filters; + comp.op.conv1D.num_filter_rows = num_filter_rows; + comp.op.conv1D.num_filter_coefficients = num_filter_coefficients; + comp.op.conv1D.num_feature_maps = num_feature_maps; + comp.op.conv1D.num_feature_map_rows = num_feature_map_rows; + comp.op.conv1D.num_feature_map_columns = num_feature_map_columns; + comp.op.conv1D.weight_scale_factor = weight_scale_factor; + comp.output_scale_factor = output_scale_factor; + + if (!postInitMem) { + comp.op.conv1D.ptr_filters = ptr_filters; + comp.op.conv1D.ptr_biases = ptr_biases; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_filters = &comp.op.conv1D.ptr_filters; + ptr_biases = &comp.op.conv1D.ptr_biases; + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } +} + +void AmIntelDnn::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns_in; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns_out; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnMaxPoolOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = kDnnNonInterleavedOrientation; + comp.orientation_out = kDnnNonInterleavedOrientation; + comp.op.maxpool.num_inputs = num_pool_size; + comp.op.maxpool.num_inputs_step = num_pool_step; + comp.op.maxpool.num_inputs_stride = num_pool_stride; + comp.op.maxpool.do_sum_not_max = do_sum_not_max; + comp.output_scale_factor = output_scale_factor; + + if (!postInitMem) { + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } 
+} + +void AmIntelDnn::InitCopyComponentPrivate(intel_dnn_component_t &comp, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem) { + comp.num_rows_in = num_rows_in; + comp.num_columns_in = num_columns_in; + comp.num_rows_out = num_rows_out; + comp.num_columns_out = num_columns_out; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnCopyOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = orientation; + comp.orientation_out = orientation; + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + comp.output_scale_factor = output_scale_factor; + comp.op.copy.num_copy_rows = num_copy_rows; + comp.op.copy.num_copy_columns = num_copy_columns; + + if (!postInitMem) { + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + } else { + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + } +} + +void AmIntelDnn::InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &comp, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + intel_pwl_segment_t *ptr_segments, + bool postInitMem) { + comp.num_rows_in = num_rows; + comp.num_columns_in = num_columns; + comp.num_rows_out = num_rows; + comp.num_columns_out = num_columns; + comp.num_bytes_per_input = num_bytes_per_input; + comp.num_bytes_per_output = num_bytes_per_output; + comp.operation = kDnnPiecewiselinearOp; + comp.macro_operation = kDnnMacroOpNone; + comp.orientation_in = orientation; + comp.orientation_out = orientation; + comp.op.pwl.func_id = function_id; + comp.op.pwl.num_segments = num_segments; + comp.output_scale_factor = output_scale_factor; + + if (!postInitMem) { + comp.ptr_inputs = ptr_inputs; + comp.ptr_outputs = ptr_outputs; + comp.op.pwl.ptr_segments = ptr_segments; + } else { + ptr_inputs = &comp.ptr_inputs; + ptr_outputs = &comp.ptr_outputs; + if (ptr_segments != nullptr) { + *reinterpret_cast(ptr_segments) = + reinterpret_cast(& comp.op.pwl.ptr_segments); + } + } +} + +void AmIntelDnn::InitRecurrentComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns_in, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_vector_delay, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_feedbacks, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases) { + component[component_index].num_rows_in = num_rows; + component[component_index].num_columns_in = num_columns_in; + component[component_index].num_rows_out = num_rows; + component[component_index].num_columns_out = num_columns_out; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnRecurrentOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = 
kDnnNonInterleavedOrientation; + component[component_index].orientation_out = kDnnNonInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].op.recurrent.num_vector_delay = num_vector_delay; + component[component_index].op.recurrent.num_bytes_per_weight = num_bytes_per_weight; + component[component_index].op.recurrent.num_bytes_per_bias = num_bytes_per_bias; + component[component_index].op.recurrent.weight_scale_factor = weight_scale_factor; + component[component_index].output_scale_factor = output_scale_factor; + component[component_index].op.recurrent.ptr_feedbacks = ptr_feedbacks; + component[component_index].op.recurrent.ptr_weights = ptr_weights; + component[component_index].op.recurrent.ptr_biases = ptr_biases; +} + +void AmIntelDnn::InitInterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns, + uint32_t num_bytes_per_input, uint32_t num_bytes_per_output, + float output_scale_factor, void *ptr_inputs, void *ptr_outputs) { + component[component_index].num_rows_in = num_rows; + component[component_index].num_columns_in = num_columns; + component[component_index].num_rows_out = num_columns; + component[component_index].num_columns_out = num_rows; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnInterleaveOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnNonInterleavedOrientation; + component[component_index].orientation_out = kDnnInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].output_scale_factor = output_scale_factor; +} + +void AmIntelDnn::InitDeinterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns, + uint32_t num_bytes_per_input, uint32_t num_bytes_per_output, + float output_scale_factor, void *ptr_inputs, void *ptr_outputs) { + component[component_index].num_rows_in = num_rows; + component[component_index].num_columns_in = num_columns; + component[component_index].num_rows_out = num_columns; + component[component_index].num_columns_out = num_rows; + component[component_index].num_bytes_per_input = num_bytes_per_input; + component[component_index].num_bytes_per_output = num_bytes_per_output; + component[component_index].operation = kDnnDeinterleaveOp; + component[component_index].macro_operation = kDnnMacroOpNone; + component[component_index].orientation_in = kDnnInterleavedOrientation; + component[component_index].orientation_out = kDnnNonInterleavedOrientation; + component[component_index].ptr_inputs = ptr_inputs; + component[component_index].ptr_outputs = ptr_outputs; + component[component_index].output_scale_factor = output_scale_factor; +} + +__inline void ApplyAffineTransform(intel_dnn_component_t *component, uint32_t *list, uint32_t listsize) { + auto transform = &component->op.affine; + int m = component->num_rows_out; + int n = component->num_columns_in; + int k = component->num_rows_in; + int lda = component->num_rows_in; + int ldb = component->num_columns_in; + int ldc = component->num_columns_out; + + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + if (component->op.affine.num_bytes_per_weight == 1) { + int8_t *A = reinterpret_cast(transform->ptr_weights); + 
int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + intel_compound_bias_t *bias = reinterpret_cast(transform->ptr_biases); + if (list == nullptr) { + // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor); + igemm8_gna(m, n, k, A, lda, B, ldb, bias, C, ldc); + } else { + // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor); + igemm8_gna_subset(m, n, k, A, lda, B, ldb, bias, C, ldc, list, listsize); + } + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else if (component->op.affine.num_bytes_per_weight == 2) { + int16_t *A = reinterpret_cast(transform->ptr_weights); + int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + int32_t *bias = reinterpret_cast(transform->ptr_biases); + if (list == nullptr) { + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i*ldc+j] = bias[i]; + } + } + // PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + cblas_igemm16(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc); + } else { + for (int l = 0; l < listsize; l++) { + int i = list[l]; + for (uint32_t j = 0; j < n; j++) { + C[l*ldc+j] = bias[i]; + } + } + // PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.scale_factor); + // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.scale_factor); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->op.affine.scale_factor * component->op.affine.scale_factor); + cblas_igemm16_subset(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc, list, listsize); + } + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else { + fprintf(stderr, "Bad weight width in ApplyAffineTransform!\n"); + throw -1; + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(transform->ptr_weights); + auto B = reinterpret_cast(component->ptr_inputs); + auto C = reinterpret_cast(component->ptr_outputs); + auto bias = reinterpret_cast(transform->ptr_biases); + if (list == nullptr) { + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i * ldc + j] = bias[i]; + } + } + // if (global_debug) PrintMatrixFloat32("A float", A, m, k, lda); + // if (global_debug) PrintMatrixFloat32("B float", B, k, n, ldb); + // if (global_debug) PrintMatrixFloat32("C float before", C, m, n, ldc); + cblas_sgemm1(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc); + // if (global_debug) PrintMatrixFloat32("C float after", C, m, n, ldc); + } else { + for (int l = 0; l < listsize; l++) { + int i = list[l]; + for (uint32_t j = 0; j < n; j++) { + C[l * ldc + j] = bias[i]; + } + } + // PrintMatrixFloat32("A float", A, k, m, lda); + // PrintMatrixFloat32("trans(B) float", B, k, n, ldb); + // PrintMatrixFloat32("C float before", C, 
listsize, n, ldc); + cblas_sgemm_subset(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + 1.0, + A, + lda, + B, + ldb, + 1.0, + C, + ldc, + list, + listsize); + // PrintMatrixFloat32("C float after", C, listsize, n, ldc); + } + } + break; + default:fprintf(stderr, "Bad data width in ApplyAffineTransform!\n"); + throw -1; + } +} + +__inline void ApplyDiagonalTransform(intel_dnn_component_t *component) { + auto transform = &component->op.affine; + int m = component->num_rows_out; + int n = component->num_columns_in; + int ldb = component->num_columns_in; + int ldc = component->num_columns_out; + + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + if (component->op.affine.num_bytes_per_weight == 1) { + int8_t *A = reinterpret_cast(transform->ptr_weights); + int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + intel_compound_bias_t *bias = reinterpret_cast(transform->ptr_biases); + // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor); + isbmm8_gna(m, n, A, lda, B, ldb, bias, C, ldc); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else if (component->op.affine.num_bytes_per_weight == 2) { + int16_t *A = reinterpret_cast(transform->ptr_weights); + int16_t *B = reinterpret_cast(component->ptr_inputs); + int32_t *C = reinterpret_cast(component->ptr_outputs); + int32_t *bias = reinterpret_cast(transform->ptr_biases); + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i*ldc+j] = bias[i]; + } + } + // PrintMatrixInt16("A int16", A, 1, m, lda, component->op.affine.weight_scale_factor); + // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + cblas_isbmm16(m, n, A, lda, B, ldb, C, ldc); + // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor); + } else { + fprintf(stderr, "Bad weight width in ApplyDiagonalTransform!\n"); + throw -1; + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(transform->ptr_weights); + auto B = reinterpret_cast(component->ptr_inputs); + auto C = reinterpret_cast(component->ptr_outputs); + auto bias = reinterpret_cast(transform->ptr_biases); + for (uint32_t i = 0; i < m; i++) { + for (uint32_t j = 0; j < n; j++) { + C[i * ldc + j] = bias[i]; + } + } + // PrintMatrixFloat32("A float", A, 1, m, lda); + // PrintMatrixFloat32("B float", B, k, n, ldb); + // PrintMatrixFloat32("C float before", C, m, n, ldc); + for (uint32_t j = 0; j < n; j++) { + float *Bcol = B + j * ldb; + float *Ccol = C + j * ldc; + cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1); + } + // PrintMatrixFloat32("C float after", C, m, n, ldc); + } + break; + default:fprintf(stderr, "Bad data width in ApplyDiagonalTransform!\n"); + throw -1; + } +} + +__inline void ApplyRecurrentTransform(intel_dnn_component_t *component, uint32_t row, void *ptr_feedbacks) { + intel_recurrent_t *transform = &component->op.recurrent; + int k1 = component->num_columns_in; + int k2 = component->num_columns_out; + int n = k2; + + if (component->op.recurrent.ptr_feedbacks == nullptr) { + fprintf(stderr, "nullptr feedback pointer in ApplyRecurrentTransform()!\n"); + throw -1; + } 
+ + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + if (component->op.recurrent.num_bytes_per_weight == 1) { + int16_t *A1 = reinterpret_cast(component->ptr_inputs) + row * component->num_columns_in; + int16_t *A2 = reinterpret_cast(ptr_feedbacks); + int8_t *X = reinterpret_cast(transform->ptr_weights); + intel_compound_bias_t *B = reinterpret_cast(transform->ptr_biases); + int32_t *C = reinterpret_cast(component->ptr_outputs) + row * component->num_columns_out; + // PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt16("A2 int", A2, 1, k2, k2); + // PrintMatrixInt8("X int", X, k, n, n, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt32("B int", B, 1, 2*n, 2*n, component->output_scale_factor); + igemv8_gna_split(n, k1, k2, A1, A2, X, B, C); + // PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor); + } else if (component->op.recurrent.num_bytes_per_weight == 2) { + int16_t *A1 = reinterpret_cast(component->ptr_inputs) + row * component->num_columns_in; + int16_t *A2 = reinterpret_cast(ptr_feedbacks); + int16_t *X = reinterpret_cast(transform->ptr_weights); + int32_t *B = reinterpret_cast(transform->ptr_biases); + int32_t *C = reinterpret_cast(component->ptr_outputs) + row * component->num_columns_out; + // PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt16("A2 int", A2, 1, k2, k2, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt16("X int", X, k, n, n, component->op.recurrent.weight_scale_factor); + // PrintMatrixInt32("B int", B, 1, n, n, component->output_scale_factor); + igemv16_split(n, k1, k2, A1, A2, X, B, C); + // PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor); + } else { + fprintf(stderr, "Weight width not supported in ApplyRecurrentTransform!\n"); + throw -1; + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A1 = reinterpret_cast(component->ptr_inputs) + row * component->num_columns_in; + auto A2 = reinterpret_cast(ptr_feedbacks); + auto X = reinterpret_cast(transform->ptr_weights); + auto B = reinterpret_cast(transform->ptr_biases); + auto C = reinterpret_cast(component->ptr_outputs) + row * component->num_columns_out; + // PrintMatrixFloat32("A1 float", A1, 1, k1, k1); + // PrintMatrixFloat32("A2 float", A2, 1, k2, k2); + // PrintMatrixFloat32("X float", X, k, n, n); + // PrintMatrixFloat32("B float", B, 1, n, n); + sgemv_split(n, k1, k2, A1, A2, X, B, C); + // PrintMatrixFloat32("C float", C, 1, n, n); + } + break; + default:fprintf(stderr, "Bad data width in ApplyRecurrentTransform!\n"); + throw -1; + } +} + +__inline void ApplyConvolutional1DTransform(intel_dnn_component_t *component) { + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + CNNFilter16(component); + break; +#endif // #ifdef INTEGER_REF + case 4: + // PrintMatrixFloat32("Input float", reinterpret_cast(component->ptr_inputs), + // component->num_rows_in, component->num_columns_in, component->num_columns_in); + // PrintMatrixFloat32("Filt float", reinterpret_cast(component->op.conv1D.ptr_filters), + // component->op.conv1D.num_filters, + // component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps, + // component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps); + // PrintMatrixFloat32("Bias float", 
reinterpret_cast(component->op.conv1D.ptr_biases), 1, + // component->op.conv1D.num_filters, component->op.conv1D.num_filters); + CNNFilter32(component); + // PrintMatrixFloat32("Output float", reinterpret_cast(component->ptr_outputs, component->num_rows_out, + // component->num_columns_out, component->num_columns_out); + break; + default:fprintf(stderr, "Bad data width in ApplyConvolutionalTransform!\n"); + throw -1; + } +} + +__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component, + intel_dnn_number_type_t number_type, + uint32_t listsize) { + if (number_type == kDnnFloat) { + // PrintMatrixFloat32("PWL Input float", reinterpret_cast(component->ptr_inputs), component->num_rows_in, + // component->num_columns_in, component->num_columns_in); + PwlApply32(component, listsize); + // PrintMatrixFloat32("PWL Output float", reinterpret_cast(component->ptr_outputs), component->num_rows_out, + // component->num_columns_out, component->num_columns_out); +#ifdef INTEGER_REF + } else if (component->num_bytes_per_output == 2) { + PwlApply16(component, listsize); +#endif // #ifdef INTEGER_REF + } else { + fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n"); + throw -1; + } +} + +__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component, + intel_dnn_number_type_t number_type, + uint32_t listsize, + uint32_t num_row) { + if (number_type == kDnnFloat) { + PwlApply32(component, num_row, num_row, 0, listsize - 1); +#ifdef INTEGER_REF + } else if (component->num_bytes_per_output == 2) { + PwlApply16(component, num_row, num_row, 0, listsize-1); +#endif // #ifdef INTEGER_REF + } else { + fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n"); + throw -1; + } +} + +__inline void ApplyMaxPoolTransform(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) { + if (component->num_bytes_per_input == 4) { + // PrintMatrixFloat32("Input float", reinterpret_cast(component->ptr_inputs), component->num_rows_in, + // component->num_columns_in, component->num_columns_in); + CNNMaxPool(component, number_type); + // PrintMatrixFloat32("Output float", reinterpret_cast(component->ptr_outputs), component->num_rows_out, + // component->num_columns_out, component->num_columns_out); + } else { + fprintf(stderr, "Bad data width in ApplyMaxPoolTransform!\n"); + throw -1; + } +} + +__inline void ApplyTranspose(intel_dnn_component_t *component) { + int m = component->num_rows_in; + int n = component->num_columns_in; + int lda = component->num_columns_in; + int ldb = component->num_columns_out; + // B = Transpose(A) where A is mxn and B is nxm + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 1: + { + int8_t *A = reinterpret_cast(component->ptr_inputs); + int8_t *B = reinterpret_cast(component->ptr_outputs); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[col*ldb+row] = A[row*lda+col]; + } + } + } + break; + case 2: + { + int16_t *A = reinterpret_cast(component->ptr_inputs); + int16_t *B = reinterpret_cast(component->ptr_outputs); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[col*ldb+row] = A[row*lda+col]; + } + } + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(component->ptr_inputs); + auto B = reinterpret_cast(component->ptr_outputs); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[col * ldb + row] = A[row * lda + col]; + } + } + } + break; + 
default:fprintf(stderr, "Bad data width in ApplyInterleave!\n"); + throw -1; + } +} + +__inline void ApplyCopy(intel_dnn_component_t *component) { + auto src = reinterpret_cast(component->ptr_inputs); + auto dst = reinterpret_cast(component->ptr_outputs); + int32_t m = component->op.copy.num_copy_rows; + int32_t n = component->op.copy.num_copy_columns; + int32_t lda = component->num_columns_in; + int32_t ldb = component->num_columns_out; + if (m > component->num_rows_in) { + fprintf(stderr, "Error: attempt to copy more columns than matrix has!\n"); + throw -1; + } else { + switch (component->num_bytes_per_input) { +#ifdef INTEGER_REF + case 2: + { + int16_t *A = reinterpret_cast(src); + int16_t *B = reinterpret_cast(dst); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[row*ldb + col] = A[row*lda + col]; + } + } + } + break; +#endif // #ifdef INTEGER_REF + case 4: { + auto A = reinterpret_cast(src); + auto B = reinterpret_cast(dst); + for (uint32_t row = 0; row < m; row++) { + for (uint32_t col = 0; col < n; col++) { + B[row * ldb + col] = A[row * lda + col]; + } + } + } + break; + default:fprintf(stderr, "Bad data width in ApplyCopy!\n"); + throw -1; + } + } +} + +uint32_t AmIntelDnn::CopyActiveList(std::vector > &active_list, uint32_t list_index) { + if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) { + num_active_outputs_ = component[component.size() - 1].num_rows_out; + } else { + num_active_outputs_ = component[component.size() - 1].num_columns_out; + } + + if (!active_list.empty()) { + if (list_index >= active_list.size()) { + fprintf(stderr, "Index %d beyond end of active list in CopyActiveList()\n", list_index); + throw -1; + } + if (active_list[list_index].size() > component[component.size() - 1].num_rows_out) { + fprintf(stderr, "Active list too large in CopyActiveList()\n"); + throw -1; + } + + if (ptr_active_outputs_ != nullptr) { + num_active_outputs_ = active_list[list_index].size(); + memcpy(ptr_active_outputs_, active_list[list_index].data(), num_active_outputs_ * sizeof(uint32_t)); + } + } + + return (num_active_outputs_); +} + +void AmIntelDnn::Propagate() { + for (uint32_t i = 0; i < component.size(); i++) { + intel_dnn_component_t *comp = &component[i]; + uint32_t *ptr_active_outputs = nullptr; + uint32_t num_active_outputs = (comp->orientation_out == kDnnInterleavedOrientation) + ? 
comp->num_rows_out : comp->num_columns_out; + + if (i == component.size() - 1) { // active list applies to last component + ptr_active_outputs = ptr_active_outputs_; + num_active_outputs = num_active_outputs_; + } else if (i == component.size() - 2) { // also applies to last two components when last is PWL + if ((component[i].operation == kDnnAffineOp) && (component[i + 1].operation == kDnnPiecewiselinearOp)) { + ptr_active_outputs = ptr_active_outputs_; + num_active_outputs = num_active_outputs_; + } + } + + switch (comp->operation) { + case kDnnAffineOp :ApplyAffineTransform(comp, ptr_active_outputs, num_active_outputs); + break; + case kDnnDiagonalOp:ApplyDiagonalTransform(comp); + break; + case kDnnRecurrentOp: + if ((i < component.size() - 1) && (component[i + 1].operation == kDnnPiecewiselinearOp)) { + intel_dnn_component_t *comp_pwl = &component[i + 1]; + for (uint32_t j = 0; j < comp->num_rows_in; j++) { + void *ptr_feedbacks = + reinterpret_cast(reinterpret_cast(comp->op.recurrent.ptr_feedbacks) + j * comp_pwl->num_columns_out); + ApplyRecurrentTransform(comp, j, ptr_feedbacks); + // PrintOutputs(i); + ApplyPiecewiseLinearTransform(comp_pwl, number_type_, num_active_outputs, j); + } + i++; // skip next component + } else { + fprintf(stderr, "Missing PiecewiseLinear component after Recurrent component in Propagate!\n"); + throw -1; + } + break; + case kDnnConvolutional1dOp:ApplyConvolutional1DTransform(comp); + break; + case kDnnPiecewiselinearOp:ApplyPiecewiseLinearTransform(comp, number_type_, num_active_outputs); + break; + case kDnnMaxPoolOp:ApplyMaxPoolTransform(comp, number_type_); + break; + case kDnnInterleaveOp:ApplyTranspose(comp); + break; + case kDnnDeinterleaveOp:ApplyTranspose(comp); + break; + case kDnnCopyOp:ApplyCopy(comp); + break; + default:fprintf(stderr, "Bad operation in Propagate!\n"); + throw -1; + break; + } + // PrintOutputs(i); fflush(stdout); + } +} + +intel_dnn_macro_operation_t AmIntelDnn::MacroOperation(uint32_t component_index) { + return (component[component_index].macro_operation); +} + +void AmIntelDnn::SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation) { + component[component_index].macro_operation = macro_operation; +} + +float AmIntelDnn::InputScaleFactor(uint32_t component_index) { + float scale_factor = 1.0; + + if (component_index == 0) { + scale_factor = input_scale_factor_; + } else { + if (component[component_index - 1].operation == kDnnAffineOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnDiagonalOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnConvolutional1dOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnRecurrentOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnInterleaveOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnDeinterleaveOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } else if (component[component_index - 1].operation == kDnnCopyOp) { + scale_factor = component[component_index - 1].output_scale_factor; + } + } + + return (scale_factor); +} + +float AmIntelDnn::WeightScaleFactor(uint32_t component_index) { + float scale_factor = 
1.0; + + if (component[component_index].operation == kDnnAffineOp) { + scale_factor = component[component_index].op.affine.weight_scale_factor; + } else if (component[component_index].operation == kDnnDiagonalOp) { + scale_factor = component[component_index].op.affine.weight_scale_factor; + } else if (component[component_index].operation == kDnnConvolutional1dOp) { + scale_factor = component[component_index].op.conv1D.weight_scale_factor; + } else if (component[component_index].operation == kDnnRecurrentOp) { + scale_factor = component[component_index].op.recurrent.weight_scale_factor; + } + + return (scale_factor); +} + +float AmIntelDnn::OutputScaleFactor(intel_dnn_component_t &comp) { + return comp.output_scale_factor; +} + +void AmIntelDnn::SetOutputScaleFactor(uint32_t component_index, float scale_factor) { + component[component_index].output_scale_factor = scale_factor; +} + +void AmIntelDnn::PrintOutputs(uint32_t component_index) { + float scale_factor = OutputScaleFactor(component_index); + uint32_t num_rows = component[component_index].num_rows_out; + uint32_t num_columns = component[component_index].num_columns_out; + + printf("component %d : %s\n", component_index, intel_dnn_operation_name[component[component_index].operation]); + if (number_type_ == kDnnFloat) { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, ptr_output[i * num_columns + j] / scale_factor); + } + } + } else { + switch (component[component_index].num_bytes_per_output) { + case 1: { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, static_cast(ptr_output[i * num_columns + j]) / scale_factor); + } + } + } + break; + case 2: { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, static_cast(ptr_output[i * num_columns + j]) / scale_factor); + } + } + } + break; + case 4: { + auto ptr_output = reinterpret_cast(component[component_index].ptr_outputs); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + printf("%d %d : %e\n", i, j, static_cast(ptr_output[i * num_columns + j]) / scale_factor); + } + } + } + break; + default: + fprintf(stderr, + "Bad num_bytes_per_output in component %d in AmIntelDnn::PrintOutputs()\n", + component_index); + throw -1; + } + } +} + +uint32_t AmIntelDnn::CompareScores(void *ptr_refscorearray, intel_score_error_t *score_error, uint32_t num_frames) { + intel_dnn_component_t *ptr_component = &component[component.size() - 1]; + intel_dnn_orientation_t orientation = ptr_component->orientation_out; + float scale_factor = OutputScaleFactor(component.size() - 1); + uint32_t num_errors = 0; + uint32_t num_rows = (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : num_frames; + uint32_t num_columns = (orientation == kDnnInterleavedOrientation) ? num_frames : ptr_component->num_columns_out; + uint32_t num_row_step_ref = + (orientation == kDnnInterleavedOrientation) ? 
ptr_component->num_rows_out : ptr_component->num_columns_out; + uint32_t num_row_step = ptr_component->num_columns_out; + + if (ptr_component->operation == kDnnAffineOp) { + num_rows = num_active_outputs_; + } + + ClearScoreError(score_error); + + if (number_type_ == kDnnFloat) { + auto A = reinterpret_cast(ptr_component->ptr_outputs); + auto B = reinterpret_cast(ptr_refscorearray); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + float score = A[i * num_row_step + j]; + float refscore = + (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref + + j]; + float scaled_score = score / scale_factor; + float error = fabs(refscore - scaled_score); + float rel_error = error / (fabs(refscore) + 1e-20); + float squared_error = error * error; + float squared_rel_error = rel_error * rel_error; + score_error->num_scores++; + score_error->sum_error += error; + score_error->sum_squared_error += squared_error; + if (error > score_error->max_error) { + score_error->max_error = error; + } + score_error->sum_rel_error += rel_error; + score_error->sum_squared_rel_error += squared_rel_error; + if (rel_error > score_error->max_rel_error) { + score_error->max_rel_error = rel_error; + } + if (error > score_error->threshold) { + num_errors++; + } + } + } + } else if (number_type_ == kDnnInt) { + auto B = reinterpret_cast(ptr_refscorearray); + for (int i = 0; i < num_rows; i++) { + for (int j = 0; j < num_columns; j++) { + float score; + if (ptr_component->num_bytes_per_output == 4) { + auto A = reinterpret_cast(ptr_component->ptr_outputs); + score = static_cast(A[i * num_row_step + j]); + } else if (ptr_component->num_bytes_per_output == 2) { + auto A = reinterpret_cast(ptr_component->ptr_outputs); + score = static_cast(A[i * num_row_step + j]); + } else { + fprintf(stderr, + "Unsupported output width (%d) in AmIntelDnn::CompareScores()!\n", + ptr_component->num_bytes_per_output); + throw -1; + } + float refscore = + (orientation == kDnnInterleavedOrientation) ? 
B[j * num_row_step_ref + i] : B[i * num_row_step_ref + + j]; + float scaled_score = score / scale_factor; + float error = fabs(refscore - scaled_score); + float rel_error = error / (fabs(refscore) + 1e-20); + float squared_error = error * error; + float squared_rel_error = rel_error * rel_error; + score_error->num_scores++; + score_error->sum_error += error; + score_error->sum_squared_error += squared_error; + if (error > score_error->max_error) { + score_error->max_error = error; + } + score_error->sum_rel_error += rel_error; + score_error->sum_squared_rel_error += squared_rel_error; + if (rel_error > score_error->max_rel_error) { + score_error->max_rel_error = rel_error; + } + if (error > score_error->threshold) { + num_errors++; + } + } + } + } else { + fprintf(stderr, "Unknown number type in AmIntelDnn::CompareScores()!\n"); + throw -1; + } + + score_error->num_errors = num_errors; + + return (num_errors); +} + +void AmIntelDnn::WriteGraphWizModel(const char *filename) { + auto & components = component; + +#define IS_AFFINE(k)\ + (components[k].operation == kDnnAffineOp ||\ + components[k].operation == kDnnDiagonalOp) + +#define IS_CONV(k)\ + (components[k].operation == kDnnConvolutional1dOp) + +#define IS_RELU(k)\ + (components[k].operation == kDnnPiecewiselinearOp &&\ + components[k].op.pwl.func_id == kActRelu) + + +#define IS_DIAG(k)\ + (components[k].operation == kDnnDiagonalOp) + +#define OUTPUTS(idx)\ + components[idx].ptr_outputs, components[idx].num_rows_out*components[idx].num_columns_out * components[idx].num_bytes_per_output + +#define INPUTS(idx)\ + components[idx].ptr_inputs, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].num_bytes_per_input + +#define BIASES(idx)\ + components[idx].op.affine.ptr_biases, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].op.affine.num_bytes_per_bias + +#define WEIGHTS(idx)\ + components[idx].op.affine.ptr_weights, components[idx].op.affine.num_bytes_per_weight * components[idx].num_rows_in*components[idx].num_columns_in * \ + (IS_DIAG(idx) ? 
1 : components[idx].num_rows_out*components[idx].num_columns_out) + + auto intersected = [](void * ptra, size_t asize, void * ptrb, size_t bsize) { + return !(((reinterpret_cast(ptra) + asize) <= ptrb) || ((reinterpret_cast(ptrb) + bsize) <= ptra)); + }; + + auto equals = [](void * ptra, size_t asize, void * ptrb, size_t bsize) { + // return !((((char*)ptra + asize) < ptrb) || (((char*)ptrb + bsize) < ptra)); + return ptra >= ptrb && ptra < reinterpret_cast(ptrb) + bsize; + }; + + std::fstream graph("graph.dot", std::ios::out); + graph << "strict digraph {"; + std::set weights; + std::set biases; + std::set outputs; + std::set layersNames; + + auto generate_layer_name = [&](int k) { + std::string l; + if (components[k].operation == kDnnPiecewiselinearOp) { + l += intel_dnn_activation_name[components[k].op.pwl.func_id]; + } else { + l += intel_dnn_operation_name[components[k].operation]; + } + l += "_" + std::to_string(k); + if (components[k].operation == kDnnPiecewiselinearOp) { + graph << l << " [shape=box, style=filled, fillcolor=yellow"; + } else { + graph << l << " [shape=box"; + } + + graph << ", label=<\n" + " \n" + " \n"; + if (IS_AFFINE(k)) { + graph << " \n"; + graph << " \n"; + graph << " \n"; + } + if (IS_RELU(k)) { + graph << " \n"; + } + if (IS_CONV(k)) { + auto &conv = components[k].op.conv1D; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + graph << " \n"; + } + graph<< " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + " \n" + "
" << l << "
" << components[k].num_rows_in << "x" << components[k].num_rows_out<< "
wscale" << components[k].op.affine.weight_scale_factor<< "
wbit" << components[k].op.affine.num_bytes_per_weight<< "
bbit" << components[k].op.affine.num_bytes_per_bias<< "
negative_slope" << components[k].op.pwl.func_id.negative_slope<< "
num_filters" << conv.num_filters<< "
num_filter_rows" << conv.num_filter_rows<< "
num_filter_coefficients" << conv.num_filter_coefficients<< "
num_feature_maps" << conv.num_feature_maps<< "
num_feature_map_rows" << conv.num_feature_map_rows<< "
num_feature_map_columns" << conv.num_feature_map_columns<< "
wscale" << conv.weight_scale_factor<< "
wbit" << conv.num_bytes_per_weight<< "
bbit" << conv.num_bytes_per_bias<< "
num_rows_in" << components[k].num_rows_in<< "
num_columns_in" << components[k].num_columns_in<< "
num_rows_out" << components[k].num_rows_out<< "
num_columns_out" << components[k].num_columns_out<< "
oscale" << components[k].output_scale_factor<< "
ibit" << components[k].num_bytes_per_input<< "
obit" << components[k].num_bytes_per_output<< "
>];\n"; + + return l; + }; + + + for (int k = 0; k < components.size(); ++k) { + std::string l = generate_layer_name(k); + layersNames.insert(l); + int lidx = std::distance(layersNames.begin(), layersNames.find(l)); + int widx = 0; + int bidx = 0; + + if (IS_AFFINE(k)) { + weights.insert(components[k].op.affine.ptr_weights); + biases.insert(components[k].op.affine.ptr_biases); + + widx = std::distance(weights.begin(), weights.find(components[k].op.affine.ptr_weights)); + bidx = std::distance(biases.begin(), biases.find(components[k].op.affine.ptr_biases)); + } + + + auto lw = "weights_" + std::to_string(lidx) + "_" + std::to_string(widx);; + auto lb = "biases_" + std::to_string(lidx) + "_" + std::to_string(bidx); + + if (IS_AFFINE(k)) { + graph << lw << " -> " << l << "[style=bold];"; + graph << lb << " -> " << l << "[style=bold];"; + } + + graph << "\n"; + + bool inputConnected = false; + + for (int k2 = 0; k2 < components.size(); ++k2) { + if (k2 == k) continue; + + + std::string r = generate_layer_name(k2); + + int w2idx = 0; + int b2idx = 0; + + if (IS_AFFINE(k2)) { + weights.insert(components[k2].op.affine.ptr_weights); + biases.insert(components[k2].op.affine.ptr_biases); + + w2idx = std::distance(weights.begin(), weights.find(components[k2].op.affine.ptr_weights)); + b2idx = std::distance(biases.begin(), biases.find(components[k2].op.affine.ptr_biases)); + } + + auto rw = "weights_" + std::to_string(w2idx); + auto rb = "biases_" + std::to_string(b2idx); + + // ---------------------------------------------------------- + // output to input connections + if (intersected(OUTPUTS(k2), INPUTS(k))) { + graph << r <<" -> "<< l << ";"; + inputConnected = true; + } + + // ---------------------------------------------------------- + // output to biases connections + if (IS_AFFINE(k) && intersected(OUTPUTS(k2), BIASES(k))) { + graph << r << " -> " << lb << " [label=\"OB\", fontcolor=blue, color=blue, style=dashed];"; + } + + // ---------------------------------------------------------- + // output to weights connections + if (IS_AFFINE(k) && equals(OUTPUTS(k2), WEIGHTS(k))) { + graph << r << " -> " << lw << " [label=\"OW\", fontcolor=magenta, color=magenta, style=dashed];"; + } + + // ---------------------------------------------------------- + // weights to input connections + if (IS_AFFINE(k2) && equals(WEIGHTS(k2), INPUTS(k))) { + graph << rw << " -> " << l << " [label=\"WI\", fontcolor=red, color=red, style=dashed];"; + inputConnected = true; + } + + // ---------------------------------------------------------- + // weights to bias connections + if (IS_AFFINE(k2) && IS_AFFINE(k) && equals(WEIGHTS(k2), BIASES(k))) { + graph << rw << " -> " << lb << " [label=\"WB\", fontcolor=darkgreen,color=darkgreen, style=dashed];"; + } + } + if (!inputConnected) { + // drawing tmp connection + outputs.insert(components[k].ptr_inputs); + auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs)); + graph << tidx << " -> " << l + << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];"; + } + } + + for (int k = 0; k < components.size(); ++k) { + std::string l = generate_layer_name(k); + + int tidx = 0; + for (auto tmpOutPtrs : outputs) { + if (components[k].ptr_outputs == tmpOutPtrs) { + graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];"; + } + tidx++; + } + } + + graph << "}"; +} + +void AmIntelDnn::WriteDnnText(const char *filename, intel_dnn_number_type_t number_type) { + if ((number_type_ == kDnnFloat) && 
(number_type == kDnnInt)) { + fprintf(stderr, "Error trying to write floating point DNN as integer in AmIntelDnn::WriteDnnText().\n"); + fprintf(stderr, " Please convert to integer first.\n"); + throw -1; + } +#ifndef LIGHT_DUMP + std::ofstream out_file1(filename, std::ios::out); + std::ofstream &out_file = out_file1; +#else + std::ofstream out_file((std::string(filename) + ".light").c_str(), std::ios::out); +#endif + if (out_file.good()) { + uint32_t num_inputs = component[0].num_rows_in; + uint32_t num_outputs = + (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) ? component[component.size() + - 1].num_rows_out : component[component.size() - 1].num_columns_out; + uint32_t num_layers = num_gna_layers(); + uint32_t num_group = this->num_group_in(); + uint32_t layer = 0; + + out_file << "\n"; + out_file << " " << intel_dnn_number_type_name[number_type] << "\n"; + out_file << " " << intel_dnn_softmax_name[softmax_type] << "\n"; + out_file << " " << std::dec << num_bytes_dnn_memory_ << "\n"; + out_file << " " << std::dec << num_group << "\n"; + out_file << " " << std::dec << num_inputs << "\n"; + out_file << " " << std::dec << num_outputs << "\n"; + out_file << " " << std::dec << num_layers << "\n"; + for (uint32_t i = 0; i < component.size(); i++) { +#ifdef LIGHT_DUMP + std::stringstream out_file_name; + out_file_name << getDumpFolderName() << std::setfill('0') << std::setw(2) << i << "_" + << intel_dnn_operation_name[component[i].operation] + << "-" << component[i].num_rows_in + << "-" << component[i].num_rows_out; + if (component[i].operation == kDnnPiecewiselinearOp) { + out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id.type]; + } + std::ofstream out_file((out_file_name.str() + ".txt").c_str(), std::ios::out); +#endif + + uint32_t num_rows_in = component[i].num_rows_in; + uint32_t num_columns_in = component[i].num_columns_in; + uint32_t num_rows_out = component[i].num_rows_out; + uint32_t num_columns_out = component[i].num_columns_out; + uint32_t num_bytes_per_input = component[i].num_bytes_per_input; + uint32_t num_bytes_per_output = component[i].num_bytes_per_output; + if ((component[i].operation == kDnnAffineOp) + || (component[i].operation == kDnnDiagonalOp) + || (component[i].operation == kDnnRecurrentOp) + || (component[i].operation == kDnnConvolutional1dOp) + || (component[i].operation == kDnnInterleaveOp) + || (component[i].operation == kDnnDeinterleaveOp) + || (component[i].operation == kDnnCopyOp)) { + out_file << " " << std::dec << layer << "\n"; + layer++; + } + out_file << " " << intel_dnn_operation_name[component[i].operation] << "\n"; + out_file << " " << intel_dnn_macro_operation_name[component[i].macro_operation] << "\n"; + out_file << " " << std::dec << num_rows_in << "\n"; + out_file << " " << std::dec << num_columns_in << "\n"; + out_file << " " << std::dec << num_rows_out << "\n"; + out_file << " " << std::dec << num_columns_out << "\n"; + out_file << " " << std::dec << (component[i].orientation_in == kDnnInterleavedOrientation ? + "interleaved" : "deinterleaved") << "\n"; + out_file << " " << std::dec << (component[i].orientation_out == kDnnInterleavedOrientation ? 
+ "interleaved" : "deinterleaved") << "\n"; + + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << sizeof(float) << "\n"; + out_file << " " << std::dec << sizeof(float) << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_input << "\n"; + out_file << " " << std::dec << num_bytes_per_output << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].ptr_inputs, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].ptr_outputs, ptr_dnn_memory_) << "\n"; + switch (component[i].operation) { + case kDnnAffineOp: + case kDnnDiagonalOp: { + uint32_t num_bytes_per_weight = component[i].op.affine.num_bytes_per_weight; + uint32_t num_bytes_per_bias = component[i].op.affine.num_bytes_per_bias; + float weight_scale_factor = component[i].op.affine.weight_scale_factor; + float output_scale_factor = component[i].output_scale_factor; + uint32_t num_weight_rows = (component[i].operation == kDnnDiagonalOp) ? 1 : num_rows_out; + uint32_t num_weight_columns = num_rows_in; + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << 4 << "\n"; + out_file << " " << std::dec << 4 << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_weight << "\n"; + out_file << " " << std::dec << num_bytes_per_bias << "\n"; + } + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << weight_scale_factor << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.affine.ptr_weights, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.affine.ptr_biases, ptr_dnn_memory_) << "\n"; + + std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out); + std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out); + + if (num_bytes_per_weight == 1) { + int8_t *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); + intel_compound_bias_t *ptr_bias = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier + / weight_scale_factor; + out_wfile << std::setprecision(4) << val << " "; + } else { + out_wfile << int((int8_t) ptr_weight[row * num_weight_columns + col]) << " "; + } + out_wfile << "\n"; + } + } +#endif + } else if (num_bytes_per_weight == 2) { + int16_t *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + out_wfile << std::setprecision(12) + << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " "; + } else { + out_wfile << ptr_weight[row * num_weight_columns + col] << " "; + } + out_wfile << "\n"; 
+ } + } +#endif + } else if (number_type_ == kDnnFloat) { + float *ptr_weight = reinterpret_cast(component[i].op.affine.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + for (uint32_t col = 0; col < num_weight_columns; col++) { + out_wfile << std::setprecision(5) + << ptr_weight[row * num_weight_columns + col] << " "; + out_wfile << "\n"; + } + } +#endif + } else { + fprintf(stderr, "Unsupported weight type in WriteDnnText!\n"); + throw -1; + } + if (number_type_ == kDnnInt) { + if (num_bytes_per_weight == 1) { + intel_compound_bias_t + *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_rows_out; row++) { + out_bfile << std::setw(8) << ptr_biases[row].bias << ", "; + out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n"; + } +#endif + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_rows_out; row++) { + if (number_type == kDnnInt) { + out_bfile << std::setw(8) << ptr_biases[row] << "\n"; + } else { + out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n"; + } + } +#endif + } + + } else { + float *ptr_biases = reinterpret_cast(component[i].op.affine.ptr_biases); +#ifdef DUMP_WB + + for (uint32_t row = 0; row < num_rows_out; row++) { + out_bfile << std::setprecision(5) << ptr_biases[row] << "\n"; + } +#endif + } + } + break; + case kDnnConvolutional1dOp: { + uint32_t num_filters = component[i].op.conv1D.num_filters; + uint32_t num_filter_rows = component[i].op.conv1D.num_filter_rows; + uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients; + uint32_t num_feature_maps = component[i].op.conv1D.num_feature_maps; + uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows; + uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns; + uint32_t num_filter_outputs = + component[i].op.conv1D.num_feature_map_rows - component[i].op.conv1D.num_filter_rows + 1; + uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight; + uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias; + float weight_scale_factor = component[i].op.conv1D.weight_scale_factor; + float output_scale_factor = component[i].output_scale_factor; + out_file << " " << std::dec << num_filters << "\n"; + out_file << " " << std::dec << num_filter_coefficients << "\n"; + out_file << " " << std::dec << num_filter_rows << "\n"; + out_file << " " << std::dec << num_feature_maps << "\n"; + out_file << " " << std::dec << num_feature_map_rows << "\n"; + out_file << " " << std::dec << num_feature_map_columns << "\n"; + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << 4 << "\n"; + out_file << " " << std::dec << 4 << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_weight << "\n"; + out_file << " " << std::dec << num_bytes_per_bias << "\n"; + } + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << weight_scale_factor << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << 
MemoryOffset(component[i].op.conv1D.ptr_filters, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.conv1D.ptr_biases, ptr_dnn_memory_) << "\n"; + + + std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out); + std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out); + + + if (num_bytes_per_weight == 1) { + int8_t *ptr_weight = reinterpret_cast(component[i].op.conv1D.ptr_filters); + intel_compound_bias_t *ptr_bias = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + for (uint32_t col = 0; col < num_filter_coefficients; col++) { + if (number_type == kDnnFloat) { + float val = static_cast(ptr_weight[row * num_filter_coefficients + col]) + * ptr_bias[row].multiplier / weight_scale_factor; + out_wfile << std::setprecision(12) <(component[i].op.conv1D.ptr_filters); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + for (uint32_t col = 0; col < num_filter_coefficients; col++) { + if (number_type == kDnnFloat) { + out_wfile << std::setprecision(12) + << ptr_weight[row * num_filter_coefficients + col] / weight_scale_factor + << "\n"; + } else { + out_wfile << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_weight[row * num_filter_coefficients + col] << "\n"; + } + } + } +#endif + } else if (number_type_ == kDnnFloat) { + float *ptr_weight = reinterpret_cast(component[i].op.conv1D.ptr_filters); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + for (uint32_t col = 0; col < num_filter_coefficients; col++) { + out_wfile << std::setprecision(12) + << ptr_weight[row * num_filter_coefficients + col] << "\n"; + } + out_wfile << "\n"; + } +#endif + } else { + fprintf(stderr, "Unsupported filter weight type in WriteDnnText!\n"); + throw -1; + } + + if (number_type_ == kDnnInt) { + if (number_type == kDnnInt) { + if (num_bytes_per_weight == 1) { + intel_compound_bias_t + *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex + << ptr_biases[row].bias << " "; + out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex + << int(ptr_biases[row].multiplier) << "\n"; + } +#endif + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[row] + << "\n"; + } +#endif + } + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << std::setprecision(12) + << ptr_biases[row] / output_scale_factor << "\n"; + } +#endif + } + } else { + float *ptr_biases = reinterpret_cast(component[i].op.conv1D.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_filters; row++) { + out_bfile << std::setprecision(12) << ptr_biases[row] << "\n"; + } +#endif + } + out_file << "\n"; + } + break; + case kDnnRecurrentOp: { + float weight_scale_factor = component[i].op.recurrent.weight_scale_factor; + float output_scale_factor = component[i].output_scale_factor; + uint32_t num_vector_delay = component[i].op.recurrent.num_vector_delay; + uint32_t num_bytes_per_weight = component[i].op.recurrent.num_bytes_per_weight; + 
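
// [editorial sketch] The convolutional case above derives the number of filter
// outputs as num_feature_map_rows - num_filter_rows + 1, i.e. the output length
// of a "valid" 1D convolution with unit stride. Worked numbers below are assumed,
// for illustration only.
#include <cstdint>
#include <cassert>

int main() {
    uint32_t num_feature_map_rows = 48;  // assumed input rows
    uint32_t num_filter_rows = 9;        // assumed filter height
    uint32_t num_filter_outputs = num_feature_map_rows - num_filter_rows + 1;
    assert(num_filter_outputs == 40);    // 48 - 9 + 1
    return 0;
}
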
uint32_t num_bytes_per_bias = component[i].op.recurrent.num_bytes_per_bias; + uint32_t num_weight_rows = num_columns_out; + uint32_t num_weight_columns = num_columns_in + num_columns_out; + out_file << " " << std::dec << num_vector_delay << "\n"; + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << " " << std::dec << 4 << "\n"; + out_file << " " << std::dec << 4 << "\n"; + } else { + out_file << " " << std::dec << num_bytes_per_weight << "\n"; + out_file << " " << std::dec << num_bytes_per_bias << "\n"; + } + if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << weight_scale_factor << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + } + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.recurrent.ptr_weights, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.recurrent.ptr_biases, ptr_dnn_memory_) << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.recurrent.ptr_feedbacks, ptr_dnn_memory_) << "\n"; + if (num_bytes_per_weight == 1) { + int8_t *ptr_weight = reinterpret_cast(component[i].op.recurrent.ptr_weights); + intel_compound_bias_t + *ptr_bias = reinterpret_cast(component[i].op.recurrent.ptr_biases); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + out_file << " "; + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + float val = + static_cast(ptr_weight[row * num_weight_columns + col]) * ptr_bias[col].multiplier + / weight_scale_factor; + out_file << std::setprecision(12) << std::scientific << val << " "; + } else { + out_file << "0x" << std::setfill('0') << std::setw(2) << std::hex + << int((uint8_t) ptr_weight[row * num_weight_columns + col]) << " "; + } + } + out_file << "\n"; + } +#endif + } else if (num_bytes_per_weight == 2) { + int16_t *ptr_weight = reinterpret_cast(component[i].op.recurrent.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + out_file << " "; + for (uint32_t col = 0; col < num_weight_columns; col++) { + if (number_type == kDnnFloat) { + out_file << std::setprecision(12) << std::scientific + << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " "; + } else { + out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_weight[row * num_weight_columns + col] << " "; + } + } + out_file << "\n"; + } +#endif + } else if (number_type_ == kDnnFloat) { + float *ptr_weight = reinterpret_cast(component[i].op.recurrent.ptr_weights); +#ifdef DUMP_WB + for (uint32_t row = 0; row < num_weight_rows; row++) { + out_file << " "; + for (uint32_t col = 0; col < num_weight_columns; col++) { + out_file << std::setprecision(12) << std::scientific + << ptr_weight[row * num_weight_columns + col] << " "; + } + out_file << "\n"; + } +#endif + } else { + fprintf(stderr, "Unsupported weight type in WriteDnnText!\n"); + throw -1; + } + if (number_type_ == kDnnInt) { + if (number_type == kDnnInt) { + if (num_bytes_per_weight == 1) { + intel_compound_bias_t + *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + 
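
// [editorial sketch] For the recurrent case above, the dumped weight matrix has
// num_columns_out rows and num_columns_in + num_columns_out columns: one row per
// output element over the concatenation of the current input and the feedback
// (previous output). Sizes below are assumed, for illustration only.
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t num_columns_in = 128;   // assumed input width
    uint32_t num_columns_out = 64;   // assumed output / feedback width
    uint32_t num_weight_rows = num_columns_out;
    uint32_t num_weight_columns = num_columns_in + num_columns_out;
    std::printf("recurrent weights: %u x %u = %u values\n",
                num_weight_rows, num_weight_columns,
                num_weight_rows * num_weight_columns);
    return 0;
}
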
out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex + << ptr_biases[col].bias << " "; + out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex + << ptr_biases[col].multiplier << " "; + } +#endif + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[col] + << " "; + } +#endif + } + } else { + int32_t *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << std::setprecision(12) << std::scientific + << ptr_biases[col] / output_scale_factor << " "; + } +#endif + } + } else { + float *ptr_biases = reinterpret_cast(component[i].op.recurrent.ptr_biases); + out_file << "" << " "; +#ifdef DUMP_WB + for (uint32_t col = 0; col < num_columns_out; col++) { + out_file << std::setprecision(12) << std::scientific << ptr_biases[col] << " "; + } +#endif + } + out_file << "\n"; + } + break; + case kDnnMaxPoolOp: { + uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1; + out_file << " " << std::dec << num_pool_type << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.num_inputs << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.num_inputs_step << "\n"; + out_file << " " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n"; + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + } + break; + case kDnnPiecewiselinearOp: { + intel_pwl_segment_t *ptr_segment = component[i].op.pwl.ptr_segments; + DnnActivationType func_id = component[i].op.pwl.func_id.type; + uint32_t num_segments = component[i].op.pwl.num_segments; + float output_scale_factor = component[i].output_scale_factor; + out_file << " " << intel_dnn_activation_name[func_id] << "\n"; + out_file << " " << std::dec << sizeof(int16_t) << "\n"; + out_file << " " << std::dec << sizeof(int16_t) << "\n"; + out_file << " " << std::dec << sizeof(int32_t) << "\n"; + if (number_type == kDnnFloat) { + out_file << std::setprecision(12) << std::scientific << " " << 1.0 << "\n"; + out_file << " " << std::dec << 0 << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n"; + } else { + out_file << std::setprecision(12) << std::scientific << " " + << output_scale_factor << "\n"; + out_file << " " << std::dec << num_segments << "\n"; + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n"; + if (number_type_ == kDnnInt) { + out_file << " "; + for (int segment = 0; segment < num_segments; segment++) { + out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_segment[segment].slope << " "; + } + out_file << "\n"; + out_file << " "; + for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) { + out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex + << ptr_segment[segment].yBase << " "; + } + out_file << "\n"; + out_file << " "; + for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) { + out_file << "0x" << std::setfill('0') << 
std::setw(8) << std::hex + << ptr_segment[segment].xBase << " "; + } + out_file << "\n"; + } else if (num_segments > 0) { + fprintf(stderr, + "Number of segments must be zero in floating point model in WriteDnnText!\n"); + throw -1; + } + } + } + break; + case kDnnInterleaveOp: + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + break; + case kDnnDeinterleaveOp: + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + break; + case kDnnCopyOp: + out_file << std::setprecision(12) << std::scientific << " " + << component[i].output_scale_factor << "\n"; + out_file << " " << std::dec << component[i].op.copy.num_copy_rows << "\n"; + out_file << " " << std::dec << component[i].op.copy.num_copy_columns << "\n"; + break; + default: + out_file << " Unsupported Component : " + << intel_dnn_operation_name[component[i].operation] << "\n"; + // fprintf(stderr, "Component type %s not yet supported in AmIntelDnn::WriteDnnText()!\n", + // intel_dnn_operation_name[component[i].operation]); + // throw -1; + break; + } + } + if (ptr_active_outputs() != nullptr) { + out_file << " " << "0x" << std::setfill('0') << std::setw(8) << std::hex + << MemoryOffset(ptr_active_outputs(), ptr_dnn_memory_) << "\n"; + } + out_file << "\n"; + out_file.close(); + } else { + fprintf(stderr, "Failed to open %s for writing!\n", filename); + throw -1; + } +} + +void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) { + intel_nnet_layer_t *pLayer; + + if (ptr_nnet == nullptr) + THROW_GNA_EXCEPTION << "Invalid input parameter"; + if (component.empty()) + THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::FillGNAStruct()"; + + ptr_nnet->nLayers = 0; + for (auto && c : component) { + if (c.operation == kDnnAffineOp + || (c.operation == kDnnDiagonalOp) + || (c.operation == kDnnConvolutional1dOp) + || (c.operation == kDnnDeinterleaveOp) + || (c.operation == kDnnInterleaveOp) + || (c.operation == kDnnRecurrentOp) + || (c.operation == kDnnCopyOp) + ) { + ptr_nnet->nLayers++; + } + } + ptr_nnet->nGroup = num_group_in(); + ptr_nnet->pLayers = reinterpret_cast(_mm_malloc(ptr_nnet->nLayers * sizeof(intel_nnet_layer_t), 64)); + if (ptr_nnet->pLayers == nullptr) + THROW_GNA_EXCEPTION << "out of memory in AmIntelDnn::FillGNAStruct()"; + pLayer = ptr_nnet->pLayers; + + for (int i = 0; i < component.size(); i++) { + // std::cout << "Component + " << i <<"=GNA_" << std::distance(ptr_nnet->pLayers, pLayer) << "\n"; + switch (component[i].operation) { + case kDnnAffineOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed + pLayer->nLayerKind = INTEL_AFFINE; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE layer structure."; + } + auto pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + pAffineLayer->pwl.pSegments = nullptr; + 
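
// [editorial sketch] InitGNAStruct above allocates one intel_nnet_layer_t per
// affine/diagonal/convolutional/recurrent/interleave/deinterleave/copy component;
// piecewise-linear and max-pool components are folded into the preceding layer,
// and pLayer only advances when the following component is not a PWL. A standalone
// sketch of that counting rule over an assumed component sequence (names are
// hypothetical, not the patch's types):
#include <vector>
#include <string>
#include <iostream>

int main() {
    // assumed component stream: affine+PWL, conv1d+maxpool+PWL, copy
    std::vector<std::string> ops = {"affine", "pwl", "conv1d", "maxpool", "pwl", "copy"};
    int gna_layers = 0;
    for (const auto &op : ops) {
        if (op != "pwl" && op != "maxpool") {
            ++gna_layers;  // only primitive ops create a GNA layer
        }
    }
    std::cout << "components: " << ops.size()
              << ", GNA layers: " << gna_layers << "\n";  // 6 components -> 3 layers
    return 0;
}
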
pAffineLayer->pwl.nSegments = 0; + + pAffineLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias; + pAffineLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight; + pAffineLayer->affine.pBiases = component[i].op.affine.ptr_biases; + pAffineLayer->affine.pWeights = component[i].op.affine.ptr_weights; + } + if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) { + pLayer++; + } + break; + case kDnnDiagonalOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed + pLayer->nLayerKind = INTEL_AFFINE_DIAGONAL; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE_DIAGONAL layer structure."; + } + auto pDiagonalLayer = reinterpret_cast(pLayer->pLayerStruct); + pDiagonalLayer->pwl.pSegments = nullptr; + pDiagonalLayer->pwl.nSegments = 0; + + pDiagonalLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias; + pDiagonalLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight; + pDiagonalLayer->affine.pBiases = component[i].op.affine.ptr_biases; + pDiagonalLayer->affine.pWeights = component[i].op.affine.ptr_weights; + } + if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) { + pLayer++; + } + break; + case kDnnRecurrentOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed + pLayer->nLayerKind = INTEL_RECURRENT; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_recurrent_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_RECURRENT layer structure."; + } + auto pRecurrentLayer = reinterpret_cast(pLayer->pLayerStruct); + pRecurrentLayer->pFeedbackBuffer = component[i].op.recurrent.ptr_feedbacks; + pRecurrentLayer->pwl.pSegments = nullptr; + pRecurrentLayer->pwl.nSegments = 0; + + pRecurrentLayer->affine.nBytesPerBias = component[i].op.recurrent.num_bytes_per_bias; + pRecurrentLayer->affine.nBytesPerWeight = component[i].op.recurrent.num_bytes_per_weight; + pRecurrentLayer->affine.pBiases = component[i].op.recurrent.ptr_biases; + pRecurrentLayer->affine.pWeights = component[i].op.recurrent.ptr_weights; + } + if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) { + pLayer++; + } + break; + case 
kDnnConvolutional1dOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = component[i].ptr_outputs; + pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten + pLayer->nLayerKind = INTEL_CONVOLUTIONAL; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_CONVOLUTIONAL layer structure."; + } + auto pConvolutionalLayer = reinterpret_cast(pLayer->pLayerStruct); + pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias; + pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight; + pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters; + pConvolutionalLayer->nFilterRows = component[i].op.conv1D.num_filter_rows; + pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients; + pConvolutionalLayer->nFeatureMaps = component[i].op.conv1D.num_feature_maps; + pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows; + pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns; + pConvolutionalLayer->poolType = INTEL_NO_POOLING; // will be overwritten + pConvolutionalLayer->nPoolSize = 0; // will be overwritten + pConvolutionalLayer->nPoolStride = 0; // will be overwritten + pConvolutionalLayer->pwl.nSegments = 0; // will be overwritten + pConvolutionalLayer->pwl.pSegments = nullptr; // will be overwritten + pConvolutionalLayer->pBiases = component[i].op.conv1D.ptr_biases; + pConvolutionalLayer->pFilters = component[i].op.conv1D.ptr_filters; + } + if (i == component.size() - 1 || ((component[i + 1].operation != kDnnMaxPoolOp) + && (component[i + 1].operation != kDnnPiecewiselinearOp))) { + pLayer++; + } + break; + case kDnnMaxPoolOp: + if (i == 0) { + THROW_GNA_EXCEPTION << "Pooling component with no preceeding component"; + } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) { + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION "INTEL_CONVOLUTIONAL layer structure was not initialized."; + } + auto pConvolutionalLayer = reinterpret_cast(pLayer->pLayerStruct); + // it is possible to have activation preceding to maxpool + if (pConvolutionalLayer->pwl.nSegments != 0) { + THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i; + } else { + pConvolutionalLayer->poolType = + (component[i].op.maxpool.do_sum_not_max) ? 
INTEL_SUM_POOLING : INTEL_MAX_POOLING; + pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs; + pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step; + + + // number of output columns correction - based on GNA-library expectations + auto nFltSize = pConvolutionalLayer->nFilterCoefficients; + auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns; // always move 1 "row" + auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1; + // FLAT input matrix, pooled outputs per filter + pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1); + + // old code + // pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride; + } + } else { + THROW_GNA_EXCEPTION << "Pooling component applied to non-convolutional layer"; + } + break; + case kDnnPiecewiselinearOp: + pLayer->pOutputs = component[i].ptr_outputs; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << pLayer->nLayerKind << " layer structure was not initialized."; + } + if (i == 0) { + THROW_GNA_EXCEPTION << "PWL component with no preceding component."; + } else if ((component[i - 1].operation == kDnnAffineOp) + || (component[i - 1].operation == kDnnDiagonalOp)) { + auto pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + pAffineLayer->pwl.nSegments = component[i].op.pwl.num_segments; + pAffineLayer->pwl.pSegments = component[i].op.pwl.ptr_segments; + } else if (component[i - 1].operation == kDnnRecurrentOp) { + auto pRecurrentLayer = reinterpret_cast(pLayer->pLayerStruct); + pRecurrentLayer->pwl.nSegments = component[i].op.pwl.num_segments; + pRecurrentLayer->pwl.pSegments = component[i].op.pwl.ptr_segments; + } else if ((component[i - 1].operation == kDnnConvolutional1dOp) + || ((component[i - 1].operation == kDnnMaxPoolOp) + && (component[i - 2].operation == kDnnConvolutional1dOp))) { + auto pConvolutionalLayer = reinterpret_cast(pLayer->pLayerStruct); + pConvolutionalLayer->pwl.nSegments = component[i].op.pwl.num_segments; + pConvolutionalLayer->pwl.pSegments = component[i].op.pwl.ptr_segments; + if (component[i - 1].operation != kDnnMaxPoolOp) { + pLayer->nOutputColumns = component[i].num_columns_out; + } + } + pLayer++; + + break; + case kDnnInterleaveOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = nullptr; + pLayer->pOutputs = component[i].ptr_outputs; + pLayer->nLayerKind = INTEL_INTERLEAVE; + pLayer->pLayerStruct = nullptr; + pLayer++; + break; + case kDnnDeinterleaveOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = nullptr; + pLayer->pOutputs = component[i].ptr_outputs; + 
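
// [editorial sketch] The max-pooling branch above recomputes the convolutional
// layer's nOutputColumns: first the number of convolution outputs per filter
// (maxNCOE) for the flat input matrix, then how many survive pooling at the given
// stride, times the number of filters. The numbers below are assumed, for
// illustration only.
#include <cstdint>
#include <cstdio>

int main() {
    uint32_t nInputColumns = 480;   // assumed flat input width
    uint32_t nFltSize = 96;         // nFilterCoefficients
    uint32_t fltStrideSz = 48;      // nFeatureMaps * nFeatureMapColumns (one "row")
    uint32_t nFilters = 4;
    uint32_t nPoolStride = 3;

    uint32_t maxNCOE = (nInputColumns - nFltSize) / fltStrideSz + 1;         // 9 conv outputs per filter
    uint32_t nOutputColumns = nFilters * ((maxNCOE - 1) / nPoolStride + 1);  // 4 * 3 = 12
    std::printf("maxNCOE=%u, pooled output columns=%u\n", maxNCOE, nOutputColumns);
    return 0;
}
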
pLayer->nLayerKind = INTEL_DEINTERLEAVE; + pLayer->pLayerStruct = nullptr; + pLayer++; + break; + case kDnnCopyOp: + pLayer->nInputRows = component[i].num_rows_in; + pLayer->nInputColumns = component[i].num_columns_in; + pLayer->nOutputRows = component[i].num_rows_out; + pLayer->nOutputColumns = component[i].num_columns_out; + pLayer->nBytesPerInput = component[i].num_bytes_per_input; + pLayer->nBytesPerOutput = component[i].num_bytes_per_output; + pLayer->nBytesPerIntermediateOutput = sizeof(int32_t); + pLayer->pInputs = component[i].ptr_inputs; + pLayer->pOutputsIntermediate = nullptr; + pLayer->pOutputs = component[i].ptr_outputs; + pLayer->nLayerKind = INTEL_COPY; + pLayer->pLayerStruct = nullptr; + { + pLayer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64); + if (pLayer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << pLayer->nLayerKind << " could not allocate memory for INTEL_COPY layer structure."; + } + auto *pCopyLayer = reinterpret_cast(pLayer->pLayerStruct); + pCopyLayer->nCopyRows = component[i].op.copy.num_copy_rows; + pCopyLayer->nCopyCols = component[i].op.copy.num_copy_columns; + } + pLayer++; + break; + default: { + THROW_GNA_EXCEPTION << "GNA does yet not support " << intel_dnn_operation_name[component[i].operation]; + } + } + } + // enable debugging of partial array of components + ptr_nnet->nLayers = std::distance(ptr_nnet->pLayers, pLayer); +} + +void AmIntelDnn::DestroyGNAStruct(intel_nnet_type_t *ptr_nnet) { + ptr_nnet->nGroup = 0; + if (ptr_nnet->pLayers != nullptr) { + for (int i = 0; i < ptr_nnet->nLayers; i++) { + switch (ptr_nnet->pLayers[i].nLayerKind) { + case INTEL_AFFINE:break; + case INTEL_AFFINE_DIAGONAL:break; + case INTEL_RECURRENT:break; + case INTEL_CONVOLUTIONAL:break; + case INTEL_INTERLEAVE:break; + case INTEL_DEINTERLEAVE:break; + case INTEL_COPY:break; + default:break; + } + if (ptr_nnet->pLayers[i].pLayerStruct != nullptr) { + _mm_free(ptr_nnet->pLayers[i].pLayerStruct); + } + } + if (ptr_nnet->pLayers != nullptr) { + _mm_free(ptr_nnet->pLayers); + } + } + ptr_nnet->nLayers = 0; +} + +void AmIntelDnn::GetScaledOutput(float *ptr_output, uint32_t component_index) { + if (component_index > num_components()) { + fprintf(stderr, "Illegal component index %d in GetScaledOutput\n", component_index); + throw -1; + } + if (ptr_output != nullptr) { + float scale_factor = OutputScaleFactor(component_index); + uint32_t num_elements = component[component_index].num_rows_out * component[component_index].num_columns_out; + if (number_type_ == kDnnFloat) { + float *ptr_input = reinterpret_cast(component[component_index].ptr_outputs); + for (uint32_t i = 0; i < num_elements; i++) { + ptr_output[i] = ptr_input[i] / scale_factor; + } + } else if (component[component_index].num_bytes_per_output == 2) { + int16_t *ptr_input = reinterpret_cast(component[component_index].ptr_outputs); + for (uint32_t i = 0; i < num_elements; i++) { + ptr_output[i] = static_cast(ptr_input[i]) / scale_factor; + } + } else { + int32_t *ptr_input = reinterpret_cast(component[component_index].ptr_outputs); + for (uint32_t i = 0; i < num_elements; i++) { + ptr_output[i] = static_cast(ptr_input[i]) / scale_factor; + } + } + } else { + fprintf(stderr, "Output pointer is nullptr in GetScaledOutput\n"); + throw -1; + } +} + +void AmIntelDnn::WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet) { +#ifdef LIGHT_DUMP + if (nnet) { + for (int i = 0; i < nnet->nLayers; i++) { + auto component = nnet->pLayers; + std::stringstream out_file_name; + auto getLayerType = [](intel_layer_kind_t 
kind){ + switch (kind){ + case INTEL_AFFINE : return "affine"; + case INTEL_AFFINE_DIAGONAL : return "diag"; + case INTEL_RECURRENT : return "recurrent"; + case INTEL_CONVOLUTIONAL : return "convolution"; + case INTEL_INTERLEAVE : return "interleave"; + case INTEL_DEINTERLEAVE : return "deinterleave"; + case INTEL_COPY : return "copy"; + default: return "unknown"; + } + }; + out_file_name << std::setfill('0') << std::setw(2) << i << "_" + << getLayerType(component[i].nLayerKind) + << "-" << nnet->pLayers[i].nInputRows + << "-" << nnet->pLayers[i].nOutputRows; + + auto inputfileName = getDumpFolderNameGNA() + out_file_name.str() + "_input.txt"; + auto outFileName = getDumpFolderNameGNA() + out_file_name.str() + "_output.txt"; + auto pwlFileName = getDumpFolderNameGNA() + out_file_name.str() + "_pwl.txt"; + auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt"; + + std::ofstream out_file(outFileName.c_str(), std::ios::out); + std::ofstream pwl_file(pwlFileName.c_str(), std::ios::out); + std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in); + std::ofstream in_file(inputfileName.c_str(), std::ios::out); + + float summOfDiff = 0.f; + float summOfSqDiff = 0.f; + float maxD = 0.0f; + int numItems = 0; + + auto write_pwl = [&pwl_file](intel_pwl_func_t & pwl) { + for (int k =0; k < pwl.nSegments; k++) { + pwl_file << pwl.pSegments[k].slope << ", " << pwl.pSegments[k].xBase << ", " << pwl.pSegments[k].yBase << "\n"; + } + }; + if (nnet->pLayers[i].nLayerKind == INTEL_AFFINE || nnet->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL) { + auto affine = reinterpret_cast(nnet->pLayers[i].pLayerStruct); + write_pwl(affine->pwl); + } + if (nnet->pLayers[i].nLayerKind == INTEL_CONVOLUTIONAL) { + auto conv = reinterpret_cast(nnet->pLayers[i].pLayerStruct); + write_pwl(conv->pwl); + } + + for (int k = 0; k < component[i].nOutputRows; k++) { + for (int j = 0; j < component[i].nOutputColumns; j++) { + float floatValue = 0.f; + if (component[i].nBytesPerOutput == 4) { + auto value = (reinterpret_cast(component[i].pOutputs)[k * component[i].nOutputColumns + j]); + floatValue = (static_cast(value) / 1.0); + } else { + auto value = reinterpret_cast(component[i].pOutputs)[k * component[i].nOutputColumns + j]; + floatValue = (static_cast(value) / 1.0); + } + out_file << std::setw(8) << floatValue << "\n"; + if (ref_out_file) { + float ref_value = 0.f; + ref_out_file >> ref_value; + float diff = (ref_value - floatValue); + diff = diff < 0 ? 
-diff : diff; + summOfDiff += diff; + summOfSqDiff += diff * diff; + maxD = std::max(maxD, diff); + numItems++; + } + } + } + if (numItems) { + auto rmse = sqrt(summOfSqDiff / numItems); + auto avg = summOfDiff / numItems; + std :: cout << std::left << std::setw(55) << out_file_name.str() + << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse + << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg + << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl; + } + + + for (int k = 0; k < component[i].nInputRows; k++) { + for (int j = 0; j < component[i].nInputColumns; j++) { + if (component[i].nBytesPerInput == 4) { + in_file << std::setw(8) + << (reinterpret_cast(component[i].pInputs)[k * component[i].nInputColumns + j]); + } else { + in_file << std::setw(8) + << (reinterpret_cast(component[i].pInputs)[k * component[i].nInputColumns + j]); + } + in_file << "\n"; + } + } + } + } +#endif +} + +void AmIntelDnn::WriteInputAndOutputText() { +#ifdef LIGHT_DUMP + for (int i = 0; i < num_components(); i++) { + std::stringstream out_file_name; + out_file_name << std::setfill('0') << std::setw(2) << i << "_" + << intel_dnn_operation_name[component[i].operation] + << "-" << component[i].num_rows_in + << "-" << component[i].num_rows_out; + if (component[i].operation == kDnnPiecewiselinearOp) { + out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id]; + } + auto inputfileName = getDumpFolderName() + out_file_name.str() + "_input.txt"; + auto outFileName = getDumpFolderName() + out_file_name.str() + "_output.txt"; + auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt"; + + std::ofstream out_file(outFileName.c_str(), std::ios::out); + std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in); + std::ofstream in_file(inputfileName.c_str(), std::ios::out); + + float summOfDiff = 0.f; + float summOfSqDiff = 0.f; + float maxD = 0.0f; + int numItems = 0; + + for (int k = 0; k < component[i].num_rows_out; k++) { + for (int j = 0; j < component[i].num_columns_out; j++) { + float floatValue = 0.f; + if (component[i].num_bytes_per_output == 4) { + if (number_type_ == kDnnInt) { + auto value = (reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]); + // out_file << std::setw(8) << value << "\n"; + floatValue = (static_cast(value) / component[i].output_scale_factor); + + } else { + floatValue = (reinterpret_cast(component[i].ptr_outputs)[ + k * component[i].num_columns_out+ j]) / component[i].output_scale_factor; + } + } else { + auto value = reinterpret_cast(component[i].ptr_outputs)[k * component[i].num_columns_out+ j]; + // out_file << std::setw(8) << value << "\n"; + floatValue = (static_cast(value) / component[i].output_scale_factor); + } + out_file << std::setw(8) << floatValue << "\n"; + if (ref_out_file) { + float ref_value = 0.f; + ref_out_file >> ref_value; + float diff = (ref_value - floatValue); + diff = diff < 0.f ? 
-diff : diff; + summOfDiff += diff; + summOfSqDiff += diff * diff; + maxD = std::max(maxD, diff); + numItems++; + } + } + } + if (numItems) { + auto rmse = sqrt(summOfSqDiff / numItems); + auto avg = summOfDiff / numItems; + std :: cout << std::left << std::setw(55) << out_file_name.str() + << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse + << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg + << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl; + } + + + for (int k = 0; k < component[i].num_rows_in; k++) { + for (int j = 0; j < component[i].num_columns_in; j++) { + if (component[i].num_bytes_per_input == 4) { + if (number_type_ == kDnnInt) { + in_file << std::setw(8) + << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + + j]); + } else { + in_file << std::setw(8) + << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + + j]); + } + } else { + in_file << std::setw(8) + << (reinterpret_cast(component[i].ptr_inputs)[k * component[i].num_columns_in + + j]); + } + in_file << "\n"; + } + } +#endif + } +} + +bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2) { + bool isCompatible = true; + + // compare basic structures to see if they are compatible + if (dnn1.num_components() != dnn2.num_components()) isCompatible = false; + for (int i = 0; i < dnn1.num_components(); i++) { + if (dnn1.component[i].num_rows_in != dnn2.component[i].num_rows_in) isCompatible = false; + if (dnn1.component[i].num_columns_in != dnn2.component[i].num_columns_in) isCompatible = false; + if (dnn1.component[i].num_rows_out != dnn2.component[i].num_rows_out) isCompatible = false; + if (dnn1.component[i].num_columns_out != dnn2.component[i].num_columns_out) isCompatible = false; + if (dnn1.component[i].operation != dnn2.component[i].operation) isCompatible = false; + } + + return (isCompatible); +} + +void ClearScoreError(intel_score_error_t *error) { + error->num_scores = 0; + error->num_errors = 0; + error->max_error = 0.0; + error->sum_error = 0.0; + error->sum_squared_error = 0.0; + error->max_rel_error = 0.0; + error->sum_rel_error = 0.0; + error->sum_squared_rel_error = 0.0; +} + +void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error) { + total_error->num_errors += error->num_errors; + total_error->num_scores += error->num_scores; + total_error->sum_error += error->sum_error; + total_error->sum_squared_error += error->sum_squared_error; + if (error->max_error > total_error->max_error) { + total_error->max_error = error->max_error; + } + total_error->sum_rel_error += error->sum_rel_error; + total_error->sum_squared_rel_error += error->sum_squared_rel_error; + if (error->max_rel_error > total_error->max_rel_error) { + total_error->max_rel_error = error->max_rel_error; + } +} + +void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs) { + // Assumes input vector contains log likelihoods + // The computes x[i] = x[i] - log(sum_j exp(x[j])) + // This normalizes the likelihoods by the sum of likelihoods but stores them as log likelihoods + + float max_score = ptr_input[0]; + float sum = 0.0; + float diff; + // find max score for normalization to [0,1] + for (uint32_t i = 0; i < num_inputs; i++) { + if (ptr_input[i] > max_score) { + max_score = ptr_input[i]; + } + } + for (uint32_t i = 0; i < num_inputs; i++) { + sum += exp(ptr_input[i] - max_score); + } + if (sum < 
1.0e-20) { + fprintf(stderr, "Warning: attempt to take log(0) in SoftmaxGoogle()!\n"); + sum = 1.0e-20; + } + diff = max_score + log(sum); + for (uint32_t i = 0; i < num_outputs; i++) { + ptr_output[i] = ptr_input[i] - diff; + } +} diff --git a/inference-engine/src/gna_plugin/dnn.h b/inference-engine/src/gna_plugin/dnn.h new file mode 100644 index 00000000000000..8a1506dbe88ef9 --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn.h @@ -0,0 +1,823 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gna-api.h" + +#define DNN_MAX_BATCH_SIZE 8 +#define DNN_MAX_INPUTS 3072 +#define DNN_MAX_OUTPUTS 8192 +#define DNN_MAX_ERROR 1.0e-4f +#define DNN_NUM_BYTES_INT_BIAS 4 +#define DNN_NUM_BYTES_INT_AFFINE_OUT 4 +#define DNN_RAND_INT8_AMPLITUDE 127.0f +#define DNN_RAND_INT16_AMPLITUDE 16384.0f +#define DNN_RAND_INT32_AMPLITUDE 1048576.0f +#define DNN_RAND_FLOAT32_AMPLITUDE 8.0f + +enum DnnActivationType { + kActNone, + kActSigmoid, + kActTanh, + kActRelu, + kActLeakyRelu, + kActIdentity, + kActKaldiLstmClipping, + kActCustom, + kActNumType +}; +struct DnnActivation { + // for prelu + DnnActivationType type; + float negative_slope; + operator DnnActivationType () const noexcept { + return type; + } + static DnnActivation fromType(DnnActivationType type) { + DnnActivation activation; + activation.type = type; + activation.negative_slope = 0.0f; + return activation; + } +}; + +static_assert(std::is_trivial::value, "DnnActivation is not trival type"); + +static const char *intel_dnn_activation_name[kActNumType] = { + "kActNone", + "kActSigmoid", + "kActTanh", + "kActRelu", + "kActLeakyRelu", + "kActIdentity", + "kActKaldiLstmClipping", + "kActCustom" +}; + +typedef enum DnnSoftmaxType { + kSoftmaxNone, + kSoftmaxKaldiSumgroup, + kSoftmaxEesen, + kSoftmaxGoogle, + kSoftmaxNumType +} intel_dnn_softmax_type_t; + +static const char *intel_dnn_softmax_name[kSoftmaxNumType] = { + "kSoftmaxNone", + "kSoftmaxKaldiSumGroup", + "kSoftmaxKaldiApplyLog", + "kSoftmaxGoogle" +}; + +typedef enum { + kDnnUnknownOrientation, + kDnnInterleavedOrientation, + kDnnNonInterleavedOrientation, + kDnnNumOrientation +} intel_dnn_orientation_t; + +typedef enum { + kDnnNullOp, + kDnnAffineOp, + kDnnDiagonalOp, + kDnnConvolutional1dOp, + kDnnPiecewiselinearOp, + kDnnMaxPoolOp, + kDnnRecurrentOp, + kDnnInterleaveOp, + kDnnDeinterleaveOp, + kDnnCopyOp, + kDnnNumOp +} intel_dnn_operation_t; + +static const char *intel_dnn_operation_name[kDnnNumOp] = { + "kDnnNullOp", + "kDnnAffineOp", + "kDnnDiagonalOp", + "kDnnConvolutional1dOp", + "kDnnPiecewiselinearOp", + "kDnnMaxPoolOp", + "kDnnRecurrentOp", + "kDnnInterleaveOp", + "kDnnDeinterleaveOp", + "kDnnCopyOp" +}; + +typedef enum { + kDnnMacroOpNone, + kDnnMacroOpLstm, + kDnnMacroOpBiLstm, + kDnnNumMacroOp +} intel_dnn_macro_operation_t; + +static const char *intel_dnn_macro_operation_name[kDnnNumMacroOp] = { + "kDnnMacroOpNone", + "kDnnMacroOpLstm", + "kDnnMacroOpBiLstm" +}; + +typedef enum { + kDnnFloat, + kDnnInt, + kDnnNumNumberType +} intel_dnn_number_type_t; + +static const char *intel_dnn_number_type_name[kDnnNumNumberType] = { + "kDnnFloat", + "kDnnInt" +}; + +typedef struct { + uint32_t num_bytes_per_weight; + uint32_t num_bytes_per_bias; + float weight_scale_factor; + void *ptr_weights; + void *ptr_biases; +} intel_affine_t; + +typedef struct { + uint32_t num_bytes_per_weight; + uint32_t 
num_bytes_per_bias; + uint32_t num_filters; + uint32_t num_filter_rows; + uint32_t num_filter_coefficients; + uint32_t num_feature_maps; + uint32_t num_feature_map_rows; + uint32_t num_feature_map_columns; + float weight_scale_factor; + void *ptr_filters; // filters stored one after the other + void *ptr_biases; +} intel_convolutionalD_t; + +typedef struct { + uint32_t num_inputs; // pool size + uint32_t num_inputs_step; // pool step + uint32_t num_inputs_stride; // pool stride (number of convolution filters) + bool do_sum_not_max; +} intel_maxpool_t; + +typedef struct { + DnnActivation func_id; // identifies function being approximated + uint32_t num_segments; + intel_pwl_segment_t *ptr_segments; +} intel_piecewiselinear_t; + +typedef struct { + uint32_t num_vector_delay; + uint32_t num_bytes_per_weight; + uint32_t num_bytes_per_bias; + float weight_scale_factor; + void *ptr_feedbacks; + void *ptr_weights; + void *ptr_biases; +} intel_recurrent_t; + +typedef struct { +} intel_interleave_t; + +typedef struct { +} intel_deinterleave_t; + +typedef struct { + uint32_t num_copy_columns; // number of columns to copy + uint32_t num_copy_rows; // number of rows to copy +} intel_copy_t; + +typedef struct { + uint32_t num_rows_in; + uint32_t num_columns_in; + uint32_t num_rows_out; + uint32_t num_columns_out; + uint32_t num_bytes_per_input; + uint32_t num_bytes_per_output; + intel_dnn_operation_t operation; + intel_dnn_macro_operation_t macro_operation; + intel_dnn_orientation_t orientation_in; + intel_dnn_orientation_t orientation_out; + union operation_struct_t { + intel_affine_t affine; + intel_convolutionalD_t conv1D; + intel_maxpool_t maxpool; + intel_piecewiselinear_t pwl; + intel_recurrent_t recurrent; + intel_interleave_t interleave; + intel_deinterleave_t deinterleave; + intel_copy_t copy; + } op; + void *ptr_inputs; + void *ptr_outputs; + float output_scale_factor; +} intel_dnn_component_t; + +typedef struct { + uint32_t num_scores; + uint32_t num_errors; + float threshold; + float max_error; + float rms_error; + float sum_error; + float sum_rms_error; + float sum_squared_error; + float max_rel_error; + float sum_rel_error; + float sum_squared_rel_error; +} intel_score_error_t; + +class AmIntelDnn { + public: + AmIntelDnn() + : ptr_active_outputs_(NULL), + num_active_outputs_(0), + input_scale_factor_(1.0), + num_left_context(0), + num_right_context(0), + do_rotate_input(false), + num_rotate_rows(0), + num_rotate_columns(0), + softmax_type(kSoftmaxNone), + ptr_sumgroup_sizes(NULL), + num_sumgroup_sizes(0), + ptr_priors(NULL) { + } + + ~AmIntelDnn() { + component.clear(); + if (ptr_sumgroup_sizes != NULL) { + _mm_free(ptr_sumgroup_sizes); + } + if (ptr_priors != NULL) { + _mm_free(ptr_priors); + } + } + + uint32_t num_components() { return (uint32_t) component.size(); } + + void Init(void *ptr_memory, uint32_t num_memory_bytes, intel_dnn_number_type_t number_type, float scale_factor); + void InitActiveList(uint32_t *ptr_active_list); + + template + static void InitAffineComponent(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs, + C *&ptr_weights, + D *&ptr_biases, + bool isDiag = false) { + InitAffineComponentPrivate(comp, + num_rows_in, + num_columns, + num_rows_out, + num_bytes_per_input, + num_bytes_per_output, + 
num_bytes_per_weight, + num_bytes_per_bias, + weight_scale_factor, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_weights, + (void *&) ptr_biases, + isDiag, + true); + } + + template + void InitAffineComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs, + C *&ptr_weights, + D *&ptr_biases, + bool isDiag = false) { + InitAffineComponentPrivate(component[component_index], + num_rows_in, + num_columns, + num_rows_out, + num_bytes_per_input, + num_bytes_per_output, + num_bytes_per_weight, + num_bytes_per_bias, + weight_scale_factor, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_weights, + (void *&) ptr_biases, + isDiag, + false); + } + + void InitDiagonalComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases); + + template + void InitConvolutional1DComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + A *& ptr_inputs, + B *& ptr_outputs, + C *& ptr_filters, + D *& ptr_biases) { + InitConvolutional1DComponentPrivate(component[component_index], + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_bytes_per_weight, + num_bytes_per_bias, + num_filters, + num_filter_rows, + num_filter_coefficients, + num_feature_maps, + num_feature_map_rows, + num_feature_map_columns, + weight_scale_factor, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_filters, + (void *&) ptr_biases, + false); + } + + template + static void InitConvolutional1DComponent(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + A *& ptr_inputs, + B *& ptr_outputs, + C *& ptr_filters, + D *& ptr_biases) { + InitConvolutional1DComponentPrivate(comp, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_bytes_per_weight, + num_bytes_per_bias, + num_filters, + num_filter_rows, + num_filter_coefficients, + num_feature_maps, + num_feature_map_rows, + num_feature_map_columns, + weight_scale_factor, + 
output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + (void *&) ptr_filters, + (void *&) ptr_biases, + true); + } + + + + // TODO: this function accepts component_index and is only used in legacy code + void InitMaxpoolComponent(uint32_t component_index, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + void * ptr_inputs, + void * ptr_outputs) { + InitMaxpoolComponentPrivate(component[component_index], + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_pool_size, + num_pool_step, + num_pool_stride, + do_sum_not_max, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + false); + } + + template + static void InitMaxpoolComponent(intel_dnn_component_t &cmp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs) { + InitMaxpoolComponentPrivate(cmp, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + num_pool_size, + num_pool_step, + num_pool_stride, + do_sum_not_max, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + true); + } + + + + + void InitPiecewiseLinearComponent(uint32_t component_index, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + void * ptr_inputs, + void * ptr_outputs, + intel_pwl_segment_t *ptr_segments) { + InitPiecewiseLinearComponentPrivate(component[component_index], + function_id, + orientation, + num_rows, + num_columns, + num_bytes_per_input, + num_bytes_per_output, + num_segments, + output_scale_factor, + ptr_inputs, + ptr_outputs, + ptr_segments, + false); + } + template + static void InitPiecewiseLinearComponent(intel_dnn_component_t &cmp, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + A *&ptr_inputs, + B *&ptr_outputs, + intel_pwl_segment_t *ptr_segments) { + InitPiecewiseLinearComponentPrivate(cmp, + function_id, + orientation, + num_rows, + num_columns, + num_bytes_per_input, + num_bytes_per_output, + num_segments, + output_scale_factor, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + ptr_segments, + true); + } + + + void InitRecurrentComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns_in, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_vector_delay, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *ptr_inputs, + void *ptr_feedbacks, + void *ptr_outputs, + void *ptr_weights, + void *ptr_biases); + void InitInterleaveComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns, + uint32_t
num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs); + void InitDeinterleaveComponent(uint32_t component_index, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + void *ptr_inputs, + void *ptr_outputs); + void InitCopyComponent(uint32_t component_index, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + void *ptr_inputs, + void *ptr_outputs) { + InitCopyComponentPrivate(component[component_index], + orientation, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + output_scale_factor, + num_copy_rows, + num_copy_columns, + ptr_inputs, + ptr_outputs, + false); + } + + template + static void InitCopyComponent(intel_dnn_component_t &cmp, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + A *&ptr_inputs, + B *&ptr_outputs) { + InitCopyComponentPrivate(cmp, + orientation, + num_rows_in, + num_columns_in, + num_rows_out, + num_columns_out, + num_bytes_per_input, + num_bytes_per_output, + output_scale_factor, + num_copy_rows, + num_copy_columns, + (void *&) ptr_inputs, + (void *&) ptr_outputs, + true); + } + void AddComponents(uint32_t num_components_to_add); + void ClearComponent(uint32_t component_index); + void ClearState(); + uint32_t CopyActiveList(std::vector > &active_list, uint32_t list_index); + void Propagate(); + intel_dnn_macro_operation_t MacroOperation(uint32_t component_index); + void SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation); + float InputScaleFactor(uint32_t component_index); + float WeightScaleFactor(uint32_t component_index); + float OutputScaleFactor(uint32_t component_index) { + return OutputScaleFactor(component[component_index]); + } + float OutputScaleFactor(intel_dnn_component_t &comp); + void SetInputScaleFactor(float scale_factor) { input_scale_factor_ = scale_factor; } + void SetOutputScaleFactor(uint32_t component_index, float scale_factor); + void PrintOutputs(uint32_t component_index); + uint32_t CompareScores(void *ptr_scores, intel_score_error_t *score_error, uint32_t num_frames); + void WriteGraphWizModel(const char *filename); + void WriteDnnText(const char *filename, intel_dnn_number_type_t number_type); + uint32_t MemoryRequiredToReadDnnText(const char *filename); + void ReadDnnText(const char *filename, void *ptr_memory, uint32_t num_memory_bytes, float *ptr_scale_in); + + void InitGNAStruct(intel_nnet_type_t *ptr_nnet); + void DestroyGNAStruct(intel_nnet_type_t *ptr_nnet); + void GetScaledOutput(float *ptr_output, uint32_t component_index); + uint32_t *ptr_active_outputs() { return (ptr_active_outputs_); } + uint32_t num_active_outputs() { return (num_active_outputs_); } + uint32_t num_gna_layers() { + uint32_t num_layers = 0; + for (uint32_t i = 0; i < component.size(); i++) { + if ((component[i].operation == kDnnAffineOp) || (component[i].operation == kDnnDiagonalOp) + || (component[i].operation == 
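+ /* num_gna_layers() counts only component types that become standalone GNA layers; piecewise-linear and max-pool components are omitted here since they are realized as part of the preceding layer */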
kDnnConvolutional1dOp) || (component[i].operation == kDnnCopyOp) + || (component[i].operation == kDnnDeinterleaveOp) || (component[i].operation == kDnnInterleaveOp) + || (component[i].operation == kDnnRecurrentOp)) { + num_layers++; + } + } + return (num_layers); + } + uint32_t num_group_in() { + return ((component.size() > 0) ? ((component[0].orientation_in == kDnnInterleavedOrientation) + ? component[0].num_columns_in : component[0].num_rows_in) : 0); + } + uint32_t num_group_out() { + return ((component.size() > 0) ? ((component[component.size() - 1].orientation_out + == kDnnInterleavedOrientation) ? component[component.size() - 1].num_columns_out : component[ + component.size() - 1].num_rows_out) : 0); + } + + std::vector component; + uint32_t num_left_context; + uint32_t num_right_context; + bool do_rotate_input; + uint32_t num_rotate_rows = 0; + uint32_t num_rotate_columns = 0; + DnnSoftmaxType softmax_type; + uint32_t *ptr_sumgroup_sizes; + uint32_t num_sumgroup_sizes; + float *ptr_priors; + + void WriteInputAndOutputText(); + static void WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet); + void BeginNewWrite(); + + private: + void *ptr_dnn_memory_; + uint32_t num_bytes_dnn_memory_; + uint32_t *ptr_active_outputs_; + uint32_t num_active_outputs_; + intel_dnn_number_type_t number_type_; + float input_scale_factor_; + + static void InitCopyComponentPrivate(intel_dnn_component_t &cmp, + intel_dnn_orientation_t orientation, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + float output_scale_factor, + uint32_t num_copy_rows, + uint32_t num_copy_columns, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem); + + static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_pool_size, + uint32_t num_pool_step, + uint32_t num_pool_stride, + bool do_sum_not_max, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + bool postInitMem); + + static void InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &cmp, + DnnActivation function_id, + intel_dnn_orientation_t orientation, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_segments, + float output_scale_factor, + void *& ptr_inputs, + void *& ptr_outputs, + intel_pwl_segment_t *ptr_segments, + bool postInitMem); + + static void InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns_in, + uint32_t num_rows_out, + uint32_t num_columns_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t num_bytes_per_bias, + uint32_t num_filters, + uint32_t num_filter_rows, + uint32_t num_filter_coefficients, + uint32_t num_feature_maps, + uint32_t num_feature_map_rows, + uint32_t num_feature_map_columns, + float weight_scale_factor, + float output_scale_factor, + void *& ptr_inputs, + void *& ptr_outputs, + void *& ptr_filters, + void *& ptr_biases, + bool postInitMem); + + static void InitAffineComponentPrivate(intel_dnn_component_t &comp, + uint32_t num_rows_in, + uint32_t num_columns, + uint32_t num_rows_out, + uint32_t num_bytes_per_input, + uint32_t num_bytes_per_output, + uint32_t num_bytes_per_weight, + uint32_t 
num_bytes_per_bias, + float weight_scale_factor, + float output_scale_factor, + void *&ptr_inputs, + void *&ptr_outputs, + void *&ptr_weights, + void *&ptr_biases, + bool isDiag, + bool postInitMem); +}; + +void PlotFloatIntDnn(AmIntelDnn *dnn, AmIntelDnn *dnn_int); +bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2); +void ClearScoreError(intel_score_error_t *error); +void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error); +void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs); diff --git a/inference-engine/src/gna_plugin/dnn_memory.cpp b/inference-engine/src/gna_plugin/dnn_memory.cpp new file mode 100644 index 00000000000000..16496b5bf33c5c --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn_memory.cpp @@ -0,0 +1,30 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "dnn_memory.hpp" +#include "gna-api.h" + +void MemoryAssign(void **ptr_dest, + void **ptr_memory, + uint32_t num_bytes_needed, + uint32_t *ptr_num_bytes_used, + uint32_t num_memory_bytes, + const char *name) { + if (*ptr_num_bytes_used + ALIGN(num_bytes_needed, 64) > num_memory_bytes) { + fprintf(stderr, + "Out of memory in %s (%d+ALIGN(%d)>%d)!\n", + name, + *ptr_num_bytes_used, + num_bytes_needed, + num_memory_bytes); + throw -1; + } else { + uint8_t *ptr_bytes = reinterpret_cast(*ptr_memory); + *ptr_dest = *ptr_memory; + *ptr_memory = ptr_bytes + ALIGN(num_bytes_needed, 64); + *ptr_num_bytes_used += ALIGN(num_bytes_needed, 64); + } +} diff --git a/inference-engine/src/gna_plugin/dnn_memory.hpp b/inference-engine/src/gna_plugin/dnn_memory.hpp new file mode 100644 index 00000000000000..5ab2c961f40ade --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn_memory.hpp @@ -0,0 +1,13 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +extern void MemoryAssign(void **ptr_dest, + void **ptr_memory, + uint32_t num_bytes_needed, + uint32_t *ptr_num_bytes_used, + uint32_t num_memory_bytes, + const char *name); diff --git a/inference-engine/src/gna_plugin/dnn_traits.hpp b/inference-engine/src/gna_plugin/dnn_traits.hpp new file mode 100644 index 00000000000000..0a92bb342013b3 --- /dev/null +++ b/inference-engine/src/gna_plugin/dnn_traits.hpp @@ -0,0 +1,90 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "dnn.h" + +template +struct DnnTrait {}; + +template<> +struct DnnTrait { + using Type = intel_affine_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.affine; + } +}; + +template<> +struct DnnTrait { + using Type = intel_piecewiselinear_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.pwl; + } +}; + +template<> +struct DnnTrait { + using Type = intel_affine_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.affine; + } +}; + +template<> +struct DnnTrait { + using Type = intel_convolutionalD_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.conv1D; + } +}; + +template<> +struct DnnTrait { + using Type = intel_maxpool_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.maxpool; + } +}; + +template<> +struct DnnTrait { + using Type = intel_recurrent_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.recurrent; + } +}; + +template<> +struct 
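+/* each DnnTrait specialization maps an intel_dnn_operation_t value to the matching member of the component's op union, giving templated code a uniform way to reach the per-layer struct */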
DnnTrait { + using Type = intel_interleave_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.interleave; + } +}; + +template<> +struct DnnTrait { + using Type = intel_deinterleave_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.deinterleave; + } +}; + +template<> +struct DnnTrait { + using Type = intel_copy_t; + static Type *getLayer(intel_dnn_component_t &component) { + return &component.op.copy; + } +}; + +template<> +struct DnnTrait { + using Type = void; + static Type *getLayer(intel_dnn_component_t &component) { + return nullptr; + } +}; diff --git a/inference-engine/src/gna_plugin/floatmath.cpp b/inference-engine/src/gna_plugin/floatmath.cpp new file mode 100644 index 00000000000000..3ea41127959395 --- /dev/null +++ b/inference-engine/src/gna_plugin/floatmath.cpp @@ -0,0 +1,423 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "floatmath.h" +#include "pwl.h" +#include "gna_plugin_log.hpp" +#include + + +void CNNFilter32(intel_dnn_component_t *component) { + float *ptr_filters = reinterpret_cast(component->op.conv1D.ptr_filters); + float *ptr_biases = reinterpret_cast(component->op.conv1D.ptr_biases); + float *ptr_inputs = reinterpret_cast(component->ptr_inputs); + float *ptr_outputs = reinterpret_cast(component->ptr_outputs); + uint32_t num_group = component->num_rows_in; + uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1; + uint32_t + num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns; + uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients; + + if ((component->num_rows_in != 1) || (component->num_rows_out != 1) + || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) { + THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!"; + } + + for (uint32_t j = 0; j < num_filter_outputs; j++) { + float *ptr_in = ptr_inputs + j * num_inputs_band_stride; + for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) { + float *ptr_coef = ptr_filters + i * num_filter_coefficients; + float sum = ptr_biases[i]; + for (uint32_t k = 0; k < num_filter_coefficients; k++) { + sum += ptr_in[k] * ptr_coef[k]; + } + ptr_outputs[j * component->op.conv1D.num_filters + i] = sum; + } + } +} + +void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) { + if (number_type == kDnnInt) { + int32_t *ptr_inputs = reinterpret_cast(component->ptr_inputs); + int32_t *ptr_outputs = reinterpret_cast(component->ptr_outputs); + uint32_t num_inputs = component->num_columns_in; + uint32_t num_columns = component->op.maxpool.num_inputs_stride; + uint32_t num_pool_size = component->op.maxpool.num_inputs; + uint32_t num_pool_step = component->op.maxpool.num_inputs_step; + uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride; + uint32_t num_rows_out = num_rows_in / num_pool_step; + + for (uint32_t i = 0; i < num_columns; i++) { + int32_t m = 0; + if (component->op.maxpool.do_sum_not_max) { + uint32_t num_saturate = 0; + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + int64_t sum = 0; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? 
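+ /* clamp the pooling window so it never reads past the last input row */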
num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + sum += ptr_inputs[k * num_columns + i]; + } + if (sum > 2147483647.0) { + ptr_outputs[m * num_columns + i] = 2147483647L; + num_saturate++; + } else if (sum < -2147483648.0) { + ptr_outputs[m * num_columns + i] = -2147483648L; + num_saturate++; + } else { + ptr_outputs[m * num_columns + i] = (int32_t) sum; + } + m++; + } + if (num_saturate > 0) { + fprintf(stderr, "Warning: %d saturations in CNNMaxPool()\n", num_saturate); + } + } else { + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + int32_t max = INT32_MIN; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i]; + } + ptr_outputs[m * num_columns + i] = max; + m++; + } + } + } + } else { + float *ptr_inputs = reinterpret_cast(component->ptr_inputs); + float *ptr_outputs = reinterpret_cast(component->ptr_outputs); + uint32_t num_inputs = component->num_columns_in; + uint32_t num_columns = component->op.maxpool.num_inputs_stride; + uint32_t num_pool_size = component->op.maxpool.num_inputs; + uint32_t num_pool_step = component->op.maxpool.num_inputs_step; + uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride; + uint32_t num_rows_out = num_rows_in / num_pool_step; + + for (uint32_t i = 0; i < num_columns; i++) { + int32_t m = 0; + if (component->op.maxpool.do_sum_not_max) { + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + float sum = 0.0; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + sum += ptr_inputs[k * num_columns + i]; + } + ptr_outputs[m * num_columns + i] = sum; + m++; + } + } else { + for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) { + float max = -1e20f; + uint32_t num_end = (j + num_pool_size > num_rows_in) ? 
num_rows_in : j + num_pool_size; + for (uint32_t k = j; k < num_end; k++) { + if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i]; + } + ptr_outputs[m * num_columns + i] = max; + m++; + } + } + } + } +} + +void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) { + if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation + PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1); + } else { + PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1); + } +} + +void PwlApply16(intel_dnn_component_t *component, + uint32_t num_row_start, + uint32_t num_row_end, + uint32_t num_col_start, + uint32_t num_col_end) { + uint32_t num_saturate = 0; + uint32_t num_segments = component->op.pwl.num_segments; + if (num_segments > 0) { + intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments; + for (int i = num_row_start; i <= num_row_end; i++) { + int32_t *ptr_input = reinterpret_cast(component->ptr_inputs) + i * component->num_columns_in; + int16_t *ptr_output = reinterpret_cast(component->ptr_outputs) + i * component->num_columns_in; + for (int j = num_col_start; j <= num_col_end; j++) { + int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK); + int32_t input = ptr_input[j]; + if (input <= xbase) { + ptr_output[j] = ptr_segment[0].yBase; + } else { + uint32_t slope_shift; + int16_t slope, ybase; + int64_t diff, prod, prod_shift, sum; + uint32_t k = num_segments / 2; + uint32_t k_upper = num_segments; + uint32_t k_lower = 0; + while (k_upper > k_lower + 1) { + xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK); + if (xbase > input) { + k_upper = k; + k = (k + k_lower) / 2; + } else { + k_lower = k; + k = (k_upper + k) / 2; + } + } + xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK); + slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8; + slope = ptr_segment[k].slope; + ybase = ptr_segment[k].yBase; + diff = (int64_t) input - (int64_t) xbase; + prod = diff * slope; + prod_shift = prod >> slope_shift; + sum = prod_shift + (int64_t) ybase; + if (sum > 32767LL) { + ptr_output[j] = 32767; + num_saturate++; + } else if (sum < -32768LL) { + ptr_output[j] = -32768; + num_saturate++; + } else { + ptr_output[j] = (int16_t) sum; + } + } + } + } + } + + if (num_saturate > 0) { + fprintf(stderr, "Warning: %d saturations in PwlApply16!\n", num_saturate); + } +} + +void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) { + if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation + PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1); + } else { + PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1); + } +} + +void PwlApply32(intel_dnn_component_t *component, + uint32_t num_row_start, + uint32_t num_row_end, + uint32_t num_col_start, + uint32_t num_col_end) { + intel_piecewiselinear_t *transform = reinterpret_cast(&component->op.pwl); + float *ptr_in = reinterpret_cast(component->ptr_inputs); + float *ptr_out = reinterpret_cast(component->ptr_outputs); + uint32_t num_columns = component->num_columns_in; + switch (transform->func_id.type) { + case kActSigmoid: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j])); + } + } + break; + case 
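+ /* the sigmoid case above uses the identity sigmoid(x) = 0.5 * (1 + tanh(x / 2)); plain tanh follows */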
kActTanh: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]); + } + } + break; + case kActRelu: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = + (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j]; + } + } + break; + case kActIdentity: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j]; + } + } + break; + case kActKaldiLstmClipping: + for (uint32_t i = num_row_start; i <= num_row_end; i++) { + for (uint32_t j = num_col_start; j <= num_col_end; j++) { + float val = ptr_in[i * num_columns + j]; + if (val > KALDI_LSTM_CLIP_UPPER) { + ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER; + } else if (val < KALDI_LSTM_CLIP_LOWER) { + ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER; + } else { + ptr_out[i * num_columns + j] = val; + } + } + } + break; + case kActCustom: + // break; + default:fprintf(stderr, "Unknown piecewise linear function type!\n"); + throw -1; + } +} + +#ifdef __cplusplus +extern "C" { // API uses C linkage so that it can be used by C and C++ applications +#endif + +#ifdef _NO_MKL_ +void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc) { + int i, j, k; + + if (Layout != CblasRowMajor) { + fprintf(stderr, "Only row major is supported in cblas_sgemm!\n"); + throw -1; + } + + if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) { + for (i = 0; i < M; i++) { + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? C[i * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[i * lda + k] * B[k * ldb + j]; + } + C[i * ldc + j] = sum; + } + } + } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) { + for (i = 0; i < M; i++) { + for (j = 0; j < N; j++) { + float sum; + sum = beta * C[i * ldc + j]; + for (k = 0; k < K; k++) { + sum += alpha * A[i * lda + k] * B[j * ldb + k]; + } + C[i * ldc + j] = sum; + } + } + } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) { + for (i = 0; i < M; i++) { + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? 
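+ /* beta == 1 accumulates into the existing C element; any other beta starts the dot product from zero in this reference fallback */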
C[i * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[k * lda + i] * B[k * ldb + j]; + } + C[i * ldc + j] = sum; + } + } + } else { + fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n"); + throw -1; + } +} +void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo, + const MKL_INT N, const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *X, const MKL_INT incX, + const float beta, float *Y, const MKL_INT incY) { + int i, j, k; + + if (Layout != CblasRowMajor) { + fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n"); + throw -1; + } + if (Uplo != CblasLower) { + fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n"); + throw -1; + } + if (K != 0) { + fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n"); + throw -1; + } + if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) { + for (i = 0; i < N; i++) { + Y[i] += A[i] * X[i]; + } + } else { + fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n"); + throw -1; + } +} +#endif // #ifdef _NO_MKL_ + +void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc, + const uint32_t *OutputList, const MKL_INT L) { + int i, j, k, l; + + if (Layout != CblasRowMajor) { + fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n"); + throw -1; + } + + if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) { + for (l = 0; l < L; l++) { + i = OutputList[l]; + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? C[l * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[i * lda + k] * B[k * ldb + j]; + } + C[l * ldc + j] = sum; + } + } + } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) { + for (i = 0; i < M; i++) { + for (l = 0; l < L; l++) { + float sum; + j = OutputList[l]; + sum = beta * C[i * ldc + l]; + for (k = 0; k < K; k++) { + sum += alpha * A[i * lda + k] * B[j * ldb + k]; + } + C[i * ldc + l] = sum; + } + } + } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) { + for (l = 0; l < L; l++) { + i = OutputList[l]; + for (j = 0; j < N; j++) { + float sum = (beta == 1.0) ? 
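+ /* only the output rows listed in OutputList are computed; l indexes the compacted result while i selects the corresponding row of op(A) */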
C[l * ldc + j] : 0; + for (k = 0; k < K; k++) { + sum += A[k * lda + i] * B[k * ldb + j]; + } + C[l * ldc + j] = sum; + } + } + } else { + fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n"); + throw -1; + } +} + +// C = [ A1 A2 ] * X + B +void sgemv_split(const uint32_t N, + const uint32_t K1, + const uint32_t K2, + const float *A1, + const float *A2, + const float *X, + const float *B, + float *C) { + uint32_t num_columns = K1 + K2; + uint32_t num_rows = N; + uint32_t i, j; + + for (i = 0; i < num_rows; i++) { + float sum = B[i]; + for (j = 0; j < K1; j++) { + sum += A1[j] * X[i * num_columns + j]; + } + for (j = K1; j < num_columns; j++) { + sum += A2[j - K1] * X[i * num_columns + j]; + } + C[i] = sum; + } +} + +#ifdef __cplusplus +} // end extern "C" +#endif diff --git a/inference-engine/src/gna_plugin/floatmath.h b/inference-engine/src/gna_plugin/floatmath.h new file mode 100644 index 00000000000000..ff9bf9938016ab --- /dev/null +++ b/inference-engine/src/gna_plugin/floatmath.h @@ -0,0 +1,71 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#ifndef _NO_MKL_ +#include +#include +#endif +// #include "types.h" +#include "dnn.h" + +#ifndef CBLAS_LAYOUT +#define CBLAS_LAYOUT CBLAS_ORDER +#endif + +#define CNN_MAX_POOL_SIZE 6 + +void CNNFilter32(intel_dnn_component_t *component); +void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type); + +#ifdef _NO_MKL_ +#ifndef _MKL_H_ +#define _MKL_H_ +typedef enum { CblasRowMajor = 101, CblasColMajor = 102 } CBLAS_LAYOUT; +typedef enum { CblasNoTrans = 111, CblasTrans = 112, CblasConjTrans = 113 } CBLAS_TRANSPOSE; +typedef enum { CblasUpper = 121, CblasLower = 122 } CBLAS_UPLO; +typedef enum { CblasNonUnit = 131, CblasUnit = 132 } CBLAS_DIAG; +typedef enum { CblasLeft = 141, CblasRight = 142 } CBLAS_SIDE; +typedef CBLAS_LAYOUT CBLAS_ORDER; /* this for backward compatibility with CBLAS_ORDER */ +#define MKL_INT int +#endif // #ifndef _MKL_H_ +#endif // #ifdef _NO_MKL_ + +#ifdef __cplusplus +extern "C" { // API uses C linkage so that it can be used by C and C++ applications +#endif + +#ifdef _NO_MKL_ +void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc); +void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo, + const MKL_INT N, const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *X, const MKL_INT incX, + const float beta, float *Y, const MKL_INT incY); +#endif // #ifdef _NO_MKL_ +void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA, + const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N, + const MKL_INT K, const float alpha, const float *A, + const MKL_INT lda, const float *B, const MKL_INT ldb, + const float beta, float *C, const MKL_INT ldc, + const uint32_t *OutputList, const MKL_INT L); +void sgemv_split(const uint32_t N, + const uint32_t K1, + const uint32_t K2, + const float *A1, + const float *A2, + const float *X, + const float *B, + float *C); + +#ifdef __cplusplus +} +#endif + diff --git a/inference-engine/src/gna_plugin/gna_allocator.hpp b/inference-engine/src/gna_plugin/gna_allocator.hpp new file mode 100644 index 00000000000000..ae62b1f76f6b1e --- /dev/null +++ 
b/inference-engine/src/gna_plugin/gna_allocator.hpp @@ -0,0 +1,33 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include "gna_device.hpp" +#include "polymorh_allocator.hpp" + +/** + * wrap GNA interface into c++ allocator friendly one + */ +class GNAAllocator { + std::reference_wrapper _device; + + public: + typedef uint8_t value_type; + + explicit GNAAllocator(GNADeviceHelper &device) : _device(device) { + } + uint8_t *allocate(std::size_t n) { + uint32_t granted = 0; + auto result = _device.get().alloc(n, &granted); + if (result == nullptr || granted == 0) { + throw std::bad_alloc(); + } + return result; + } + void deallocate(uint8_t *p, std::size_t n) { + _device.get().free(); + } +}; diff --git a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp index bce210b01d2f6e..fb9d2cc2ef152e 100644 --- a/inference-engine/src/gna_plugin/gna_api_wrapper.hpp +++ b/inference-engine/src/gna_plugin/gna_api_wrapper.hpp @@ -1,22 +1,11 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#if defined __INTEL_COMPILER || defined _MSC_VER -#include -#else -#include -#endif #include #include "gna_plugin_log.hpp" - -#if GNA_LIB_VER == 2 -#include -#include -#endif - namespace GNAPluginNS { /** @@ -27,62 +16,9 @@ template class CPPWrapper { }; -#if GNA_LIB_VER == 2 -template <> -class CPPWrapper { - public: - Gna2Model obj; - - CPPWrapper() { - obj.NumberOfOperations = 0; - obj.Operations = nullptr; - } - - /** - * creates nnet structure of n layers - * @param n - number of layers - */ - explicit CPPWrapper(size_t n) { - if (n == 0) { - THROW_GNA_EXCEPTION << "Can't allocate array of intel_nnet_layer_t objects of zero length"; - } - obj.Operations = reinterpret_cast(gnaUserAllocator(n * sizeof(Gna2Operation))); - if (obj.Operations == nullptr) { - THROW_GNA_EXCEPTION << "out of memory in while allocating "<< n << " GNA layers"; - } - obj.NumberOfOperations = n; - for (int i = 0; i < obj.NumberOfOperations; i++) { - obj.Operations[i].Type = Gna2OperationTypeNone; - obj.Operations[i].Operands = nullptr; - obj.Operations[i].NumberOfOperands = 0; - obj.Operations[i].Parameters = nullptr; - obj.Operations[i].NumberOfParameters = 0; - } - } - ~CPPWrapper() { - if (obj.Operations != nullptr) { - for (int i = 0; i < obj.NumberOfOperations; i++) { - freeGna2Operation(obj.Operations[i]); - } - gnaUserFree(obj.Operations); - obj.Operations = nullptr; - } - obj.NumberOfOperations = 0; - } - Gna2Model * operator ->() { - return &obj; - } - Gna2Model * operator *() { - return &obj; - } - operator Gna2Model &() { - return *this; - } -}; -#else template <> class CPPWrapper { -public: + public: intel_nnet_type_t obj; CPPWrapper() { @@ -96,18 +32,14 @@ class CPPWrapper { * @param n - number of layers */ explicit CPPWrapper(size_t n) { - if (n == 0) { - THROW_GNA_EXCEPTION << "Can't allocate array of intel_nnet_layer_t objects of zero length"; - } obj.pLayers = reinterpret_cast(_mm_malloc(n * sizeof(intel_nnet_layer_t), 64)); if (obj.pLayers == nullptr) { - THROW_GNA_EXCEPTION << "out of memory in while allocating " << n << " GNA layers"; + THROW_GNA_EXCEPTION << "out of memory in while allocating "<< n << " GNA layers"; } obj.nLayers = n; for (int i = 0; i < obj.nLayers; i++) { obj.pLayers[i].pLayerStruct = nullptr; } - obj.nGroup = 0; } ~CPPWrapper() { for (int i = 0; i < obj.nLayers; i++) { @@ -123,10 +55,9 @@ 
class CPPWrapper { intel_nnet_type_t * operator *() { return &obj; } - operator intel_nnet_type_t &() { + operator intel_nnet_type_t &() { return *this; } }; -#endif -} // namespace GNAPluginNS +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_device.cpp b/inference-engine/src/gna_plugin/gna_device.cpp index aec8699036c756..3936bc89b4410e 100644 --- a/inference-engine/src/gna_plugin/gna_device.cpp +++ b/inference-engine/src/gna_plugin/gna_device.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,301 +7,62 @@ #include #include #include -#include -#if GNA_LIB_VER == 2 -#include "gna_api_wrapper.hpp" -#include "gna2-device-api.h" -#include "gna2-inference-api.h" -#include "gna2-instrumentation-api.h" -#include "gna2-memory-api.h" -#include "gna2_model_export_helper.hpp" -#else #include "gna-api-status.h" #include "gna-api.h" -#endif #include "details/ie_exception.hpp" #include "gna_plugin_log.hpp" +#include "gna/gna_config.hpp" uint8_t* GNADeviceHelper::alloc(uint32_t size_requested, uint32_t *size_granted) { - void * memPtr; -#if GNA_LIB_VER == 1 - memPtr = GNAAlloc(nGNAHandle, size_requested, size_granted); -#else - const auto status = Gna2MemoryAlloc(size_requested, size_granted, &memPtr); - checkGna2Status(status); -#endif - if (memPtr == nullptr) { - THROW_GNA_EXCEPTION << "GNAAlloc failed to allocate memory. Requested: " << size_requested << " Granted: " << *(size_granted); - } - dumpXNNROPtr = memPtr; - dumpXNNROSize = *size_granted; - return static_cast(memPtr); + return reinterpret_cast(GNAAlloc(nGNAHandle, size_requested, size_granted)); } -void GNADeviceHelper::free(void * ptr) { -#if GNA_LIB_VER == 1 - GNAFree(nGNAHandle); -#else - const auto status = Gna2MemoryFree(ptr); - checkGna2Status(status); -#endif +void GNADeviceHelper::propagateSync(const intel_nnet_type_t *pNeuralNetwork, + const uint32_t *pActiveIndices, + uint32_t nActiveIndices) { + wait(propagate(pNeuralNetwork, pActiveIndices, nActiveIndices)); } -#if GNA_LIB_VER == 1 uint32_t GNADeviceHelper::propagate(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices) { uint32_t reqId; - nGNAStatus = GNAPropagateForward(nGNAHandle, pNeuralNetwork, pActiveIndices, nActiveIndices, &reqId, nGNAProcType); checkStatus(); return reqId; } -#else -void GNADeviceHelper::setUpActiveList(const uint32_t requestConfigId, uint32_t layerIndex, uint32_t* ptr_active_indices, uint32_t num_active_indices) { - const auto status = Gna2RequestConfigEnableActiveList(requestConfigId, layerIndex, num_active_indices, ptr_active_indices); - checkGna2Status(status); -} -void GNADeviceHelper::propagateSync(const uint32_t requestConfigId) { - wait(propagate(requestConfigId)); -} - -uint32_t GNADeviceHelper::propagate(const uint32_t requestConfigId) { - uint32_t reqId; - const auto status = Gna2RequestEnqueue(requestConfigId, &reqId); - checkGna2Status(status); - return reqId; -} - -uint32_t GNADeviceHelper::createModel(const Gna2Model& gnaModel) const { - uint32_t modelId; - const auto status = Gna2ModelCreate(nGnaDeviceIndex, &gnaModel, &modelId); - - checkGna2Status(status, gnaModel); - return modelId; -} - -void GNADeviceHelper::releseModel(const uint32_t model_id) { - const auto status = Gna2ModelRelease(model_id); - checkGna2Status(status); -} - -uint32_t GNADeviceHelper::createRequestConfig(const uint32_t model_id) { - uint32_t reqConfId; - auto 
status = Gna2RequestConfigCreate(model_id, &reqConfId); - checkGna2Status(status); - status = Gna2RequestConfigSetAccelerationMode(reqConfId, gna2AccelerationMode); - checkGna2Status(status); - if (gna2HwConsistency != Gna2DeviceVersionSoftwareEmulation) { - status = Gna2RequestConfigEnableHardwareConsistency(reqConfId, gna2HwConsistency); - checkGna2Status(status); - } - status = Gna2InstrumentationConfigAssignToRequestConfig(instrumentationConfigId, reqConfId); - checkGna2Status(status); - - return reqConfId; -} - -void GNADeviceHelper::checkGna2Status(Gna2Status status, const Gna2Model& gnaModel) { - if (!Gna2StatusIsSuccessful(status)) { - std::vector gna2StatusBuffer(1024); - const auto s = Gna2StatusGetMessage(status, gna2StatusBuffer.data(), gna2StatusBuffer.size()); - if (!Gna2StatusIsSuccessful(s)) - snprintf(gna2StatusBuffer.data(), gna2StatusBuffer.size(), "Gna2StatusGetMessage(%d) returned (%d)", - static_cast(status), static_cast(s)); - if (status == Gna2StatusDeviceIngoingCommunicationError || - status == Gna2StatusDeviceOutgoingCommunicationError) { - THROW_GNA_EXCEPTION << "Unsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ", consider updating the GNA driver"; - } - - Gna2ModelError error; - Gna2ModelGetLastError(&error); - - std::stringstream ss; - ss << "\n GNA Library Error:\n"; - const Gna2ItemType type = error.Source.Type; - const std::string errorType = errorTypes.find(type) != errorTypes.end() - ? errorTypes.at(type) - : "Unknown Error Type"; - - ss << " Type (" << std::to_string(type) << "): " << errorType << "\n"; - - if (error.Source.OperationIndex != GNA2_DISABLED) { - const Gna2OperationType opTypeIndex = gnaModel.Operations[error.Source.OperationIndex].Type; - const std::string operationType = operationTypes.find(opTypeIndex) != operationTypes.end() - ? operationTypes.at(opTypeIndex) - : "Unknown Operation Type"; - const std::string operandType = operandTypes.find({ opTypeIndex, error.Source.OperandIndex }) != operandTypes.end() - ? operandTypes.at({ opTypeIndex, error.Source.OperandIndex }) - : "Unknown Operand Type"; - - ss << " OperationIndex (" << std::to_string(error.Source.OperationIndex) << "): " - << operationType << "\n"; - ss << " OperandIndex(" << std::to_string(error.Source.OperandIndex) << "): " - << operandType << "\n"; - ss << " ParamIndex (" << std::to_string(error.Source.ParameterIndex) << ")\n"; - ss << " DimIndex (" << std::to_string(error.Source.ShapeDimensionIndex) << ")\n"; - } - - const Gna2ErrorType reason = error.Reason; - const std::string errorReason = errorReasons.find(reason) != errorReasons.end() - ? 
errorReasons.at(reason) - : "Unknown Error Reason"; - ss << " Reason (" << std::to_string(reason) << "): " << errorReason << "\n"; - ss << " Value (0x" << std::hex << std::to_string(error.Value) << ")"; - - THROW_GNA_EXCEPTION << "\nUnsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ss.str(); - } -} - -void GNADeviceHelper::checkGna2Status(Gna2Status status) { - if (!Gna2StatusIsSuccessful(status)) { - std::vector gna2StatusBuffer(1024); - const auto s = Gna2StatusGetMessage(status, gna2StatusBuffer.data(), gna2StatusBuffer.size()); - if (!Gna2StatusIsSuccessful(s)) - snprintf(gna2StatusBuffer.data(), gna2StatusBuffer.size(), "Gna2StatusGetMessage(%d) returned (%d)", - static_cast(status), static_cast(s)); - if (status == Gna2StatusDeviceIngoingCommunicationError || - status == Gna2StatusDeviceOutgoingCommunicationError) { - THROW_GNA_EXCEPTION << "Unsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data() << ", consider updating the GNA driver"; - } - THROW_GNA_EXCEPTION << "Unsuccessful Gna2Status: (" << status << ") " << gna2StatusBuffer.data(); - } -} - -const std::map GNADeviceHelper::errorTypes = { - {Gna2ItemTypeNone, "Model context is not applicable or unnecessary"}, - {Gna2ItemTypeModelNumberOfOperations, "Gna2Model::NumberOfOperations"}, - {Gna2ItemTypeModelOperations, "Gna2Model::Operations array"}, - {Gna2ItemTypeOperationType, "Gna2Model::Operations[x]->Gna2Operation::Type"}, - {Gna2ItemTypeOperationOperands, "Gna2Model::Operations[x]->Gna2Operation::Operands array"}, - {Gna2ItemTypeOperationNumberOfOperands, "Gna2Model::Operations[x]->Gna2Operation::NumberOfOperands"}, - {Gna2ItemTypeOperationParameters, "Gna2Model::Operations[x]->Gna2Operation::Parameters array"}, - {Gna2ItemTypeOperationNumberOfParameters, "Gna2Model::Operations[x]->Gna2Operation::NumberOfParameters"}, - {Gna2ItemTypeOperandMode, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Mode"}, - {Gna2ItemTypeOperandLayout, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Layout"}, - {Gna2ItemTypeOperandType, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Type"}, - {Gna2ItemTypeOperandData, "Gna2Model::Operations[x]->Gna2Operation::Operands[y]->Gna2Tensor::Data"}, - {Gna2ItemTypeParameter, "Gna2Model::Operations[x]->Gna2Operation::Parameters[z]->Parameter, can be of type Gna2Shape, enumeration or integer"}, - {Gna2ItemTypeShapeNumberOfDimensions, "Gna2Model::Operations[x]->{Gna2Tensor}, Parameter}->Gna2Shape::NumberOfDimensions"}, - {Gna2ItemTypeShapeDimensions, "Gna2Model::Operations[x]->{Gna2Tensor}, Parameter}->Gna2Shape::Dimensions"}, - {Gna2ItemTypeInternal, "Internal model item, that is a derivative of other model parameters"} -}; - -const std::map GNADeviceHelper::errorReasons = { - { Gna2ErrorTypeNone, "No error detected"}, - { Gna2ErrorTypeNotTrue, "Item value was expected to be true"}, - { Gna2ErrorTypeNotFalse, "Item value was expected to be false"}, - { Gna2ErrorTypeNullNotAllowed, "Item value was expected to be not null"}, - { Gna2ErrorTypeNullRequired, "Item value was expected to be null"}, - { Gna2ErrorTypeBelowRange, "Item value was below supported range"}, - { Gna2ErrorTypeAboveRange, "Item value was above supported range"}, - { Gna2ErrorTypeNotEqual, "Item value was not equal supported one"}, - { Gna2ErrorTypeNotGtZero, "Item value was below zero"}, - { Gna2ErrorTypeNotZero, "Item value was not equal zero"}, - { Gna2ErrorTypeNotOne, "Item value was not equal one"}, - { Gna2ErrorTypeNotInSet, "Item value 
was not in supported set of values"}, - { Gna2ErrorTypeNotMultiplicity, "Item value was not multiple of supported value"}, - { Gna2ErrorTypeNotSuccess, "Item value was invalid, no detailed information available"}, - { Gna2ErrorTypeNotAligned, "Item value was not aligned to supported value"}, - { Gna2ErrorTypeArgumentMissing, "Some operation argument was not provided"}, - { Gna2ErrorTypeArgumentInvalid, "Given operation argument was invalid or unexpected"}, - { Gna2ErrorTypeRuntime, "Runtime error occurred during model creation"}, - { Gna2ErrorTypeOther, "Unable to determine the root cause of the issue"} -}; - -const std::map GNADeviceHelper::operationTypes = { - { Gna2OperationTypeNone, "None"}, - { Gna2OperationTypeConvolution, "Convolution"}, - { Gna2OperationTypeCopy, "Copy"}, - { Gna2OperationTypeFullyConnectedAffine, "FullyConnectedAffine"}, - { Gna2OperationTypeElementWiseAffine, "ElementWiseAffine"}, - { Gna2OperationTypeGmm, "GMM"}, - { Gna2OperationTypeRecurrent, "Recurrent"}, - { Gna2OperationTypeTransposition, "Transpose"}, - { Gna2OperationTypeThreshold, "Threshold"} -}; - -const std::map , const std::string> GNADeviceHelper::operandTypes = { - {{Gna2OperationTypeConvolution, 0}, "Input"}, - {{Gna2OperationTypeConvolution, 1}, "Output"}, - {{Gna2OperationTypeConvolution, 2}, "Filters"}, - {{Gna2OperationTypeConvolution, 3}, "Biases"}, - {{Gna2OperationTypeConvolution, 4}, "Activation"}, - {{Gna2OperationTypeCopy, 0}, "Input"}, - {{Gna2OperationTypeCopy, 1}, "Output"}, - {{Gna2OperationTypeFullyConnectedAffine, 0}, "Input"}, - {{Gna2OperationTypeFullyConnectedAffine, 1}, "Output"}, - {{Gna2OperationTypeFullyConnectedAffine, 2}, "Weights"}, - {{Gna2OperationTypeFullyConnectedAffine, 3}, "Biases"}, - {{Gna2OperationTypeFullyConnectedAffine, 4}, "Activation"}, - {{Gna2OperationTypeFullyConnectedAffine, 5}, "WeightScaleFactors"}, - {{Gna2OperationTypeElementWiseAffine, 0}, "Input"}, - {{Gna2OperationTypeElementWiseAffine, 1}, "Output"}, - {{Gna2OperationTypeElementWiseAffine, 2}, "Weights"}, - {{Gna2OperationTypeElementWiseAffine, 3}, "Biases"}, - {{Gna2OperationTypeElementWiseAffine, 4}, "Activation"}, - {{Gna2OperationTypeGmm, 0}, "Input"}, - {{Gna2OperationTypeGmm, 1}, "Output"}, - {{Gna2OperationTypeGmm, 2}, "Means"}, - {{Gna2OperationTypeGmm, 3}, "InverseCovariances"}, - {{Gna2OperationTypeGmm, 4}, "Constants"}, - {{Gna2OperationTypeRecurrent, 0}, "Input"}, - {{Gna2OperationTypeRecurrent, 1}, "Output"}, - {{Gna2OperationTypeRecurrent, 2}, "Weights"}, - {{Gna2OperationTypeRecurrent, 3}, "Biases"}, - {{Gna2OperationTypeRecurrent, 4}, "Activation"}, - {{Gna2OperationTypeTransposition, 0}, "Input"}, - {{Gna2OperationTypeTransposition, 1}, "Output"}, - {{Gna2OperationTypeThreshold, 0}, "Input"}, - {{Gna2OperationTypeThreshold, 1}, "Output"} -}; -#endif void GNADeviceHelper::wait(uint32_t reqId) { -#if GNA_LIB_VER == 2 - const auto status = Gna2RequestWait(reqId, GNA_TIMEOUT); - checkGna2Status(status); -#else if (isPerformanceMeasuring) { nGNAStatus = GNAWaitPerfRes(nGNAHandle, GNA_TIMEOUT, reqId, &nGNAPerfResults); + updateGnaPerfCounters(); } else { - nGNAStatus = GNAWait(nGNAHandle, GNA_TIMEOUT, reqId); + nGNAStatus = GNAWait(nGNAHandle, 1000000, reqId); } checkStatus(); -#endif - updateGnaPerfCounters(); } -#if GNA_LIB_VER == 1 GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices) { -#else -GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const uint32_t modelId) { -#endif 
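+    // dumpXnn requests a serialized copy of the network from the library; the returned header and blob are kept in DumpResult so the caller can write them out (e.g. for model export)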
DumpResult r; + intel_gna_status_t gna_status; -#if GNA_LIB_VER == 1 if (!pNeuralNetwork) { - THROW_GNA_EXCEPTION << "GNADumpXnn got invalid NeuralNetwork parameter \n"; + THROW_GNA_EXCEPTION<< "GNADumpXnn got invalid NeuralNetwork parameter \n"; } r.model.reset(GNADumpXnn(pNeuralNetwork, pActiveIndices, nActiveIndices, &r.header, &nGNAStatus, - [](size_t count)-> void* {return new char[count]();}), - [](void * ptr) {::operator delete[](ptr);}); + [](size_t count)-> void* {return ::operator new(count);}), + [](void * ptr) {::operator delete(ptr);}); + checkStatus(); -#else - r.model.reset( - ExportSueLegacyUsingGnaApi2(modelId, &r.header), - gnaUserFree); -#endif if (r.model == nullptr) { THROW_GNA_EXCEPTION << "GNADumpXnn returned nullptr"; @@ -310,88 +71,31 @@ GNADeviceHelper::DumpResult GNADeviceHelper::dumpXnn(const uint32_t modelId) { return r; } -#if GNA_LIB_VER == 2 - -void GNADeviceHelper::dumpXnnForDeviceVersion( - const uint32_t modelId, - std::ostream & outStream, - const Gna2DeviceVersion targetDeviceVersion) { - - Gna2ModelSueCreekHeader sueHeader; - auto ptr = ExportSueLegacyUsingGnaApi2(modelId, &sueHeader); - gnaUserFree(ptr); - - ExportGnaDescriptorPartiallyFilled(sueHeader.NumberOfLayers, outStream); - - ExportLdForDeviceVersion(modelId, outStream, targetDeviceVersion); - if (dumpXNNROPtr == nullptr) { - THROW_GNA_EXCEPTION << "Bad RO pointer (nullptr)"; - } - outStream.write(static_cast(dumpXNNROPtr), dumpXNNROSize); - - // TODO: GNA2: remove - outStream.write("Gna2ModelSueCreekHeader", 24); - outStream.write(reinterpret_cast(&sueHeader), sizeof(sueHeader)); -} -#endif - -#if GNA_LIB_VER == 1 void GNADeviceHelper::checkStatus() const { if ((nGNAStatus != GNA_NOERROR) && (nGNAStatus != GNA_SSATURATE)) { THROW_GNA_EXCEPTION << "Bad GNA status " << nGNAStatus << ", " << GNAStatusName[nGNAStatus]; } } -#endif void GNADeviceHelper::open(uint8_t n_threads) { -#if GNA_LIB_VER == 1 nGNAHandle = GNADeviceOpenSetThreads(&nGNAStatus, n_threads); + checkStatus(); -#else - auto status = Gna2DeviceGetVersion(nGnaDeviceIndex, &detectedGnaDevVersion); - checkGna2Status(status); - if (gna2AccelerationMode == Gna2AccelerationModeHardware && - detectedGnaDevVersion == Gna2DeviceVersionSoftwareEmulation) { - gnalog() << "GNA Device not detected, consider using other mode of acceleration"; - } - status = Gna2DeviceOpen(nGnaDeviceIndex); - checkGna2Status(status); - // TODO: GNA2: uncomment when scratchpad repaired - // status = Gna2DeviceSetNumberOfThreads(nGnaDeviceIndex, n_threads); - // checkGna2Status(status); -#endif - deviceOpened = true; } void GNADeviceHelper::close() { -#if GNA_LIB_VER == 1 GNADeviceClose(nGNAHandle); nGNAHandle = 0; -#else - const auto status = Gna2DeviceClose(nGnaDeviceIndex); - checkGna2Status(status); -#endif - deviceOpened = false; } void GNADeviceHelper::setOMPThreads(uint8_t const n_threads) { -#if GNA_LIB_VER == 1 gmmSetThreads(n_threads); -#else - const auto status = Gna2DeviceSetNumberOfThreads(nGnaDeviceIndex, n_threads); - checkGna2Status(status); -#endif } void GNADeviceHelper::updateGnaPerfCounters() { - if (!isPerformanceMeasuring) - return; -#if GNA_LIB_VER == 2 - instrumentationTotal[0] = instrumentationResults[0]; - instrumentationTotal[1] = instrumentationResults[1]; -#else nGNAPerfResultsTotal.hw.stall = nGNAPerfResults.hw.stall; nGNAPerfResultsTotal.hw.total = nGNAPerfResults.hw.total; + nGNAPerfResultsTotal.lib.submit = nGNAPerfResults.lib.submit; nGNAPerfResultsTotal.lib.preprocess = nGNAPerfResults.lib.preprocess; 
nGNAPerfResultsTotal.lib.process = nGNAPerfResults.lib.process; @@ -406,26 +110,16 @@ void GNADeviceHelper::updateGnaPerfCounters() { nGNAPerfResultsTotal.drv.startHW = nGNAPerfResults.drv.startHW; nGNAPerfResultsTotal.drv.scoreHW = nGNAPerfResults.drv.scoreHW; nGNAPerfResultsTotal.drv.intProc = nGNAPerfResults.drv.intProc; -#endif } void GNADeviceHelper::getGnaPerfCounters(std::map& retPerfCounters) { InferenceEngine::InferenceEngineProfileInfo info; info.status = InferenceEngine::InferenceEngineProfileInfo::EXECUTED; - info.cpu_uSec = 0; - info.execution_index = 0; - info.realTime_uSec = 0; + // Hardware -#if GNA_LIB_VER == 1 info.realTime_uSec = nGNAPerfResultsTotal.hw.total; -#else - info.realTime_uSec = instrumentationTotal[0]; -#endif retPerfCounters["1.1 Total scoring time in HW"] = info; -#if GNA_LIB_VER == 1 + info.realTime_uSec = nGNAPerfResultsTotal.hw.stall; -#else - info.realTime_uSec = instrumentationTotal[1]; -#endif retPerfCounters["1.2 Stall scoring time in HW"] = info; } diff --git a/inference-engine/src/gna_plugin/gna_device.hpp b/inference-engine/src/gna_plugin/gna_device.hpp index f122445907a22a..782821137dbbac 100644 --- a/inference-engine/src/gna_plugin/gna_device.hpp +++ b/inference-engine/src/gna_plugin/gna_device.hpp @@ -1,82 +1,38 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include +#include "gna-api-dumper.h" +#include "gna-api-instrumentation.h" +#include "ie_common.h" #include #include #include #include -#include - -#if GNA_LIB_VER == 2 -#include "gna2-common-api.h" -#include "gna2-inference-api.h" -#include "gna2-instrumentation-api.h" - -#include "gna2-memory-api.h" -#include "gna2-model-api.h" -#include "gna2-model-suecreek-header.h" -#else -#include -#include "gna-api-dumper.h" -#include "gna-api-instrumentation.h" -#endif - - /** * holds gna - style handle in RAII way */ class GNADeviceHelper { -#if GNA_LIB_VER == 1 intel_gna_status_t nGNAStatus = GNA_NOERROR; intel_gna_handle_t nGNAHandle = 0; intel_gna_proc_t nGNAProcType = GNA_AUTO; intel_gna_perf_t nGNAPerfResults; intel_gna_perf_t nGNAPerfResultsTotal; -#else - uint32_t nGnaDeviceIndex = 0; - Gna2AccelerationMode gna2AccelerationMode = Gna2AccelerationModeAuto; - Gna2DeviceVersion gna2HwConsistency = Gna2DeviceVersionSoftwareEmulation; - Gna2DeviceVersion detectedGnaDevVersion = Gna2DeviceVersionSoftwareEmulation; - - static const uint32_t TotalGna2InstrumentationPoints = 2; - Gna2InstrumentationPoint gna2InstrumentationPoints[TotalGna2InstrumentationPoints] = { - Gna2InstrumentationPointHwTotalCycles, - Gna2InstrumentationPointHwStallCycles }; - - uint64_t instrumentationResults[TotalGna2InstrumentationPoints] = {}; - uint64_t instrumentationTotal[TotalGna2InstrumentationPoints] = {}; - uint32_t instrumentationConfigId = 0; - -#define MAX_TIMEOUT 500000 -#endif const uint32_t GNA_TIMEOUT = MAX_TIMEOUT; - bool isPerformanceMeasuring = false; - bool deviceOpened = false; -public: -#if GNA_LIB_VER == 1 + bool isPerformanceMeasuring; + + public: explicit GNADeviceHelper(intel_gna_proc_t proc_type = GNA_AUTO, uint8_t lib_async_n_threads = 1, bool use_openmp = false, bool isPerformanceMeasuring = false) : nGNAProcType(proc_type), isPerformanceMeasuring(isPerformanceMeasuring) { -#else - explicit GNADeviceHelper(Gna2AccelerationMode gna2accMode = Gna2AccelerationModeAuto, - Gna2DeviceVersion gna2HwConsistency = Gna2DeviceVersionSoftwareEmulation, - uint8_t lib_async_n_threads = 1, - bool use_openmp = 
false, - bool isPerformanceMeasuring = false) : - gna2AccelerationMode(gna2accMode), - gna2HwConsistency(gna2HwConsistency), - isPerformanceMeasuring(isPerformanceMeasuring) { -#endif - open(lib_async_n_threads); initGnaPerfCounters(); + open(lib_async_n_threads); if (use_openmp) { uint8_t num_cores = std::thread::hardware_concurrency(); @@ -84,17 +40,12 @@ class GNADeviceHelper { } } - GNADeviceHelper(const GNADeviceHelper&) = delete; - GNADeviceHelper& operator= (const GNADeviceHelper&) = delete; ~GNADeviceHelper() { - if (deviceOpened) { - close(); - } + close(); } uint8_t *alloc(uint32_t size_requested, uint32_t *size_granted); -#if GNA_LIB_VER == 1 void propagateSync(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices); @@ -102,80 +53,39 @@ class GNADeviceHelper { uint32_t propagate(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices); -#else - void setUpActiveList(unsigned req_config_id, uint32_t layerIndex, uint32_t* ptr_active_indices, uint32_t num_active_indices); - void propagateSync(const uint32_t requestConfigId); - uint32_t propagate(const uint32_t requestConfigId); -#if GNA_LIB_VER == 2 - uint32_t createModel(const Gna2Model& gnaModel) const; -#else - uint32_t createModel(const intel_nnet_type_t& intel_nnet_type); -#endif - void releseModel(const uint32_t model_id); - uint32_t createRequestConfig(const uint32_t model_id); - bool hasGnaHw() const { - return Gna2DeviceVersionSoftwareEmulation != detectedGnaDevVersion; - } - static void checkGna2Status(Gna2Status status); - static void checkGna2Status(Gna2Status status, const Gna2Model& gnaModel); -#endif + void wait(uint32_t id); + struct DumpResult { -#if GNA_LIB_VER == 2 - Gna2ModelSueCreekHeader header; -#else intel_gna_model_header header; -#endif std::shared_ptr model; }; - const void * dumpXNNROPtr = nullptr; - uint32_t dumpXNNROSize = 0; - -#if GNA_LIB_VER == 1 DumpResult dumpXnn(const intel_nnet_type_t *pNeuralNetwork, const uint32_t *pActiveIndices, uint32_t nActiveIndices); - intel_gna_status_t getGNAStatus() const noexcept { - return nGNAStatus; - } -#else - DumpResult dumpXnn(const uint32_t modelId); - void dumpXnnForDeviceVersion(const uint32_t modelId, - std::ostream & outStream, - Gna2DeviceVersion targetDeviceVersion); -#endif - void free(void * ptr); + void free() { + GNAFree(nGNAHandle); + } void updateGnaPerfCounters(); void getGnaPerfCounters(std::map& retPerfCounters); + private: void open(uint8_t const n_threads); void close(); -#if GNA_LIB_VER == 1 + void checkStatus() const; -#else - static const std::map errorTypes; - static const std::map errorReasons; - static const std::map operationTypes; - static const std::map , const std::string > operandTypes; -#endif + void setOMPThreads(uint8_t const n_threads); void initGnaPerfCounters() { -#if GNA_LIB_VER == 1 nGNAPerfResults = {{0, 0, 0, 0, 0, 0, 0}, {0, 0}, {0, 0, 0}, {0, 0}}; nGNAPerfResultsTotal = {{0, 0, 0, 0, 0, 0, 0}, {0, 0}, {0, 0, 0}, {0, 0}}; -#else - const auto status = Gna2InstrumentationConfigCreate(TotalGna2InstrumentationPoints, - gna2InstrumentationPoints, - instrumentationResults, - &instrumentationConfigId); - checkGna2Status(status); -#endif } -}; // NOLINT +}; + diff --git a/inference-engine/src/gna_plugin/gna_executable_network.hpp b/inference-engine/src/gna_plugin/gna_executable_network.hpp index 90f01ff17cbfef..1230624fb15530 100644 --- a/inference-engine/src/gna_plugin/gna_executable_network.hpp +++ 
b/inference-engine/src/gna_plugin/gna_executable_network.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,11 +8,10 @@ #include #include -#include #include #include "gna_infer_request.hpp" #include "gna_plugin.hpp" -#include +#include #include namespace GNAPluginNS { @@ -21,26 +20,16 @@ class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafe std::shared_ptr plg; public: - GNAExecutableNetwork(const std::string &aotFileName, std::shared_ptr plg) - : plg(plg) { + GNAExecutableNetwork(const std::string &aotFileName, const std::map &config) : + plg(std::make_shared(config)) { plg->ImportNetwork(aotFileName); _networkInputs = plg->GetInputs(); _networkOutputs = plg->GetOutputs(); } - GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, std::shared_ptr plg) - : plg(plg) { - InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::I64, InferenceEngine::Precision::I32); - InferenceEngine::NetPass::ConvertPrecision(network, InferenceEngine::Precision::U64, InferenceEngine::Precision::I32); - plg->LoadNetwork(network); - } - - GNAExecutableNetwork(const std::string &aotFileName, const std::map &config) - : GNAExecutableNetwork(aotFileName, std::make_shared(config)) { - } - GNAExecutableNetwork(InferenceEngine::ICNNNetwork &network, const std::map &config) - : GNAExecutableNetwork(network, std::make_shared(config)) { + : plg(std::make_shared(config)) { + plg->LoadNetwork(network); } InferenceEngine::AsyncInferRequestInternal::Ptr @@ -60,24 +49,5 @@ class GNAExecutableNetwork : public InferenceEngine::ExecutableNetworkThreadSafe void Export(const std::string &modelFileName) override { plg->Export(modelFileName); } - - using ExecutableNetworkInternal::Export; - - void ExportImpl(std::ostream&) override { - THROW_IE_EXCEPTION << NOT_IMPLEMENTED_str; - } - - void GetConfig(const std::string &name, - InferenceEngine::Parameter &result, - InferenceEngine::ResponseDesc* /*resp*/) const override { - result = plg->GetConfig(name, {}); - } - - void GetMetric(const std::string& name, - InferenceEngine::Parameter& result, - InferenceEngine::ResponseDesc* /* resp */) const override { - result = plg->GetMetric(name, {}); - } }; - } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_helper.cpp b/inference-engine/src/gna_plugin/gna_helper.cpp index 729ae1b4fffdff..604828c339dd0e 100644 --- a/inference-engine/src/gna_plugin/gna_helper.cpp +++ b/inference-engine/src/gna_plugin/gna_helper.cpp @@ -1,9 +1,10 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // -// gna_helper.cpp : various GNA-related utility functions -// +#include "lstm.hpp" + +#define USING_GCC #define PROFILE #include @@ -12,10 +13,7 @@ #include #include #include -#include -#include "gna_plugin_log.hpp" - -#include "gna_lib_ver_selector.hpp" +#include "gna-api.h" #ifndef WIN32 #include @@ -77,10 +75,8 @@ void profilerRtcStartAccumulate(intel_gna_profiler_rtc *p) { // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &p->start); } void profilerRtcStopAccumulate(intel_gna_profiler_rtc *p) { + timespec diff; if (nullptr == p) return; -// TODO: consider removing dead code from this file - -// timespec diff; // clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &p->stop); // if ((p->stop.tv_nsec - p->start.tv_nsec)<0) { // diff.tv_sec = p->stop.tv_sec - p->start.tv_sec - 1; @@ -118,7 +114,7 @@ void PrintMatrixInt32(char 
*ptr_name, int32_t *ptr_matrix, int num_rows, int num } void PrintMatrixFloat32(char *ptr_name, float *ptr_matrix, int num_rows, int num_cols, int lda) { -#if (defined _WIN32 || defined _WIN64) && (_MSC_VER < 1900) +#if (_WIN32 || _WIN64) && (_MSC_VER < 1900) _set_output_format(_TWO_DIGIT_EXPONENT); #endif printf("%s: %dx%d lda %d\n", ptr_name, num_rows, num_cols, lda); @@ -228,7 +224,7 @@ uint32_t BufferOffsetFromAddress(std::vector &vBuffer, vo } std::string LayerName(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; std::string sKind; if (nKind == INTEL_AFFINE) { sKind = "affine"; @@ -246,7 +242,7 @@ std::string LayerName(intel_nnet_layer_t *pLayer) { } uint32_t NumInputs(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; uint32_t nInputs; if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) { nInputs = pLayer->nInputRows; @@ -262,7 +258,7 @@ uint32_t NumInputs(intel_nnet_layer_t *pLayer) { } uint32_t NumOutputs(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; uint32_t nOutputs; if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) { nOutputs = pLayer->nOutputRows; @@ -278,7 +274,7 @@ uint32_t NumOutputs(intel_nnet_layer_t *pLayer) { } uint32_t NumGroupSize(intel_nnet_layer_t *pLayer) { - const auto nKind = pLayer->nLayerKind; + intel_layer_kind_t nKind = pLayer->nLayerKind; uint32_t nGroupSize; if ((nKind == INTEL_AFFINE) || (nKind == INTEL_AFFINE_DIAGONAL)) { nGroupSize = pLayer->nOutputColumns; @@ -292,3 +288,162 @@ uint32_t NumGroupSize(intel_nnet_layer_t *pLayer) { } return (nGroupSize); } + +void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t *pNeuralNetwork) { + std::string sXmlFileName; + sXmlFileName.append(ptr_name); + sXmlFileName.append("/model.xml"); + std::ofstream xml_file(sXmlFileName.c_str(), std::ios::out); + if (xml_file.good()) { + std::vector vBuffer; + // find all the memory regions in the network + for (uint32_t layer = 0; layer < pNeuralNetwork->nLayers; layer++) { + intel_nnet_layer_t *pLayer = &pNeuralNetwork->pLayers[layer]; + intel_affine_layer_t *pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + uint32_t nPWLSegments = 0; + uint32_t nWeightWidth = 0; + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pInputs", + pLayer->pInputs, + pLayer->nBytesPerInput * pLayer->nInputColumns * pLayer->nInputRows); + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pOutputs", + pLayer->pOutputs, + pLayer->nBytesPerOutput * pLayer->nOutputColumns * pLayer->nOutputRows); + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pOutputsIntermediate", + pLayer->pOutputsIntermediate, + pLayer->nBytesPerIntermediateOutput * pLayer->nOutputColumns * pLayer->nOutputRows); + if ((pLayer->nLayerKind == INTEL_AFFINE) || (pLayer->nLayerKind == INTEL_AFFINE_DIAGONAL)) { + uint32_t nBytesWeights = + (pLayer->nLayerKind == INTEL_AFFINE) ? 
pAffineLayer->affine.nBytesPerWeight * pLayer->nInputRows + * pLayer->nOutputRows : pAffineLayer->affine.nBytesPerWeight * pLayer->nOutputRows; + nPWLSegments = pAffineLayer->pwl.nSegments; + nWeightWidth = pAffineLayer->affine.nBytesPerWeight; + AddBufferEntry(vBuffer, LayerName(pLayer), "pWeights", pAffineLayer->affine.pWeights, nBytesWeights); + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pBiases", + pAffineLayer->affine.pBiases, + pAffineLayer->affine.nBytesPerBias * pLayer->nOutputRows); + if (nPWLSegments > 0) { + AddBufferEntry(vBuffer, + LayerName(pLayer), + "pSegments", + pAffineLayer->pwl.pSegments, + sizeof(intel_pwl_segment_t) * nPWLSegments); + } + } else if (pLayer->nLayerKind == INTEL_INTERLEAVE) { + } else if (pLayer->nLayerKind == INTEL_DEINTERLEAVE) { + } else { + fprintf(stderr, "Error: layer kind not yet supported in ExportGnaNetworkAndrzej()!\n"); + exit(EXIT_FAILURE); + } + } + // write XML network description + xml_file << "\n"; + xml_file << "\n\n\n"; + xml_file << " \n"; + xml_file << " nGroup << "\">\n"; + for (uint32_t layer = 0; layer < pNeuralNetwork->nLayers; layer++) { + intel_nnet_layer_t *pLayer = &pNeuralNetwork->pLayers[layer]; + intel_affine_layer_t *pAffineLayer = reinterpret_cast(pLayer->pLayerStruct); + // below is hard-coded for the Google LSTM model -- it is only for debugging + std::string sClass = (layer < pNeuralNetwork->nLayers - 1) ? "LSTM_" : "DNN_"; + std::string sName; + uint32_t nGoogleLayer; + if (pNeuralNetwork->nGroup == 1) { + sName = (layer < pNeuralNetwork->nLayers - 1) ? intel_lstm_projected_layer_name[layer % NUM_LSTM_LAYERS] + : "final affine layer"; + nGoogleLayer = layer / NUM_LSTM_LAYERS; + } else if (pNeuralNetwork->nGroup == 4) { + sName = (layer < pNeuralNetwork->nLayers - 1) ? intel_lstm_projected_layer_g4_name[layer + % NUM_LSTM_G4_LAYERS] : "final affine layer"; + nGoogleLayer = layer / NUM_LSTM_G4_LAYERS; + } else { + sName = "affine"; sName + std::to_string(layer); + nGoogleLayer = layer; + // fprintf(stderr, "Error: unsupported grouping factor in ExportGnaNetworkAndrzej()!\n"); + // exit(EXIT_FAILURE); + } + xml_file << " \n"; + xml_file << " \n"; + xml_file << " " << NumInputs(pLayer) << "\n"; + xml_file << " pInputs) + << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pLayer->pInputs) << "\n"; + xml_file << " \n"; + xml_file << " \n"; + xml_file << " " << NumOutputs(pLayer) << "\n"; + xml_file << " pOutputs) + << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pLayer->pOutputs) << "\n"; + xml_file << " \n"; + if (pLayer->pOutputsIntermediate != NULL) { + xml_file << " \n"; + xml_file << " " << NumOutputs(pLayer) << "\n"; + xml_file << " pOutputsIntermediate) << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pLayer->pOutputsIntermediate) << "\n"; + xml_file << " \n"; + } + if ((pLayer->nLayerKind == INTEL_AFFINE) || (pLayer->nLayerKind == INTEL_AFFINE_DIAGONAL)) { + xml_file << " " << BufferNameFromAddress(vBuffer, pAffineLayer->affine.pWeights) + << "\n"; + xml_file << " affine.pBiases) << "\">"; + xml_file << BufferNameFromAddress(vBuffer, pAffineLayer->affine.pBiases) << "\n"; + if (pAffineLayer->pwl.nSegments > 0) { + xml_file << " " << BufferNameFromAddress(vBuffer, pAffineLayer->pwl.pSegments) + << "\n"; + } + } + xml_file << " \n\n"; + } + xml_file << " \n\n"; + xml_file.flush(); + + // write buffer list to XML and create data files + xml_file << " \n"; + for (uint32_t i = 0; i < vBuffer.size(); i++) { + std::string sName = ptr_name; + sName.append("/"); + 
sName.append(BufferNameFromAddress(vBuffer, vBuffer.at(i).pAddress)); + bool found = false; + for (uint32_t j = 0; j < i; j++) { + std::string sPrevName = BufferNameFromAddress(vBuffer, vBuffer.at(j).pAddress); + if (sPrevName.compare(sName) == 0) found = true; + } + if (!found) { + xml_file << " \n"; + xml_file << " " << sName << "\n"; + if (sName.compare(0, 4, "buf_") == 0) { + xml_file << " " << vBuffer.at(i).nBytes << "\n"; + } else { + std::string sFileName; + sFileName.append(sName); + sFileName.append(".dat"); + xml_file << " " << sFileName << "\n"; + std::ofstream data_file(sFileName.c_str(), std::ios::binary); + data_file.write(reinterpret_cast(vBuffer.at(i).pAddress), vBuffer.at(i).nBytes); + data_file.close(); + } + xml_file << " \n"; + } + } + xml_file << " \n"; + xml_file << "\n"; + xml_file << " \n"; + xml_file << " 65536\n"; + xml_file << " \n"; + xml_file << "\n"; + xml_file.close(); + } else { + fprintf(stderr, "Failed to open %s for writing!\n", ptr_name); + } +} diff --git a/inference-engine/src/gna_plugin/gna_infer_request.hpp b/inference-engine/src/gna_plugin/gna_infer_request.hpp index 17688f4688a5ce..ba8e99f7920237 100644 --- a/inference-engine/src/gna_plugin/gna_infer_request.hpp +++ b/inference-engine/src/gna_plugin/gna_infer_request.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -31,15 +31,8 @@ class GNAInferRequest : public InferenceEngine::AsyncInferRequestInternal { } // copy inputs blobs since we need to have them in separate address space to allow simultaneous infer requests - for (auto output : _networkOutputs) { - _outputs[output.first] = - plg->GetOutputBlob(output.first, output.second->getTensorDesc().getPrecision()); - } - - for (auto input : _networkInputs) { - _inputs[input.first] = - plg->GetInputBlob(input.first, input.second->getTensorDesc().getPrecision()); - } + _outputs[_networkOutputs.begin()->first] = plg->GetOutputBlob(networkOutputs.begin()->second->getPrecision()); + _inputs[_networkInputs.begin()->first] = plg->GetInputBlob(networkInputs.begin()->second->getInputPrecision()); } /** * @brief Infers specified input(s) in synchronous mode @@ -72,12 +65,7 @@ class GNAInferRequest : public InferenceEngine::AsyncInferRequestInternal { } InferenceEngine::StatusCode Wait(int64_t millis_timeout) override { - if (inferRequestIdx == -1) { - return InferenceEngine::INFER_NOT_STARTED; - } else if (millis_timeout < -1) { - THROW_IE_EXCEPTION << PARAMETER_MISMATCH_str; - } - + if (inferRequestIdx == -1) return InferenceEngine::INFER_NOT_STARTED; plg->Wait(inferRequestIdx); return InferenceEngine::OK; } diff --git a/inference-engine/src/gna_plugin/gna_layer_info.hpp b/inference-engine/src/gna_plugin/gna_layer_info.hpp new file mode 100644 index 00000000000000..7e6da438ef23a8 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_layer_info.hpp @@ -0,0 +1,206 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "inference_engine.hpp" +#include "details/caseless.hpp" +#include "ie_algorithm.hpp" + + +namespace GNAPluginNS { + +/** + * @brief detecting of const pointer for dynamic cast operations + * @tparam T + */ +template +struct is_const_pointer : public std::false_type{ +}; + +template +struct is_const_pointer : public std::true_type{ +}; + + +/** + * similar to type traits determined in standard library this trait provides details per layer type, with some attributes 
specific for GNA + * we don't need to have compile time performance for this yet + */ +class LayerInfo { + InferenceEngine::CNNLayer * layer; + +#define IS_VALID() if (nullptr == layer) return false + + public: + explicit LayerInfo(InferenceEngine::CNNLayer & layer) + : LayerInfo(&layer) { + } + explicit LayerInfo(const InferenceEngine::CNNLayerPtr & layer) + : LayerInfo(layer.get()) { + } + explicit LayerInfo(InferenceEngine::CNNLayer * layer) + : layer(layer) { + } + bool has16BOutput() const noexcept { + IS_VALID(); + static InferenceEngine::details::caseless_set layersWith16BOutputs = {"memory", "input", "split", "slice", "concat", "copy"}; + return layersWith16BOutputs.find(layer->type) != layersWith16BOutputs.end() || + isActivation() || + (isCrop() && !isCropAffined()); + } + bool has32BOutput() const noexcept { + IS_VALID(); + static InferenceEngine::details::caseless_set layersWith32BOutputs = + {"FullyConnected", "InnerProduct", "Eltwise", "ScaleShift", "Convolution", "Pooling"}; + return (layersWith32BOutputs.find(layer->type) != layersWith32BOutputs.end()) || + (isCrop() && isCropAffined()); + } + static bool isBatchSizeConstrained(const std::string name) { + static InferenceEngine::details::caseless_set layersWithConstrains = {"memory", "convolution"}; + return layersWithConstrains.find(name) != layersWithConstrains.end(); + } + bool isActivation() const noexcept { + IS_VALID(); + static InferenceEngine::details::caseless_set activations = {"clamp", "sigmoid", "identity", "relu", "leakyrelu", "tanh", "prelu"}; + return activations.find(layer->type) != activations.end(); + } + bool isRelu() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "relu"); + } + bool isConvolution() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "convolution"); + } + bool isPower() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "power"); + } + bool has32BInput() const noexcept { + IS_VALID(); + return isActivation() || isPooling(); + } + bool isInput() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "input"); + } + bool isEltwise() const noexcept { + IS_VALID(); + return nullptr != as(); + } + bool isEltwiseSum() const noexcept { + IS_VALID(); + if (!isEltwise()) return false; + return dynamic_cast(layer)->_operation == + InferenceEngine::EltwiseLayer::Sum; + } + bool isEltwiseMul() const noexcept { + IS_VALID(); + if (!isEltwise()) return false; + return dynamic_cast(layer)->_operation == + InferenceEngine::EltwiseLayer::Prod; + } + bool isIdentity() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "identity"); + } + bool isFullyConnected() const noexcept { + return InferenceEngine::details::CaselessEq()(layer->type, "FullyConnected") || + InferenceEngine::details::CaselessEq()(layer->type, "InnerProduct"); + } + bool isConvolutional() const noexcept { + return InferenceEngine::details::CaselessEq()(layer->type, "Convolution"); + } + bool isSplit() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "split"); + } + bool isSlice() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "slice"); + } + bool isConcat() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "concat"); + } + bool isReshape() const noexcept { + IS_VALID(); + return 
InferenceEngine::details::CaselessEq()(layer->type, "reshape"); + } + bool isPermute() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "permute"); + } + bool isPooling() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "Pooling"); + } + bool isMaxPooling() const noexcept { + IS_VALID(); + if (!isPooling()) return false; + return as()->_type == InferenceEngine::PoolingLayer::MAX; + } + bool isMemory() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "memory"); + } + bool isCrop() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "crop"); + } + bool isCropAffined() const noexcept { + auto cropLayer = dynamic_cast (layer); + size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); + return (ALIGN(cropOffset, 8) != cropOffset); + } + bool isCopy() const noexcept { + IS_VALID(); + return InferenceEngine::details::CaselessEq()(layer->type, "copy"); + } + size_t paddingSize() const noexcept { + static InferenceEngine::details::caseless_set layersWithPossiblePadding = {"FullyConnected", + "InnerProduct", + "Pooling", + "Convolution"}; + if (layersWithPossiblePadding.find(layer->type) != layersWithPossiblePadding.end()) { + size_t size_without_padding = 0; + auto inputs = layer->insData.begin()->lock(); + if (inputs) { + size_without_padding = InferenceEngine::details::product(begin(inputs->dims), + end(inputs->dims)); + } + return ALIGN(size_without_padding, 8) - size_without_padding; + } + return 0; + } + template + typename std::enable_if::value, T>::type as() noexcept { + return dynamic_cast(layer); + } + template + typename std::enable_if::value, T>::type as() const noexcept { + return dynamic_cast(layer); + } + operator InferenceEngine::CNNLayer *() noexcept { + return layer; + } + operator const InferenceEngine::CNNLayer *() const noexcept { + return layer; + } + operator InferenceEngine::CNNLayerPtr () const noexcept { + return std::shared_ptr(layer, [] (InferenceEngine::CNNLayer * p) {}); + } + + #undef IS_VALID +}; + +inline std::ostream & operator <<(std::ostream &os, const LayerInfo & info) { + os << static_cast(info)->name; + return os; +} + +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_mem_requests.hpp b/inference-engine/src/gna_plugin/gna_mem_requests.hpp new file mode 100644 index 00000000000000..24163dc4f5a16d --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_mem_requests.hpp @@ -0,0 +1,175 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include +#include + +namespace GNAPluginNS { + +enum rType { + REQUEST_STORE, + REQUEST_ALLOCATE, + REQUEST_BIND, + REQUEST_INITIALIZER, +}; +/** + * @brief region of firmware data + */ +enum rRegion { + REGION_RO, + REGION_RW, + REGION_AUTO, +}; + +struct MemRequest { + rType _type; + rRegion _region; + void *_ptr_out; + const void *_ptr_in = nullptr; + std::function _initializer; + // holds arbitrary value + std::vector _data; + uint8_t _element_size; + size_t _num_elements; + size_t _alignment; + size_t _offset; + // expansion in bytes due to large depended layers + size_t _padding = 0; + MemRequest(rRegion region, + rType req, + void *ptr_out, + const void *ptr_in, + uint8_t element_size = 0, + size_t num_elements = 0, + size_t alignment = 1, + size_t offset = 0) : _region(region), + _type(req), + _ptr_out(ptr_out), + _ptr_in(ptr_in), + 
_element_size(element_size), + _num_elements(num_elements), + _alignment(alignment), + _offset(offset) {} + + /** + * Store value only request + * @tparam T + * @param req + * @param ptr_out + * @param element + * @param num_elements + * @param alignment + */ + template + MemRequest(rRegion region, + void *ptr_out, + T element, + size_t num_elements, + size_t alignment = 1) : _region(region), + _type(REQUEST_STORE), + _ptr_out(ptr_out), + _element_size(sizeof(T)), + _num_elements(num_elements), + _alignment(alignment) { + _data.resize(sizeof(T)); + std::copy(reinterpret_cast(&element), reinterpret_cast(&element) + sizeof(T), _data.begin()); + } +/** + * Store initializer request + * @param req + * @param ptr_out + * @param element + * @param num_elements + * @param alignment + */ + MemRequest(rRegion region, + void *ptr_out, + size_t regionSize, + std::function initializer, + size_t alignment = 1) : _region(region), + _type(REQUEST_INITIALIZER), + _ptr_out(ptr_out), + _element_size(1), + _num_elements(regionSize), + _alignment(alignment), + _initializer(initializer) { + } +}; + +/** + * Adapter for requests submission and actual request queue + */ +class GNAMemRequestsQueue { + public: + virtual ~GNAMemRequestsQueue() {} + + /** + * @brief register an initializer to access memory once it is actually allocated + * @param ptr_out + * @param ptr_in + * @param num_bytes + * @param alignment + */ + void push_initializer(void *ptr_out, size_t num_bytes, std::function initializer, size_t alignment = 1) { + futureHeap().push_back({regionType(), ptr_out, num_bytes, initializer, alignment}); + } + + void push_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) { + futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, ptr_in, 1, num_bytes, alignment}); + } + + /** + * copy input to intermediate buffer + * @param ptr_out + * @param ptr_in + * @param num_bytes + */ + void push_local_ptr(void *ptr_out, const void *ptr_in, size_t num_bytes, size_t alignment = 1) { + localStorage().emplace_back(reinterpret_cast(ptr_in), + reinterpret_cast(ptr_in) + num_bytes); + futureHeap().push_back({regionType(), REQUEST_STORE, ptr_out, &localStorage().back().front(), 1, num_bytes, alignment}); + } + + /** + * + * @param ptr_out + * @param num_bytes + */ + void reserve_ptr(void *ptr_out, size_t num_bytes) { + futureHeap().push_back({regionType(), REQUEST_ALLOCATE, ptr_out, nullptr, 1, num_bytes}); + } + + /** + * + * @param source + * @param dest - source is bound to dest pointer after allocation + * @param offset - offset in bytes in source that will be set in dest + * @param num_bytes - bind can request a bigger buffer than originally allocated via reserve(), + * if that happens - reserved request parameters will be updated before committing memory + */ + void bind_ptr(void *source, const void *dest, size_t offset = 0, size_t num_bytes = 0) { + futureHeap().push_back({regionType(), REQUEST_BIND, source, dest, 1, num_bytes, 1, offset}); + } + /** + * @brief allocates a buffer and sets all its values to T value + */ + template + void push_value(void *ptr_out, T value, size_t num_elements, size_t alignment = 1) { + futureHeap().push_back({regionType(), ptr_out, value, num_elements, alignment}); + } + + /** + * @brief interface for actual queue storage + */ + virtual rRegion regionType() const = 0; + virtual std::vector & futureHeap() = 0; + virtual std::list> &localStorage() = 0; +}; + + + +} // namespace GNAPluginNS \ No newline at end of file diff --git
a/inference-engine/src/gna_plugin/gna_memory.hpp b/inference-engine/src/gna_plugin/gna_memory.hpp new file mode 100644 index 00000000000000..d1c96506bc036d --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_memory.hpp @@ -0,0 +1,227 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include "gna_mem_requests.hpp" +#include +#include +#include +#include +#include + +/** + * Pads memory size to given number of Bytes + * + * Please always use this padding macro for consistency + * + * @memSize size (in bytes) of memory to be padded + * @align number of bytes to pad + * @return memory size (int bytes) padded to given value + */ +#ifndef ALIGN +# define ALIGN(memSize, pad) (static_cast(((memSize) + pad -1) / pad) * pad) +#endif + +namespace GNAPluginNS { + + + +/** + * @brief encapsulate various request to allocate GNA specific memory, + * in order to issue single allocation call and configure actual pointers in requests + * @tparam Allocator - a GNAAllocator in case of actual HW offloads + */ +template> +class GNAMemory : public GNAMemRequestsQueue { + std::vector _future_heap; + std::list> _local_storage; + size_t _total = 0; + size_t _rw_section_size = 0; + size_t _ro_section_size = 0; + Allocator _allocator; + std::shared_ptr heap; + size_t _page_alignment = 1; + + class GNAMemRequestsReadOnlyQueue : public GNAMemRequestsQueue { + std::reference_wrapper _that; + public: + explicit GNAMemRequestsReadOnlyQueue(GNAMemory & that) : _that(that) { + } + rRegion regionType() const override { + return REGION_RO; + }; + std::vector & futureHeap() override { + return _that.get().futureHeap(); + } + std::list> &localStorage() override { + return _that.get().localStorage(); + } + }; + + GNAMemRequestsReadOnlyQueue readOnlyFrontEnd; + + public: + explicit GNAMemory(size_t pageAlignment = 1) + : readOnlyFrontEnd(*this), _page_alignment(pageAlignment) {} + + explicit GNAMemory(const Allocator &a, size_t pageAlignment = 1) + : _allocator(a), readOnlyFrontEnd(*this), _page_alignment(pageAlignment) {} + + GNAMemRequestsQueue & readonly() { + return readOnlyFrontEnd; + } + + /** + * @brief calculates size required for all requests, allocates memory and updates pointers + */ + void commit() { + // 1st stage -- looking for expandable bind requests: + for (auto &originated : _future_heap) { + if (originated._type == REQUEST_BIND) continue; + size_t offset = 0; + iterate_binded(originated, [&](MemRequest & reference, MemRequest & binded) { + if (&originated == &reference) { + offset = 0; + } + offset += binded._offset; + auto current = offset + ALIGN(binded._num_elements * binded._element_size, binded._alignment); + auto original_no_pad = ALIGN(originated._num_elements * originated._element_size, originated._alignment); + auto original_with_pad = ALIGN(originated._num_elements * originated._element_size + originated._padding, originated._alignment); + + originated._padding = ALIGN(std::max(original_with_pad, current), originated._alignment) - original_no_pad; + }); + } + + updateSectionsSizes(); + + _total = _rw_section_size + _ro_section_size; + + // allocation with memory setting to 0 internally + heap = allocate(_total); + auto setupOffsets = [&](std::function filter, size_t offset) { + for (auto &re : _future_heap) { + if (re._type == REQUEST_BIND) continue; + if (filter(re)) continue; + + auto sz = re._element_size * re._num_elements; + + if (re._ptr_out != nullptr) { + auto cptr = heap.get() + offset; + *reinterpret_cast(re._ptr_out) = cptr; + // 
std::cout << "ALLOCATED=" << cptr << ", size=" << re._element_size * re._num_elements << "\n"; + iterate_binded(re, [](MemRequest & reference, MemRequest & binded) { + *reinterpret_cast(binded._ptr_out) = + binded._offset + reinterpret_cast(*reinterpret_cast(reference._ptr_out)); + }); + + // std::cout << "size=" << ALIGN(sz, re._alignment) << "\n" << std::flush; + + switch (re._type) { + case REQUEST_ALLOCATE :break; + case REQUEST_STORE : { + if (re._ptr_in != nullptr) { + memcpy(cptr, re._ptr_in, sz); + } else { + size_t of = 0; + for (int i = 0; i < re._num_elements; i++, of += re._element_size) { + std::copy(std::begin(re._data), std::end(re._data), cptr + of); + } + } + break; + } + case REQUEST_INITIALIZER : { + re._initializer(cptr, sz); + break; + } + } + } + + offset += ALIGN(sz + re._padding, re._alignment); + } + }; + + setupOffsets([](MemRequest & request) { + return request._region != REGION_RW; + }, 0); + + setupOffsets([](MemRequest & request) { + return request._region != REGION_RO; + }, _rw_section_size); + } + + void *getBasePtr() { + return heap.get(); + } + + size_t getRWBytes() { + updateSectionsSizes(); + return _rw_section_size; + } + + size_t getTotalBytes() { + updateSectionsSizes(); + return _total; + } + + protected: + rRegion regionType() const override { + return REGION_RW; + }; + std::vector & futureHeap() override { + return _future_heap; + } + std::list> &localStorage() override { + return _local_storage; + } + + template + void iterate_binded(MemRequest & reference, const T & visitor) { + for (auto &re : _future_heap) { + if (re._type == REQUEST_BIND && re._ptr_in == reference._ptr_out) { + // std::cout << " [binded=" << re._ptr_out <<"]\n"; + visitor(reference, re); + // TODO: no circular dependency checking, only tree-style dependency supported + iterate_binded(re, visitor); + } + } + } + + + std::shared_ptr allocate(size_t bytes) { + std::shared_ptr sp(_allocator.allocate(bytes), [=](uint8_t *p) { + _allocator.deallocate(p, bytes); + }); + std::fill(sp.get(), sp.get() + bytes, 0); + return sp; + } + + protected: + void updateSectionsSizes() { + // count total size and size of read/write regions + _rw_section_size = 0; + _ro_section_size = 0; + for (auto &re : _future_heap) { + auto current = ALIGN(re._num_elements * re._element_size + re._padding, re._alignment); +#ifdef GNA_HEAP_PROFILER + std::cout << "chunk: " << " region: " << re._region << ", " << + "type: " << (re._type == REQUEST_STORE ? "store " : re._type == REQUEST_BIND ? 
"bind " : "alloc ") << + std::setw(10) << re._num_elements << ", " << + static_cast(re._element_size) << ", " << + re._padding << ", " << + re._offset << ", " << + re._alignment << std::endl; +#endif + if (re._type == REQUEST_BIND) continue; + + if (re._region == REGION_RW) { + _rw_section_size += current; + } else { + _ro_section_size += current; + } + } + _rw_section_size = ALIGN(_rw_section_size, _page_alignment); + _ro_section_size = ALIGN(_ro_section_size, _page_alignment); + } +}; +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_memory_state.hpp b/inference-engine/src/gna_plugin/gna_memory_state.hpp new file mode 100644 index 00000000000000..7edcb02e5bfd09 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_memory_state.hpp @@ -0,0 +1,25 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include "gna_plugin.hpp" + +namespace GNAPluginNS { + +class GNAMemoryState : public InferenceEngine::MemoryStateInternal { + std::shared_ptr plg; + public: + using Ptr = InferenceEngine::MemoryStateInternal::Ptr; + + explicit GNAMemoryState(std::shared_ptr plg) + : InferenceEngine::MemoryStateInternal("GNAResetState"), plg(plg) {} + void Reset() override { + plg->Reset(); + } +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_model_serial.cpp b/inference-engine/src/gna_plugin/gna_model_serial.cpp index 74f3af1125e1a0..3b14b8c81c4950 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.cpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -7,20 +7,12 @@ #include
#include #include -#include - -#if defined __INTEL_COMPILER || defined _MSC_VER -#include -#else +#ifndef _WIN32 #include #endif - -#include "gna_plugin.hpp" +#include #include "gna_model_serial.hpp" - -inline void writeNBytes(const void *ptr, uint32_t size, std::ostream & os) { - os.write(static_cast(ptr), size); -} +#include "gna_plugin_log.hpp" template inline void writeBits(const T & obj, std::ostream & os) { @@ -32,10 +24,6 @@ inline void readBits(T & obj, std::istream & is) { is.read(reinterpret_cast(&obj), sizeof(T)); } -inline void readNBytes(void * ptr, uint32_t size, std::istream & is) { - is.read(reinterpret_cast(ptr), size); -} - template inline void readNBits(T & obj, std::istream & is) { std::array tmp; @@ -44,15 +32,11 @@ inline void readNBits(T & obj, std::istream & is) { obj = * reinterpret_cast(&tmp.front()); } -inline void * offsetToPointer(void * const base, uint64_t offset) { - return reinterpret_cast(base) + offset; -} - template inline void readOffset(T & ptr, void *base, std::istream & is) { uint64_t offset = 0ull; readBits(offset, is); - ptr = reinterpret_cast(offsetToPointer(base, offset)); + ptr = reinterpret_cast(reinterpret_cast(base) + offset); } union { @@ -79,8 +63,8 @@ ModelHeader GNAModelSerial::ReadHeader(std::istream &is) { std::hex << std::setw(2) << static_cast(header.gnam[2]) << std::hex << std::setw(2) << static_cast(header.gnam[3]); } - if (header.version.major != HEADER_MAJOR) { - THROW_GNA_EXCEPTION << "Imported file unsupported: major version should be == " << HEADER_MAJOR; + if (header.version.major < 1) { + THROW_GNA_EXCEPTION << "Imported file unsupported: major version sould be > 1"; } if (header.headerSize < sizeof(header)) { THROW_GNA_EXCEPTION << "Unsupported header size minimal value is : " << sizeof (header) << ", but read: " << header.headerSize; @@ -96,228 +80,10 @@ ModelHeader GNAModelSerial::ReadHeader(std::istream &is) { return header; } -#define offsetFromBase(field)\ -getOffsetFromBase(field, #field) - -#if GNA_LIB_VER == 2 - -bool IsEmptyTensor(const Gna2Tensor& t) { - return t.Type == Gna2DataTypeNone && - t.Data == nullptr && - t.Layout[0] == '\0' && - t.Mode == Gna2TensorModeDefault && - t.Shape.NumberOfDimensions == 0; -} - -const std::map> GnaParamSize{ - {Gna2OperationTypeFullyConnectedAffine, {sizeof(Gna2BiasMode), sizeof(uint32_t)}}, - {Gna2OperationTypeConvolution, { - sizeof(Gna2Shape), - sizeof(Gna2BiasMode), - sizeof(Gna2PoolingMode), - sizeof(Gna2Shape), - sizeof(Gna2Shape), - sizeof(Gna2Shape)}}, - {Gna2OperationTypeCopy, {sizeof(Gna2Shape)}}, -}; - -void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream & is) { +void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream & is) { is.exceptions(std::istream::failbit); - for (auto operation = gna2Model->Operations; operation != gna2Model->Operations + gna2Model->NumberOfOperations; ++operation) { - readNBits<32>(operation->Type, is); - readBits(operation->NumberOfOperands, is); - operation->Operands = static_cast(gnaUserAllocator(sizeof(Gna2Tensor*) * operation->NumberOfOperands)); - for (uint32_t i = 0; i < operation->NumberOfOperands; i++) { - Gna2Tensor t{}; - readBits(t, is); - if (IsEmptyTensor(t)) { - operation->Operands[i] = nullptr; - } else { - operation->Operands[i] = static_cast(gnaUserAllocator(sizeof(Gna2Tensor))); - t.Data = offsetToPointer(basePointer, reinterpret_cast(t.Data)); - const_cast(*operation->Operands[i]) = t; - } - } - readBits(operation->NumberOfParameters, is); - switch (operation->Type) { - 
case Gna2OperationTypeElementWiseAffine: - case Gna2OperationTypeFullyConnectedAffine: - case Gna2OperationTypeConvolution: - case Gna2OperationTypeCopy: - break; - case Gna2OperationTypeRecurrent: - THROW_GNA_EXCEPTION << "Importing of recurrent operation not supported"; - case Gna2OperationTypeTransposition: - THROW_GNA_EXCEPTION << "Importing of transposition operation not supported"; - default: - THROW_GNA_EXCEPTION << "Importing of unknown GNA operation type(" << operation->Type << ") not supported"; - } - if (operation->NumberOfParameters > 0) - operation->Parameters = static_cast(gnaUserAllocator(sizeof(void*) * operation->NumberOfParameters)); - else - operation->Parameters = nullptr; - for (uint32_t i = 0; i < operation->NumberOfParameters; i++) { - uint32_t paramSize; - readBits(paramSize, is); - if (paramSize == 0) { - operation->Parameters[i] = nullptr; - continue; - } - operation->Parameters[i] = gnaUserAllocator(paramSize); - readNBytes(operation->Parameters[i], paramSize, is); - - if (GnaParamSize.at(operation->Type).size() <= i) { - THROW_GNA_EXCEPTION << "Cannot import parameter of index: " << i; - } - if (paramSize != GnaParamSize.at(operation->Type).at(i)) { - THROW_GNA_EXCEPTION << "Parameter size mismatch on import: " << i; - } - } - } - - // writing memory information - uint32_t nStates = 0; - readBits(nStates, is); - if (pstates != nullptr) { - pstates->resize(nStates); - } - - for (int i = 0; i != nStates; i++) { - void *pSegment; - readOffset(pSegment, basePointer, is); - uint32_t segmentSz; - readBits(segmentSz, is); - if (pstates) { - (*pstates)[i] = { pSegment, segmentSz }; - } - } - - - // once structure has been read lets read whole gna graph - is.read(reinterpret_cast(basePointer), gnaGraphSize); -} - - -uint32_t guessGrouping(Gna2Model const& model) { - if (model.NumberOfOperations == 0 || - model.Operations == nullptr || - model.Operations[0].Operands == nullptr || - model.Operations[0].NumberOfOperands == 0 || - model.Operations[0].Operands[0]->Shape.NumberOfDimensions < 2) { - THROW_GNA_EXCEPTION << "Can not guess grouping"; - } - return (std::min)(model.Operations[0].Operands[0]->Shape.Dimensions[0], model.Operations[0].Operands[0]->Shape.Dimensions[1]); -} - -void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const { - os.exceptions(std::ostream::failbit); - - const std::vector - layers(gna2Model->Operations, gna2Model->Operations + gna2Model->NumberOfOperations); - - - // all offsets will be from this pointer - auto getOffsetFromBase = [basePointer, &gnaGraphSize](void * pointer, const char * name = nullptr) { - auto offset = static_cast(std::distance(reinterpret_cast(basePointer), reinterpret_cast(pointer))); - if (offset > gnaGraphSize) { - THROW_GNA_EXCEPTION << "offset to " << (name == nullptr ? 
"" : name) << "(0x" << pointer - << ") not in range segment retuned from GNAAlloc(0x" << basePointer << "-0x" - << reinterpret_cast(reinterpret_cast(basePointer) + gnaGraphSize) << ")"; - } - return offset; - }; - - auto getTensorWithProperOffset = [&getOffsetFromBase](const Gna2Tensor& tensor) { - Gna2Tensor out = tensor; - out.Data = reinterpret_cast(getOffsetFromBase(tensor.Data)); - return out; - }; - - auto convert_to_serial = [getOffsetFromBase](const GNAModelSerial::RuntimeEndPoint& ep) { - ModelHeader::EndPoint out; - out.elements_count = ep.elements_count; - out.descriptor_offset = offsetFromBase(ep.descriptor_ptr); - out.scaleFactor = ep.scaleFactor; - out.element_size = ep.element_size; - return out; - }; - /** - * writing header - */ - ModelHeader header; - header.gnam[0] = 'G'; - header.gnam[1] = 'N'; - header.gnam[2] = 'A'; - header.gnam[3] = 'M'; - header.headerSize = sizeof(ModelHeader); - header.version.major = HEADER_MAJOR; - header.version.minor = HEADER_MINOR; - header.gnaMemSize = gnaGraphSize; - header.layersCount = layers.size(); - header.nGroup = guessGrouping(*gna2Model); - header.input = convert_to_serial(input); - header.output = convert_to_serial(output); - - header.nRotateRows = nRotateRows; - header.nRotateColumns = nRotateColumns; - - - writeBits(header, os); - - for (const auto & layer : layers) { - writeBits(static_cast(layer.Type), os); - writeBits(layer.NumberOfOperands, os); - - for (uint32_t i = 0; i < layer.NumberOfOperands; i++) { - if (layer.Operands[i] == nullptr) - writeBits(Gna2Tensor{}, os); - else - writeBits(getTensorWithProperOffset(*layer.Operands[i]), os); - } - - writeBits(layer.NumberOfParameters, os); - - // writing parameters - switch (layer.Type) { - case Gna2OperationTypeElementWiseAffine: - case Gna2OperationTypeFullyConnectedAffine: - case Gna2OperationTypeConvolution: - case Gna2OperationTypeCopy: - break; - case Gna2OperationTypeRecurrent: - THROW_GNA_EXCEPTION << "Exporting of recurrent operation not supported"; - case Gna2OperationTypeTransposition: - THROW_GNA_EXCEPTION << "Exporting of interleave operation not supported"; - default: - THROW_GNA_EXCEPTION << "Exporting of unknown GNA operation type(" << layer.Type << ") not supported"; - } - for (uint32_t i = 0; i < layer.NumberOfParameters; i++) { - if (layer.Parameters[i] == nullptr) { - writeBits(static_cast(0), os); - continue; - } - const auto paramSize = GnaParamSize.at(layer.Type).at(i); - writeBits(paramSize, os); - writeNBytes(layer.Parameters[i], paramSize, os); - } - } - // writing memory information - writeBits(static_cast(states.size()), os); - for (auto && state : states) { - writeBits(offsetFromBase(state.first), os); - writeBits(state.second, os); - } - - // once structure has been written lets push gna graph - os.write(reinterpret_cast(basePointer), gnaGraphSize); -} -#else - -void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream & is) { - is.exceptions(std::istream::failbit); - - auto readPwl = [&is, basePointer](intel_pwl_func_t & value) { + auto readPwl = [&is, basePointer] (intel_pwl_func_t & value) { readBits(value.nSegments, is); if (value.nSegments != 0) { readOffset(value.pSegments, basePointer, is); @@ -338,74 +104,60 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream // reading layers structs switch (layer->nLayerKind) { - case INTEL_AFFINE_DIAGONAL: - case INTEL_AFFINE: { - layer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); - if (layer->pLayerStruct == nullptr) { - 
THROW_GNA_EXCEPTION << "could not allocate memory for intel_affine_layer_t structure."; - } - - auto &affine = *reinterpret_cast(layer->pLayerStruct); - readBits(affine.affine.nBytesPerWeight, is); - readBits(affine.affine.nBytesPerBias, is); - readOffset(affine.affine.pWeights, basePointer, is); - readOffset(affine.affine.pBiases, basePointer, is); - readPwl(affine.pwl); - break; - } - case INTEL_CONVOLUTIONAL: { - layer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64); - if (layer->pLayerStruct == nullptr) { - THROW_GNA_EXCEPTION << "could not allocate memory for intel_convolutional_layer_t structure."; + case INTEL_AFFINE_DIAGONAL: + case INTEL_AFFINE: { + layer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64); + if (layer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION << "could not allocate memory for intel_affine_layer_t structure."; + } + + auto &affine = *reinterpret_cast(layer->pLayerStruct); + readBits(affine.affine.nBytesPerWeight, is); + readBits(affine.affine.nBytesPerBias, is); + readOffset(affine.affine.pWeights, basePointer, is); + readOffset(affine.affine.pBiases, basePointer, is); + readPwl(affine.pwl); + break; } - - auto &convolution = *reinterpret_cast(layer->pLayerStruct); - readBits(convolution.nFilterCoefficients, is); - readBits(convolution.nBytesFilterCoefficient, is); - readBits(convolution.nBytesBias, is); - readBits(convolution.nFilters, is); - readBits(convolution.nFeatureMaps, is); - readBits(convolution.nFeatureMapRows, is); - readBits(convolution.nFeatureMapColumns, is); - readBits(convolution.nFilterRows, is); - readOffset(convolution.pFilters, basePointer, is); - readOffset(convolution.pBiases, basePointer, is); - readBits(convolution.nPoolSize, is); - readBits(convolution.nPoolStride, is); - readBits(convolution.poolType, is); - readPwl(convolution.pwl); - break; - } - - case INTEL_COPY: { - layer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64); - if (layer->pLayerStruct == nullptr) { - THROW_GNA_EXCEPTION << "could not allocate memory for intel_copy_layer_t structure."; + case INTEL_CONVOLUTIONAL: { + layer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64); + if (layer->pLayerStruct == nullptr) { + THROW_GNA_EXCEPTION <<"could not allocate memory for intel_convolutional_layer_t structure."; + } + + auto &convolution = *reinterpret_cast(layer->pLayerStruct); + readBits(convolution.nFilterCoefficients, is); + readBits(convolution.nBytesFilterCoefficient, is); + readBits(convolution.nBytesBias, is); + readBits(convolution.nFilters, is); + readBits(convolution.nFeatureMaps, is); + readBits(convolution.nFeatureMapRows, is); + readBits(convolution.nFeatureMapColumns, is); + readBits(convolution.nFilterRows, is); + readOffset(convolution.pFilters, basePointer, is); + readOffset(convolution.pBiases, basePointer, is); + readBits(convolution.nPoolSize, is); + readBits(convolution.nPoolStride, is); + readBits(convolution.poolType, is); + readPwl(convolution.pwl); + break; } - auto © = *reinterpret_cast(layer->pLayerStruct); - readBits(copy.nCopyRows, is); - readBits(copy.nCopyCols, is); - break; - } - - case INTEL_RECURRENT: - THROW_GNA_EXCEPTION << "Importing of recurrent layer not supported"; - case INTEL_INTERLEAVE: - THROW_GNA_EXCEPTION << "Importing of interleave layer not supported"; - case INTEL_DEINTERLEAVE: - THROW_GNA_EXCEPTION << "Importing of deinterleave layer not supported"; - default: - THROW_GNA_EXCEPTION << "Importing of unknown GNA layer kind(" << layer->nLayerKind << ") not 
supported"; + case INTEL_RECURRENT: + THROW_GNA_EXCEPTION << "Importing of recurrent layer not supported"; + case INTEL_INTERLEAVE: + THROW_GNA_EXCEPTION << "Importing of interleave layer not supported"; + case INTEL_DEINTERLEAVE: + THROW_GNA_EXCEPTION << "Importing of deinterleave layer not supported"; + case INTEL_COPY: + THROW_GNA_EXCEPTION << "Importing of copy layer not supported"; + default: + THROW_GNA_EXCEPTION << "Importing of unknown GNA layer kind(" << layer->nLayerKind << ") not supported"; } // reading offsets of inputs/outputs readOffset(layer->pInputs, basePointer, is); - if (layer->nLayerKind == INTEL_COPY) { - layer->pOutputsIntermediate = nullptr; - } else { - readOffset(layer->pOutputsIntermediate, basePointer, is); - } + readOffset(layer->pOutputsIntermediate, basePointer, is); readOffset(layer->pOutputs, basePointer, is); } @@ -417,13 +169,13 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream } for (int i = 0; i != nStates; i++) { - void *pSegment; - readOffset(pSegment, basePointer, is); - uint32_t segmentSz; - readBits(segmentSz, is); - if (pstates) { - (*pstates)[i] = { pSegment, segmentSz }; - } + void *pSegment; + readOffset(pSegment, basePointer, is); + uint32_t segmentSz; + readBits(segmentSz, is); + if (pstates) { + (*pstates)[i] = {pSegment, segmentSz}; + } } @@ -431,6 +183,10 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream is.read(reinterpret_cast(basePointer), gnaGraphSize); } +#define offsetFromBase(field)\ +getOffsetFromBase(field, #field) + + /** * * @param ptr_nnet @@ -438,7 +194,6 @@ void GNAModelSerial::Import(void *basePointer, size_t gnaGraphSize, std::istream * about base adress it is relatively easy to calculate * @param os */ - void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostream & os) const { os.exceptions(std::ostream::failbit); @@ -451,7 +206,7 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea auto offset = static_cast(std::distance(reinterpret_cast(basePointer), reinterpret_cast(pointer))); if (offset > gnaGraphSize) { THROW_GNA_EXCEPTION << "offset to " << (name == nullptr ? "" : name) << "(0x" << pointer - << ") not in range segment returned from GNAAlloc(0x" << basePointer << "-0x" + << ") not in range segment retuned from GNAAlloc(0x" << basePointer << "-0x" << reinterpret_cast(reinterpret_cast(basePointer) + gnaGraphSize) << ")"; } return offset; @@ -536,28 +291,21 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea break; } - case INTEL_COPY: { - auto © = *reinterpret_cast(layer.pLayerStruct); - writeBits(copy.nCopyRows, os); - writeBits(copy.nCopyCols, os); - break; - } - case INTEL_RECURRENT: THROW_GNA_EXCEPTION << "Exporting of recurrent layer not supported"; case INTEL_INTERLEAVE: THROW_GNA_EXCEPTION << "Exporting of interleave layer not supported"; case INTEL_DEINTERLEAVE: THROW_GNA_EXCEPTION << "Exporting of deinterleave layer not supported"; + case INTEL_COPY: + THROW_GNA_EXCEPTION << "Exporting of copy layer not supported"; default: THROW_GNA_EXCEPTION << "Exporting of unknown GNA layer kind(" << layer.nLayerKind << ") not supported"; } // writing offsets from base. 
writeBits(offsetFromBase(layer.pInputs), os); - if (layer.nLayerKind != INTEL_COPY) { - writeBits(offsetFromBase(layer.pOutputsIntermediate), os); - } + writeBits(offsetFromBase(layer.pOutputsIntermediate), os); writeBits(offsetFromBase(layer.pOutputs), os); } // writing memory information @@ -570,5 +318,3 @@ void GNAModelSerial::Export(void * basePointer, size_t gnaGraphSize, std::ostrea // once structure has been written lets push gna graph os.write(reinterpret_cast(basePointer), gnaGraphSize); } - -#endif diff --git a/inference-engine/src/gna_plugin/gna_model_serial.hpp b/inference-engine/src/gna_plugin/gna_model_serial.hpp index 28dacfb306fe6b..0ba5be5ab16e55 100644 --- a/inference-engine/src/gna_plugin/gna_model_serial.hpp +++ b/inference-engine/src/gna_plugin/gna_model_serial.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -8,10 +8,6 @@ #include #include #include "gna-api.h" -#include "gna_plugin_log.hpp" -#if GNA_LIB_VER == 2 -#include "gna2-model-api.h" -#endif #pragma pack(push, 1) @@ -19,16 +15,10 @@ * version history * 1.0 - basic support * 1.1 - added memory information - * 2.0 - for use with GNA2 library */ -#if GNA_LIB_VER == 2 -#define HEADER_MAJOR 2 -#define HEADER_MINOR 0 -#else + #define HEADER_MAJOR 1 #define HEADER_MINOR 1 -#endif - /** * @brief Header version 1.0 @@ -64,10 +54,12 @@ struct ModelHeader { * @brief Number of GNA Layers */ uint64_t layersCount = 0ull; + /** * @brief Grouping level */ uint32_t nGroup = 0u; + /** * Convolution related setting - they are affecting input transformation */ @@ -108,25 +100,25 @@ struct ModelHeader { class GNAModelSerial { public: /* - * In runtime endpoint mostly same as in serial version, except of descriptor field + * In runtime endpoint mostly same as in serial version, except for descriptor field */ struct RuntimeEndPoint { /** * if scale factor is different then pased into infer , network might need to be requantized */ - float scaleFactor = 0; + float scaleFactor; /** * Pointer descriptor */ - void* descriptor_ptr = nullptr; + void* descriptor_ptr; /** * Endpoint resolution in bytes.
*/ - uint32_t element_size = 0; + uint32_t element_size; /** * Number of elements */ - uint32_t elements_count = 0; + uint32_t elements_count; RuntimeEndPoint() = default; RuntimeEndPoint(double scaleFactor, @@ -141,11 +133,7 @@ class GNAModelSerial { using MemoryType = std::vector>; private: -#if GNA_LIB_VER == 2 - Gna2Model * gna2Model; -#else intel_nnet_type_t *ptr_nnet; -#endif RuntimeEndPoint input, output; uint32_t nRotateRows = 0; uint32_t nRotateColumns = 0; @@ -153,41 +141,28 @@ class GNAModelSerial { MemoryType states, *pstates = nullptr; public: -#if GNA_LIB_VER == 2 - GNAModelSerial(Gna2Model * model, MemoryType & states_holder) - : gna2Model(model), pstates(&states_holder) { + /** + * + * @brief Used for import/export + * @param ptr_nnet + * @param inputScale - in/out parameter representing input scale factor + * @param outputScale - in/out parameter representing output scale factor + */ + GNAModelSerial(intel_nnet_type_t *ptr_nnet, MemoryType &states_holder) + : ptr_nnet(ptr_nnet) , pstates(&states_holder) { } + /** + * @brief used for export only since runtime params are not passed by pointer + * @param ptr_nnet + * @param runtime + */ GNAModelSerial( - Gna2Model * model, + intel_nnet_type_t *ptr_nnet, RuntimeEndPoint input, - RuntimeEndPoint output) : gna2Model(model), input(input), output(output) { + RuntimeEndPoint output) : ptr_nnet(ptr_nnet), input(input), output(output) { } -#else - /** - * - * @brief Used for import/export - * @param ptr_nnet - * @param inputScale - in/out parameter representing input scale factor - * @param outputScale - in/out parameter representing output scale factor - */ - GNAModelSerial(intel_nnet_type_t *ptr_nnet, MemoryType &states_holder) - : ptr_nnet(ptr_nnet), pstates(&states_holder) { - } - - /** - * @brief used for export only since runtime params are not passed by pointer - * @param ptr_nnet - * @param runtime - */ - GNAModelSerial( - intel_nnet_type_t *ptr_nnet, - RuntimeEndPoint input, - RuntimeEndPoint output) : ptr_nnet(ptr_nnet), input(input), output(output) { - } -#endif - GNAModelSerial & SetInputRotation(uint32_t nRotateRows, uint32_t nRotateColumns) { this->nRotateColumns = nRotateColumns; this->nRotateRows = nRotateRows; @@ -231,4 +206,4 @@ class GNAModelSerial { void Export(void *basePtr, size_t gnaGraphSize, std::ostream &os) const; -}; +}; \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/gna_plugin.cpp b/inference-engine/src/gna_plugin/gna_plugin.cpp index 1a961647832ec1..620aa489c1b175 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin.cpp @@ -1,13 +1,32 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #define NOMINMAX +#include "cpp_interfaces/base/ie_plugin_base.hpp" +#include "gna_plugin.hpp" +#include "ie_plugin_config.hpp" +#include "debug.h" +#include "blob_factory.hpp" +#include "gna_plugin_log.hpp" +#include "gna_layer_info.hpp" +#include +#include +#include "ie_memcpy.h" + +#ifdef PLOT +void ExportGnaNetworkAndrzej(const char *ptr_name, intel_nnet_type_t* pNeuralNetwork); +#endif -#include +#include +#include #include +#include +#include #include -#include +#include +#include +#include #include #include #include @@ -15,62 +34,87 @@ #include #include #include -#include -#include - -#include -#include -#include -#include +#include +#include +#include "details/caseless.hpp" +#include +#include "gna-api.h" +#include "gna-api-dumper.h" +#include "dnn.h" 
+#include "pwl.h" +#include "util.h" +#include "quantization/quantization.h" +#include "lstm.hpp" +#include "graph_tools.hpp" #include "gna_plugin_config.hpp" -#include -#include "gna_plugin.hpp" -#include "optimizer/gna_pass_manager.hpp" -#include "layers/gna_layer_type.hpp" -#include "preprocessing.hpp" -#include "frontend/weights_converter.hpp" -#include "frontend/model_quantizer.hpp" -#include "gna_fused_iterator.hpp" -#include "backend/am_intel_dnn.hpp" -#include "memory/gna_allocator.hpp" -#include "memory/gna_memory_state.hpp" +#include "gna/gna_config.hpp" +#include "quantization/model_quantizer.hpp" #include "gna_model_serial.hpp" +#include "gna_memory_state.hpp" +#include "details/ie_cnn_network_tools.h" -#if GNA_LIB_VER == 2 -#include - -uint32_t ToByteSize(const Gna2DataType type) { - switch (type) { - case Gna2DataTypeInt8: - case Gna2DataTypeUint8: - return 1; - case Gna2DataTypeInt16: - case Gna2DataTypeUint16: - return 2; - case Gna2DataTypeInt32: - case Gna2DataTypeUint32: - return 4; - case Gna2DataTypeInt64: - case Gna2DataTypeUint64: - return 8; - default: - return 0; - } -} - -constexpr uint32_t GNAPluginNS::GNAPlugin::FAKE_REQUEST_CONFIG_ID; -#endif using namespace InferenceEngine; using namespace std; using namespace GNAPluginNS; using namespace InferenceEngine::details; -#ifdef __clang__ -namespace InferenceEngine { - template<> - InferenceEngine::TBlob >::~TBlob() { free(); } +#ifdef VERBOSE +#define VERBOSE_LEVEL (1) +#else +#define VERBOSE_LEVEL (0) +#endif + +#ifdef PLOT +#define PLOT_LEVEL (1) +#else +#define PLOT_LEVEL (0) +#endif + + +#define PAGE_SIZE_BYTES 4096 + +#define FROM_IR_DIM(mem, idx)\ +((mem->dims.size() > idx - 1) ? mem->dims[idx - 1] : 1) + +inline int16_t GNAPluginNS::ConvertFloatToInt16(float src) { + float rounding_value = (src > 0) ? 
0.5f : -0.5f; + float value = src + rounding_value; + if (value > 32767.0) { + return 32767; + } else if (value < -32768.0) { + return -32768; + } + return (int16_t)value; +} + +void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst, + const float *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor) { + if (!ptr_dst || !ptr_src) { + return; + } + for (uint32_t i = 0; i < num_rows*num_columns; i++) { + ptr_dst[i] = GNAPluginNS::ConvertFloatToInt16(ptr_src[i]*scale_factor); + } +} +void GNAPluginNS::ConvertToFloat(float *ptr_dst, + int32_t *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor) { + if (!ptr_dst || !ptr_src) { + return; + } + for (uint32_t i = 0; i < num_rows; i++) { + int32_t *ptr_int_row = ptr_src + i * num_columns; + float *ptr_float_row = ptr_dst + i * num_columns; + for (uint32_t j = 0; j < num_columns; j++) { + ptr_float_row[j] = static_cast(ptr_int_row[j]) / scale_factor; + } + } } -#endif // __clang__ template void GNAPlugin::copyInputData(T *dst, @@ -79,8 +123,7 @@ void GNAPlugin::copyInputData(T *dst, uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride, - intel_dnn_orientation_t orientation, - float scaleFactor) { + intel_dnn_orientation_t orientation) { if (!dst || !src) { return; } @@ -88,7 +131,7 @@ void GNAPlugin::copyInputData(T *dst, for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_vector_elements; j++) { if (!std::is_same::value) { - dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * scaleFactor); + dst[j * num_group + i] = GNAPluginNS::ConvertFloatToInt16(src[i * num_vector_elements + j] * input_scale_factor); } else { dst[j * num_group + i] = src[i * num_vector_elements + j]; } @@ -107,26 +150,25 @@ void GNAPlugin::copyInputData(T *dst, } else { if (!std::is_same::value) { for (uint32_t i = 0; i < num_frames; i++) { - T *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride; - const U *ptr_src_vec = reinterpret_cast(src) + i * num_vector_elements; + T *ptr_dst_vec = const_cast(reinterpret_cast(dst) + i * num_vector_stride); + U *ptr_src_vec = const_cast(reinterpret_cast(src) + i * num_vector_elements); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); for (int j=0; j < num_vector_elements; j++) { - ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * scaleFactor); + ptr_dst_vec[j] = GNAPluginNS::ConvertFloatToInt16(ptr_src_vec[j] * input_scale_factor); } } } else { for (uint32_t i = 0; i < num_frames; i++) { - void *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride * sizeof(T); - const void *ptr_src_vec = reinterpret_cast(src) + i * num_vector_elements * sizeof(U); + void *ptr_dst_vec = const_cast(reinterpret_cast(dst) + i * num_vector_stride * sizeof(T)); + void *ptr_src_vec = const_cast(reinterpret_cast(src) + i * num_vector_elements * sizeof(U)); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); - ie_memcpy(ptr_dst_vec, num_vector_elements * sizeof(T), - ptr_src_vec, num_vector_elements * sizeof(T)); + std::memcpy(ptr_dst_vec, ptr_src_vec, num_vector_elements * sizeof(T)); } } for (uint32_t i = num_frames; i < num_group; i++) { - void *ptr_dst_vec = reinterpret_cast(dst) + i * num_vector_stride * sizeof(T); + void *ptr_dst_vec = const_cast(reinterpret_cast(dst) + i * num_vector_stride * sizeof(T)); std::memset(ptr_dst_vec, 0, num_vector_stride * sizeof(T)); } } @@ -136,8 +178,7 @@ template void GNAPlugin::copyInputDataWithSplit(T *const dst, 
const U *src, const GNASplitLayer& splitInfo, - size_t precision_size, - int idx) { + size_t precision_size) { if (!dst || !src) { return; } @@ -146,15 +187,11 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst, precision_size = sizeof(T); // we found split/slice layer connected to Input for (auto&& outputLayer : splitInfo.splitOutputLayers) { - uint32_t begin = outputLayer.offset / precision_size; + uint32_t begin = outputLayer.offset/precision_size; uint32_t end = (outputLayer.offset + outputLayer.pure_size)/precision_size; - if (dst_ptr - dst >= end) { - // output layer with bind pointer as previous one. Skip - continue; - } for (uint32_t i = begin; i < end; ++i) { if (!std::is_same::value) { - *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * inputsDesc->getScaleFactor(idx)); + *(dst_ptr++) = GNAPluginNS::ConvertFloatToInt16(*(src_ptr++) * input_scale_factor); } else { *(dst_ptr++) = *(src_ptr++); } @@ -167,7 +204,7 @@ void GNAPlugin::copyInputDataWithSplit(T *const dst, } void GNAPlugin::ExportScores(void *ptr_dst, - const void *ptr_src, + void *ptr_src, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, @@ -181,7 +218,7 @@ void GNAPlugin::ExportScores(void *ptr_dst, if (orientation == kDnnInterleavedOrientation) { if (num_bytes_per_element == 2) { int16_t *dst = reinterpret_cast(ptr_dst); - const int16_t *src = reinterpret_cast(ptr_src); + int16_t *src = reinterpret_cast(ptr_src); for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_active_elements; j++) { dst[i * num_vector_elements + j] = src[j * num_group + i]; @@ -192,7 +229,7 @@ void GNAPlugin::ExportScores(void *ptr_dst, } } else if (num_bytes_per_element == 4) { // should work for both int and float int32_t *dst = reinterpret_cast(ptr_dst); - const int8_t *src = reinterpret_cast(ptr_src); + int8_t *src = reinterpret_cast(ptr_src); for (uint32_t i = 0; i < num_frames; i++) { for (uint32_t j = 0; j < num_active_elements; j++) { auto input_ptr = src + (j * num_group + i) * num_bytes_per_element_input; @@ -200,11 +237,11 @@ void GNAPlugin::ExportScores(void *ptr_dst, switch (num_bytes_per_element_input) { case 2 : { - *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); + *dst_ptr = static_cast(*reinterpret_cast(input_ptr)); break; } case 4 : { - *dst_ptr = *reinterpret_cast(input_ptr); + *dst_ptr = *reinterpret_cast(input_ptr); break; } default: @@ -221,19 +258,17 @@ void GNAPlugin::ExportScores(void *ptr_dst, } else { if (num_bytes_per_element == 2) { for (uint32_t i = 0; i < num_frames; i++) { - auto ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(int16_t); - auto ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(int16_t); + void *ptr_dst_vec = reinterpret_cast (reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(int16_t)); + void *ptr_src_vec = reinterpret_cast (reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(int16_t)); memset(ptr_dst_vec, 0, num_vector_elements * sizeof(int16_t)); - ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(int16_t), - ptr_src_vec, num_active_elements * sizeof(int16_t)); + memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(int16_t)); } } else if (num_bytes_per_element == 4) { // should work for both int and float for (uint32_t i = 0; i < num_frames; i++) { - void *ptr_dst_vec = reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(float); - const void *ptr_src_vec = reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(float); + void 
*ptr_dst_vec = reinterpret_cast (reinterpret_cast(ptr_dst) + i * num_vector_elements * sizeof(float)); + void *ptr_src_vec = reinterpret_cast (reinterpret_cast(ptr_src) + i * num_vector_stride * sizeof(float)); memset(ptr_dst_vec, 0, num_vector_elements * sizeof(float)); - ie_memcpy(ptr_dst_vec, num_active_elements * sizeof(float), - ptr_src_vec, num_active_elements * sizeof(float)); + memcpy(ptr_dst_vec, ptr_src_vec, num_active_elements * sizeof(float)); } } else { THROW_GNA_EXCEPTION << "Unsupported target precision for infer : " << num_bytes_per_element << "bytes"; @@ -245,105 +280,1204 @@ void GNAPlugin::ImportFrames( void *ptr_dst, const void *ptr_src, Precision input_precision, - float scaleFactor, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride) { + // special case if split/slice layers connected + // with Input detected + auto it = split_connection.end(); + if (split_connection.size() != 0) { + it = std::find_if(split_connection.begin(), split_connection.end(), [] + (const std::pair &item) -> bool { + return CaselessEq()(item.second.splitInputLayer.name, "Input"); + }); + } if (orientation == kDnnInterleavedOrientation) { // TODO : fix that as well - if (input_precision == Precision::U8) { - auto src = reinterpret_cast(ptr_src); - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); - } else if (input_precision.size() == 2) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + if (input_precision.size() == 2) { + int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + int16_t *src = const_cast(reinterpret_cast(ptr_src)); + if (it != split_connection.end()) { + copyInputDataWithSplit(dst, src, it->second, input_precision.size()); + } else { + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } } else if (input_precision.size() == 4) { if (!gnadevice) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + float *dst = const_cast(reinterpret_cast(ptr_dst)); + float *src = const_cast(reinterpret_cast(ptr_src)); + if (it != split_connection.end()) { + copyInputDataWithSplit(dst, src, it->second, input_precision.size()); + } else { + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } } else { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + int16_t *dst = reinterpret_cast(ptr_dst); + const float *src = reinterpret_cast(ptr_src); + if (it != split_connection.end()) { + copyInputDataWithSplit(dst, src, it->second, input_precision.size()); + } else { + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); + } } } } else { - if (input_precision == Precision::U8) { - auto src = reinterpret_cast(ptr_src); + if (input_precision.size()== 2) { + int16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + int16_t *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, 
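// Illustrative sketch, not part of this patch (hypothetical buffers): the layout change
// behind the "interleaved" orientation handled by copyInputData and ExportScores above.
// Host frames are frame-major (frame i, element j lives at i * num_elements + j); the
// interleaved layout keeps the same value at j * num_group + i, so importing input is a
// transpose and exporting scores is the inverse transpose.
#include <cstdint>

static void interleave(const float *src, float *dst, uint32_t num_frames,
                       uint32_t num_group, uint32_t num_elements) {
    for (uint32_t i = 0; i < num_frames; ++i)
        for (uint32_t j = 0; j < num_elements; ++j)
            dst[j * num_group + i] = src[i * num_elements + j];
}

static void deinterleave(const float *src, float *dst, uint32_t num_frames,
                         uint32_t num_group, uint32_t num_elements) {
    for (uint32_t i = 0; i < num_frames; ++i)
        for (uint32_t j = 0; j < num_elements; ++j)
            dst[i * num_elements + j] = src[j * num_group + i];
}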
num_vector_stride, orientation); + } else if (input_precision.size() == 4) { if (!gnadevice) { - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + float *dst = const_cast(reinterpret_cast(ptr_dst)); + float *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } else { - auto dst = reinterpret_cast(ptr_dst); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + uint16_t *dst = const_cast(reinterpret_cast(ptr_dst)); + float *src = const_cast(reinterpret_cast(ptr_src)); + copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation); } + } + } +} - } else if (input_precision.size()== 2) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); - } else if (input_precision.size() == 4) { - if (!gnadevice) { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); +void GNAPlugin::fillMemoryConnections(std::map>& + memoryPairs) { + for (auto &memory : memoryPairs) { + auto inputLayer = memory.second[1]; + auto outputLayer = memory.second[0]; + + IE_ASSERT(1 == outputLayer->insData.size()); + + // creating connection for layers output as form of extramap + memory_connection.emplace_back(memory.first, GNAMemoryLayer(inputLayer, outputLayer)); + } +} + +void GNAPlugin::fillConcatConnections(InferenceEngine::CNNLayerPtr layer) { + // creating connection for each layer outputs as form of extramap + GNAPlugin::GNAConcatLayer layerInfoItem(layer); + size_t concat_size = 0; + std::string& id = layer->name; + + for (size_t i = 0; i < layer->insData.size(); ++i) { + auto dataInput = layer->insData[i].lock(); + if (!dataInput) { + THROW_GNA_EXCEPTION << "Input layer pointer for concat is unexpectedly absent"; + } + + auto ptrConcatLayerInput = dataInput->creatorLayer.lock(); + if (!ptrConcatLayerInput) { + THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent"; + } + layerInfoItem.concatInputLayers.emplace_back( + GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo({ptrConcatLayerInput->name, concat_size})); + + size_t layer_size = + InferenceEngine::details::product(begin(dataInput->dims), + end(dataInput->dims)) * dataInput->precision.size(); + concat_size += layer_size; + } + layerInfoItem.reserved_size = concat_size; + concat_connection.emplace(id, layerInfoItem); +} + +void GNAPlugin::fillSplitConnections(InferenceEngine::CNNLayerPtr layer) { + // creating connection for each layer inputs as form of extramap + GNAPlugin::GNASplitLayer layerInfoItem(layer); + size_t split_size = 0; + std::string& id = layer->name; + auto dataInput = layer->insData.begin()->lock(); + if (!dataInput) { + THROW_GNA_EXCEPTION << "Input layer pointer for split/slice is unexpectedly absent"; + } + auto ptrSplitLayerInput = dataInput->creatorLayer.lock(); + if (!ptrSplitLayerInput) { + THROW_GNA_EXCEPTION << "Input layer for split/slice is unexpectedly absent"; + } + + LayerInfo ptrSplitLayerInputLayerInfo(ptrSplitLayerInput); + for (size_t i = 0; i < layer->outData.size(); ++i) { + size_t padding = 0; + size_t layer_size = 0; + auto& 
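// Illustrative sketch, not part of this patch: the offset bookkeeping performed by
// fillConcatConnections above. Each concat input is assigned the running sum of the
// flattened byte sizes of the inputs before it, and the final sum becomes the reserved
// size of the shared concat buffer. ConcatInput and ConcatSlot are hypothetical types
// introduced only for this example.
#include <cstddef>
#include <functional>
#include <numeric>
#include <string>
#include <utility>
#include <vector>

struct ConcatInput { std::string name; std::vector<size_t> dims; size_t precision_size; };
struct ConcatSlot  { std::string name; size_t offset; };

static std::pair<std::vector<ConcatSlot>, size_t>
layoutConcat(const std::vector<ConcatInput> &inputs) {
    std::vector<ConcatSlot> slots;
    size_t running = 0;
    for (const auto &in : inputs) {
        slots.push_back({in.name, running});                       // offset before this input is counted
        size_t bytes = std::accumulate(in.dims.begin(), in.dims.end(),
                                       size_t{1}, std::multiplies<size_t>()) * in.precision_size;
        running += bytes;                                          // next input starts right after
    }
    return {slots, running};                                       // running == reserved_size
}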
dataOutput = layer->outData[i]; + + if (!dataOutput || !dataInput) { + THROW_GNA_EXCEPTION << "Output layer pointer for split/slice is unexpectedly absent"; + } + + for (auto&& ptrSplitLayerOutputPair : dataOutput->getInputTo()) { + auto& ptrSplitLayerOutput = ptrSplitLayerOutputPair.second; + if (!ptrSplitLayerOutput) { + THROW_GNA_EXCEPTION << "Output layer for split/slice is unexpectedly absent"; + } + + padding = std::max(padding, LayerInfo(ptrSplitLayerOutput).paddingSize()) + * dataOutput->precision.size(); + layer_size = + InferenceEngine::details::product(begin(dataOutput->dims), + end(dataOutput->dims)) * dataOutput->precision.size(); + + layerInfoItem.splitOutputLayers.emplace_back(ptrSplitLayerOutput->name, split_size, layer_size); + } + + split_size += ptrSplitLayerInputLayerInfo.isInput() ? + ALIGN64(padding + layer_size): + padding + layer_size; + } + layerInfoItem.reserved_size = split_size; + layerInfoItem.splitInputLayer = + GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo({ptrSplitLayerInput->type, 0, + InferenceEngine::details::product(begin(dataInput->dims), + end(dataInput->dims)) * dataInput->precision.size()}); + split_connection.emplace(id, layerInfoItem); +} + +void GNAPlugin::DiagonalPrimitive(InferenceEngine::CNNLayerPtr layer) { + AffinePrimitive(layer, true); +} + +void GNAPlugin::ConvolutionPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &convolution = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_feature_map_rows = FROM_IR_DIM(inputs, 1) / convolution._stride_x; + uint32_t num_feature_map_columns = FROM_IR_DIM(inputs, 3) * convolution._stride_x / num_feature_maps; + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 3); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_padding = ALIGN(convolution._kernel_x * num_feature_map_columns * num_feature_maps, 8) + - convolution._kernel_x * num_feature_map_columns * num_feature_maps; + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + // TODO: questionable why for biases that are no in IR we inventing precision + auto biasPrecision = convolution._biases ? convolution._biases->precision() : outputs->precision; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n"; +#endif + auto num_input_padding = ALIGN(num_feature_maps * num_feature_map_columns * num_feature_map_rows, 8) + - num_feature_maps * num_feature_map_columns * num_feature_map_rows; + auto num_filter_rows = convolution._kernel_x / convolution._stride_x; + dnn.InitConvolutional1DComponent(currentComponent, + 1, + num_feature_maps * num_feature_map_columns * num_feature_map_rows + num_input_padding, + 1, + num_rows_out * convolution._out_depth, + inputs->precision.size(), + outputs->precision.size(), + convolution._weights->precision().size(), + biasPrecision.size(), + convolution._out_depth, + num_filter_rows, + num_feature_maps * num_feature_map_columns * num_filter_rows + num_padding, + + num_feature_maps, // interesting - why this is so in gna_example + num_feature_map_rows, + num_feature_map_columns, + + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
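// Illustrative sketch, not part of this patch, assuming ALIGN(x, n) rounds x up to the
// next multiple of n and ALIGN64 does the same with n = 64. This is the rounding used
// above for split offsets (ALIGN64 of the input slice size) and for row padding such as
// num_padding = ALIGN(num_rows_in, 8) - num_rows_in.
#include <cstdint>

static inline uint32_t align_up(uint32_t value, uint32_t boundary) {
    return ((value + boundary - 1) / boundary) * boundary;         // e.g. align_up(100, 64) == 128
}

static inline uint32_t padding_for(uint32_t count, uint32_t boundary = 8) {
    return align_up(count, boundary) - count;                      // e.g. padding_for(10) == 6, padding_for(16) == 0
}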
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases); + + // update num_feature_maps for next convolutional layer + num_feature_maps = convolution._out_depth; // = number of filters + + size_t num_data_bytes_out = + InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size(); + + auto connectedInputLayer = connectInput(layer, ptr_inputs, num_data_bytes_in).input; + + // TODO: convolution might be not the first layer in sorted order but connected via split for example - dont know how kaldi will handle that + if (LayerInfo(connectedInputLayer).isInput()) { + // Kaldi features are opposite orientation + dnn.num_rotate_rows = num_feature_map_columns; + dnn.num_rotate_columns = num_feature_map_rows; + } + + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + // rotate + auto TransposeMatrix = [](uint8_t *ptr_matrix, size_t element_size, uint32_t num_rows, uint32_t num_cols) { + std::vector temp_buffer(num_rows * num_cols * element_size); + for (uint32_t i = 0; i < num_rows; i++) { + for (uint32_t j = 0; j < num_cols; j++) { + ie_memcpy(&temp_buffer.front() + (j*num_rows + i)*element_size, + temp_buffer.size() - (i * num_cols + j) * element_size, + ptr_matrix + (i*num_cols+j)*element_size, + element_size); + } + } + return temp_buffer; + }; + + std::vector transposedWeights; + for (uint32_t k = 0; k < convolution._out_depth; k++) { + uint8_t *ptr_filt_current + = convolution._weights->cbuffer().as() + k * num_columns_in * convolution._kernel[X_AXIS] * convolution.precision.size(); + auto transposedPart = TransposeMatrix(ptr_filt_current, convolution.precision.size(), num_columns_in, convolution._kernel[X_AXIS]); + transposedWeights.insert(transposedWeights.end(), transposedPart.begin(), transposedPart.end()); + } + + if (num_padding == 0) { + gnamem->readonly().push_local_ptr(ptr_weights, transposedWeights.data(), convolution._weights->byteSize(), 64); + } else { + auto elementsIn = convolution._kernel_x * num_feature_map_columns + num_padding; + auto paddedWeights = elementsIn * convolution._out_depth; + auto paddedWeightsSize = paddedWeights * convolution.precision.size(); + auto elements_in_row = convolution._kernel_x * num_feature_map_columns; + gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) { + for (int i = 0; i < convolution._out_depth; i++) { + memcpy(data, + transposedWeights.data() + elements_in_row * i * convolution.precision.size(), + elements_in_row * convolution.precision.size()); + + data = reinterpret_cast(data) + elementsIn * convolution.precision.size(); + } + }, 64); + } + + if (convolution._biases) { + gnamem->readonly().push_ptr(ptr_biases, + convolution._biases->cbuffer().as(), + convolution._biases->byteSize(), + 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + +void GNAPlugin::PowerPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &power = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + if (power.power != 1.0) { + THROW_IE_EXCEPTION << "[GNA plugin] unsupported power factor, expected 1 but was " << power.power; + } + + auto input = layer->insData[0].lock(); + + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(input, 1); + uint32_t num_columns_in = FROM_IR_DIM(input, 2); + uint32_t num_rows_out = 
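// Illustrative sketch, not part of this patch: the per-filter transpose performed by the
// TransposeMatrix lambda in ConvolutionPrimitive above. Element (i, j) of a rows x cols
// matrix is copied to slot (j, i) of the result; working on raw bytes with element_size
// lets the same routine handle int16 and float weights.
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

static std::vector<uint8_t> transpose_matrix(const uint8_t *src, size_t element_size,
                                             uint32_t rows, uint32_t cols) {
    std::vector<uint8_t> dst(static_cast<size_t>(rows) * cols * element_size);
    for (uint32_t i = 0; i < rows; ++i) {
        for (uint32_t j = 0; j < cols; ++j) {
            std::memcpy(dst.data() + (static_cast<size_t>(j) * rows + i) * element_size,
                        src + (static_cast<size_t>(i) * cols + j) * element_size,
                        element_size);
        }
    }
    return dst;
}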
num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in, + num_columns_in, + num_rows_out, + input->precision.size(), + outputs->precision.size(), + // TODO: only fp32 and Int16 tested + quantized == nullptr ? input->precision.size() : 2, + quantized == nullptr ? input->precision.size() : 4, + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + true); + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n"; +#endif + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = InferenceEngine::details::product(begin(input->dims), end(input->dims)) + * input->precision.size(); + + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); + + if (power.scale != 1.0f) { + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_weights, power.scale, num_rows_out, 64); + } else { + auto scaledIdentity = quantized->_weights_quant.scale * power.scale; + + #define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) + + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } + } + + if (power.offset != 0.0f) { + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + +void GNAPlugin::PoolingPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &pooling = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 3); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_columns_out = FROM_IR_DIM(outputs, 3); + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << dnnComponentsForLayer.size() - 1 << "\n"; +#endif + switch (pooling._type) { + case PoolingLayer::MAX: break; + // we are loosing precision here + case PoolingLayer::AVG: + default: + // TODO: convert to SUMM pooling + THROW_GNA_EXCEPTION << "Layer :" << layer->name << " not supported"; + } + + dnn.InitMaxpoolComponent(currentComponent, + 1, + num_columns_in * num_rows_in , + 1, + num_columns_out * num_rows_out, + inputs->precision.size(), + outputs->precision.size(), + pooling._kernel[X_AXIS], + pooling._kernel[X_AXIS], + num_columns_in, + false, + quantized == nullptr ? 
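// Illustrative sketch, not part of this patch, assuming the diagonal affine component
// built by PowerPrimitive and DiagonalPrimitive applies its weights element-wise rather
// than as a full matrix. Under that assumption a Power layer with power == 1 becomes
// output[i] = scale * input[i] + bias[i], which is why the code above fills num_rows_out
// copies of power.scale (or its quantized equivalent) into the weight buffer.
#include <cstddef>

static void diagonal_affine(const float *input, float *output,
                            const float *weights, const float *biases, size_t rows) {
    for (size_t i = 0; i < rows; ++i)
        output[i] = weights[i] * input[i] + biases[i];
}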
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs); + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); +} + +void GNAPlugin::CopyPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_columns_out = FROM_IR_DIM(outputs, 2); + uint32_t num_padding_in = ALIGN(num_rows_in, 8) - num_rows_in; + uint32_t num_padding_out = ALIGN(num_rows_out, 8) - num_rows_out; + void *ptr_inputs; + void *ptr_outputs; + auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitCopyComponent(currentComponent, + orientation, + num_rows_in + num_padding_in, + num_columns_in, + num_rows_out + num_padding_out, + num_columns_out, + inputs->precision.size(), + outputs->precision.size(), + quantized == nullptr ? 1 : quantized->_dst_quant.scale, + num_rows_out + num_padding_out, + num_columns_out, + ptr_inputs, + ptr_outputs); + + size_t num_data_bytes_out = ALIGN(InferenceEngine::details::product( + begin(outputs->dims), end(outputs->dims)), 8) + * outputs->precision.size(); + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding_in) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); +} + +void GNAPlugin::ConcatPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto concatLayer = dynamic_cast (layer.get()); + + if (concatLayer == nullptr) { + return; + } + if (concatLayer->insData.size() != 2) { + THROW_GNA_EXCEPTION << "Concat layer has unsupported number of incoming layers."; + } + + auto prevInput0 = concatLayer->insData[0].lock(); + auto prevInput1 = concatLayer->insData[1].lock(); + if (!prevInput0 || !prevInput1) { + THROW_GNA_EXCEPTION << "Input layer for concat is unexpectedly absent"; + } + if (prevInput0->precision.size() != prevInput1->precision.size()) { + THROW_GNA_EXCEPTION << "Different precision for Concat input layers are not supported"; + } + + for (auto &&outLayer : concatLayer->outData.front()->getInputTo()) { + if ( LayerInfo(outLayer.second).isConcat() ) { + auto& concatLayerInfo = concat_connection.find(concatLayer->name)->second; + connectOutput(layer, &concatLayerInfo.gna_ptr, + &concatLayerInfo.gna_ptr, concatLayerInfo.reserved_size); + } + } +} + +void GNAPlugin::CropPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto cropLayer = dynamic_cast (layer.get()); + + if (cropLayer == nullptr) { + return; + } + if (cropLayer->axis.size() > 1) { + THROW_GNA_EXCEPTION << + "Crop layer does not support the number of cropped dimentions = " + << cropLayer->axis.size() << "."; + } + + auto quantized = InferenceEngine::getInjectedData(layer); + size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); + size_t cropSize = cropLayer->dim.back() * 
cropLayer->precision.size(); + + if (ALIGN(cropOffset, 8) == cropOffset) { + // leave crop as it is + GNAPlugin::GNACropLayer cropLayerInfoItem(layer); + std::string& id = layer->name; + crop_connection.emplace(id, cropLayerInfoItem); + auto cropLayerInfo = crop_connection.find(cropLayer->name); + + if (cropLayerInfo == crop_connection.end()) { + THROW_GNA_EXCEPTION << + "Item is not in the storage but it was added recently...\n"; + } + + // calculate index idx for connectInput last parameter + connectInput(layer, &cropLayerInfo->second.gna_ptr, cropSize + cropOffset, cropOffset, 0); + + // cases for certain output layers + for (auto &&outLayer : layer->outData.front()->getInputTo()) { + auto& nextLayer = outLayer.second; + if ( LayerInfo(nextLayer).isConcat() ) { + connectOutput(layer, &cropLayerInfo->second.gna_ptr, &cropLayerInfo->second.gna_ptr, cropSize); + } + } + } else { + gnalog() << "Crop " << layer->name << " is being replaced by Affine layer...\n"; + auto outputs = *layer->outData.begin(); + auto inputs = layer->insData.begin()->lock(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = FROM_IR_DIM(outputs, 1); + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in + num_padding, + num_columns_in, + num_rows_out, + inputs->precision.size(), + 4, + quantized == nullptr ? inputs->precision.size() : 2, + 4, + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + false); + + size_t num_data_bytes_out = + InferenceEngine::details::product( + begin(outputs->dims), end(outputs->dims)) * 4; + + size_t num_data_bytes_in = num_columns_in * + (num_rows_in + num_padding) * inputs->precision.size(); + + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 0); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + gnamem->readonly().push_initializer(ptr_weights, num_rows_out * (num_rows_in + num_padding)*layer->precision.size(), [=](void * data, size_t size) { + int out = 0; + for (int input = cropLayer->offset.back(); input < num_rows_out + cropLayer->offset.back(); ++input) { + auto mem_ptr = reinterpret_cast(data) + input * layer->precision.size() + out * (num_rows_in+num_padding) * layer->precision.size(); + if (quantized == nullptr) { + auto float_ptr = reinterpret_cast(mem_ptr); + *float_ptr = 1.0f; + } else { + auto int_ptr = reinterpret_cast(mem_ptr); + *int_ptr = 1; + } + ++out; + } + }, 64); + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } + } +} + +void GNAPlugin::SplitPrimitive(InferenceEngine::CNNLayerPtr layer) { +// Nothing to do +} + +void GNAPlugin::SlicePrimitive(InferenceEngine::CNNLayerPtr layer) { +// Nothing to do +} + +void GNAPlugin::EltwisePrimitive(InferenceEngine::CNNLayerPtr layer) { + auto &eltwise = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + // for eltwise should be one input of 4 bytes and one of 2 bytes - detecting that + auto inputs2Bytes = layer->insData[0].lock(); + auto inputs4Bytes = layer->insData[1].lock(); + + int biasesLayerIdx = 1; + + if (quantized) { + if (eltwise._operation == EltwiseLayer::Sum) { + if (inputs4Bytes->precision.size() != 4) { + std::swap(inputs4Bytes, inputs2Bytes); + biasesLayerIdx = 0; + } + IE_ASSERT(inputs2Bytes->precision.size() == 2); + IE_ASSERT(inputs4Bytes->precision.size() == 4); + } else { + // for mul both inputs should be 2 bytes precision + IE_ASSERT(inputs2Bytes->precision.size() == 2); + IE_ASSERT(inputs4Bytes->precision.size() == 2); + } + } + + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs4Bytes, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs4Bytes, 2); + uint32_t num_rows_out = num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + dnn.InitAffineComponent(currentComponent, + num_rows_in, + num_columns_in, + num_rows_out, + inputs2Bytes->precision.size(), + outputs->precision.size(), + // TODO: only fp32 and Int16 tested + quantized == nullptr ? inputs2Bytes->precision.size() : 2, + quantized == nullptr ? inputs4Bytes->precision.size() : 4, + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
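// Illustrative sketch, not part of this patch: the weight pattern written by the
// push_initializer above when an unaligned Crop is lowered to an affine layer. The
// matrix is num_rows_out x (num_rows_in + num_padding), all zeros except a single 1 per
// output row at column (offset + row), so the matrix-vector product copies exactly the
// cropped range of the input.
#include <cstddef>
#include <vector>

static std::vector<float> crop_as_affine_weights(size_t rows_in_padded, size_t rows_out, size_t offset) {
    std::vector<float> weights(rows_out * rows_in_padded, 0.0f);
    for (size_t out = 0; out < rows_out; ++out)
        weights[out * rows_in_padded + offset + out] = 1.0f;       // one-hot row selecting input[offset + out]
    return weights;
}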
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + true); + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << "diagonal_"<< dnnComponentsForLayer.size() - 1 << "\n"; +#endif + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs2Bytes->dims), end(inputs2Bytes->dims)) + * inputs2Bytes->precision.size(); + + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + connectInput(layer, ptr_inputs, num_data_bytes_in, 0, 1 - biasesLayerIdx); + + switch (eltwise._operation) { + case EltwiseLayer::Sum: + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_weights, 1.0f, num_rows_out, 64); + } else { + auto scaledIdentity = quantized->_weights_quant.scale; + + #define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) + + auto quantizedIdentity = FLOAT_TO_INT16(std::min(scaledIdentity, static_cast(INT16_MAX))); + gnamem->readonly().push_value(ptr_weights, quantizedIdentity, num_rows_out, 64); + } + connectInput(layer, ptr_biases, num_data_bytes_in, 0, biasesLayerIdx); + break; + + case EltwiseLayer::Prod: + if (quantized == nullptr) { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); } else { - auto dst = reinterpret_cast(ptr_dst); - auto src = reinterpret_cast(ptr_src); - copyInputData(dst, src, num_frames, num_group, num_vector_elements, num_vector_stride, orientation, scaleFactor); + gnamem->readonly().push_value(ptr_biases, 0, num_rows_out, 64); + } + connectInput(layer, ptr_weights, num_data_bytes_in, 0, biasesLayerIdx); + break; + + default: + THROW_GNA_EXCEPTION << "Unsupported eltwise operation: " << eltwise._operation; + } +} + +void GNAPlugin::AffinePrimitive(InferenceEngine::CNNLayerPtr layer, bool isDiag) { + auto &weightable = dynamic_cast(*layer.get()); + auto quantized = InferenceEngine::getInjectedData(layer); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + + uint32_t num_rows_in = FROM_IR_DIM(inputs, 1); + uint32_t num_columns_in = FROM_IR_DIM(inputs, 2); + uint32_t num_rows_out = isDiag ? num_rows_in : FROM_IR_DIM(outputs, 1); + uint32_t num_padding = ALIGN(num_rows_in, 8) - num_rows_in; + + void *ptr_inputs; + void *ptr_outputs; + void *ptr_weights; + void *ptr_biases; + + // TODO: questionable why for biases that are no in IR we inventing precision + auto biasPrecision = weightable._biases ? weightable._biases->precision() : outputs->precision; + + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + +#ifdef PLOT + cout << "IR layer : " << std::left << std::setw(20) << layer->name << (isDiag ? "diagonal_" : "affine_") << dnnComponentsForLayer.size() - 1 << "\n"; +#endif + + dnn.InitAffineComponent(currentComponent, + num_rows_in + num_padding, + num_columns_in, + num_rows_out, + inputs->precision.size(), + outputs->precision.size(), + weightable._weights->precision().size(), + biasPrecision.size(), + quantized == nullptr ? 1 : quantized->_weights_quant.scale, + quantized == nullptr ? 
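// Illustrative sketch, not part of this patch (hypothetical helpers): how the Sum/Prod
// switch above maps Eltwise onto the diagonal affine. For Sum the weights are a constant
// identity (1.0f, or the scaled and saturated int16 identity when quantized) and the
// second input is wired in as the bias vector; for Prod the biases are zero and the
// second input is wired in as the weights, so the same y[i] = w[i] * x[i] + b[i] kernel
// serves both operations.
#include <cstddef>

static void eltwise_sum(const float *a, const float *b, float *y, size_t n) {
    for (size_t i = 0; i < n; ++i) y[i] = 1.0f * a[i] + b[i];      // weights = identity, biases = b
}

static void eltwise_prod(const float *a, const float *b, float *y, size_t n) {
    for (size_t i = 0; i < n; ++i) y[i] = b[i] * a[i] + 0.0f;      // weights = b, biases = 0
}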
1 : quantized->_dst_quant.scale, + ptr_inputs, + ptr_outputs, + ptr_weights, + ptr_biases, + isDiag); + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = num_columns_in * (num_rows_in + num_padding) * inputs->precision.size(); + + auto connectionInfo = connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + auto transpose = false; + auto transposedRows = 0; + auto transposedCols = 0; + /** + * TODO: enable transpose correction between Conv/affine layers implement dedicated pass + * TF topologies have inplace permutes so we dont care + * kaldi topologies did this internally + */ + if (0 && connectionInfo.needTransposeWeights) { + gnalog() << "Transposing weights for layer: " << layer->name << "\n"; + // direct order is 0, 1, 2, 3, supported order is only 0,3,2,1 where dim 2 is usually equals to 1 + auto permuteOrder = connectionInfo.permute->GetParamAsInts("order"); + if (permuteOrder != vector({0, 3, 2, 1})) { + THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") << + ", but only support 0, 3, 2, 1"; + } + transpose = !isDiag; + transposedRows = connectionInfo.permute->input()->getDims()[3]; + transposedCols = connectionInfo.permute->input()->getDims()[1]; + } + + if (num_padding == 0) { + if (!transpose) { + gnamem->readonly().push_ptr(ptr_weights, + weightable._weights->cbuffer().as(), + weightable._weights->byteSize(), + 64); + } else { + // ToDO: write unit tests for transpose + gnamem->readonly().push_initializer(ptr_weights, weightable._weights->byteSize(), [=](void * data, size_t size) { + for (int k = 0; k < (isDiag ? 1 : num_rows_out); k++) { + auto rowOffset = k * transposedRows * transposedCols * weightable.precision.size(); + auto cbuffer = weightable._weights->cbuffer().as() + rowOffset; + auto u8Data = reinterpret_cast(data) + rowOffset; + for (int j = 0; j < transposedCols; j++) { + for (int i = 0; i < transposedRows; i++) { + auto offsetWrite = (transposedRows * j + i) * weightable.precision.size(); + auto offsetRead = (i * transposedCols + j) * weightable.precision.size(); + memcpy(u8Data + offsetWrite, cbuffer + offsetRead, weightable.precision.size()); + } + } + } + }, 64); + } + } else { + auto elementsIn = (num_rows_in + num_padding) * num_columns_in; + auto paddedWeights = isDiag ? elementsIn : elementsIn * num_rows_out; + auto paddedWeightsSize = paddedWeights * weightable.precision.size(); + + gnamem->readonly().push_initializer(ptr_weights, paddedWeightsSize, [=](void * data, size_t size) { + for (int i = 0; i < (isDiag ? 
1 : num_rows_out); i++) { + memcpy(data, + weightable._weights->cbuffer().as() + num_rows_in * i * weightable.precision.size(), + num_rows_in * weightable.precision.size()); + data = reinterpret_cast(data) + (num_rows_in + num_padding) * weightable.precision.size(); } + }, 64); + } + + if (weightable._biases) { + gnamem->readonly().push_ptr(ptr_biases, + weightable._biases->cbuffer().as(), + weightable._biases->byteSize(), + 64); + } else { + gnamem->readonly().push_value(ptr_biases, 0.0f, num_rows_out, 64); + } +} + +void GNAPlugin::PWLPrimitive(InferenceEngine::CNNLayerPtr layer) { + auto *generic = dynamic_cast(layer.get()); + std::string type; + std::vector ptr_pwl_segments; + uint32_t num_rows; + uint32_t num_columns; + void *ptr_inputs; + void *ptr_outputs; + + do { + if (generic == nullptr) { + type = layer->type; + break; + } + + if (CaselessEq()(layer->type, "activation")) { + type = generic->GetParamAsString("type"); + break; + } else { + type = layer->type; + break; } + } while (false); + + auto inputs = layer->insData.begin()->lock(); + auto outputs = *layer->outData.begin(); + auto quantized = InferenceEngine::getInjectedData(layer); + float output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; + + auto orientation = (num_cnn_rows_out > 0) ? kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; + + if (inputs->dims.size() == 4) { + num_columns = FROM_IR_DIM(inputs, 3) * FROM_IR_DIM(inputs, 1); + num_rows = 1; + } else { + num_columns = FROM_IR_DIM(inputs, 2); + num_rows = FROM_IR_DIM(inputs, 1); } + + size_t num_data_bytes_out = InferenceEngine::details::product(begin(outputs->dims), end(outputs->dims)) + * outputs->precision.size(); + + size_t num_data_bytes_in = InferenceEngine::details::product(begin(inputs->dims), end(inputs->dims)) + * inputs->precision.size(); + + static caseless_unordered_map supportedActivations = { + {"sigmoid", kActSigmoid}, + {"tanh", kActTanh}, + {"relu", kActRelu}, + {"leakyrelu", kActLeakyRelu}, + {"clamp", kActKaldiLstmClipping}, + {"identity", kActIdentity} + }; + + auto it = supportedActivations.find(type); + if (it == supportedActivations.end()) { + THROW_GNA_EXCEPTION << "Activation function type not yet supported: " << type; + } + auto activation_type = DnnActivation::fromType(it->second); + activation_type.negative_slope = (it->second == kActRelu) ? 
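// Illustrative sketch, not part of this patch: the padded weight packing used by
// AffinePrimitive above when num_padding != 0. Each source row of rows_in elements is
// copied into a destination row of rows_in + padding elements (the padding is left as
// zeros here), matching num_padding = ALIGN(num_rows_in, 8) - num_rows_in used earlier
// so that every uploaded row spans a multiple of 8 elements.
#include <cstddef>
#include <cstring>
#include <vector>

template <typename T>
static std::vector<T> pack_padded_rows(const T *src, size_t rows_out, size_t rows_in, size_t padding) {
    std::vector<T> dst(rows_out * (rows_in + padding), T{0});
    for (size_t i = 0; i < rows_out; ++i)
        std::memcpy(dst.data() + i * (rows_in + padding), src + i * rows_in, rows_in * sizeof(T));
    return dst;
}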
dynamic_cast(layer.get())->negative_slope : 0.0f; + + // TODO: need to take graph dependency instead of linear + auto &prevComponent = dnnComponentsForLayer.back().second; + dnnComponentsForLayer.emplace_back(layer->name, intel_dnn_component_t()); + auto ¤tComponent = dnnComponentsForLayer.back().second; + + intel_pwl_segment_t *ptr_pwl_segments_target = nullptr; + + if (!inputs->precision.is_float()) { + // TODO: generalize activation function code + // now that scale factors are known, create PWL approximations to activation functions + float input_scale_factor = dnn.OutputScaleFactor(prevComponent); + if (uniformPwlDesign) { + switch (activation_type) { + case kActSigmoid:ptr_pwl_segments.resize(SIGMOID_NUM_SEGMENTS); + break; + case kActTanh:ptr_pwl_segments.resize(TANH_NUM_SEGMENTS); + break; + case kActRelu:ptr_pwl_segments.resize(RELU_NUM_SEGMENTS); + break; + case kActLeakyRelu:ptr_pwl_segments.resize(RELU_NUM_SEGMENTS); + break; + case kActKaldiLstmClipping: + case kActIdentity:ptr_pwl_segments.resize(IDENTITY_NUM_SEGMENTS); + break; + case kActCustom: + default:THROW_GNA_EXCEPTION << "Activation function type not yet supported " << activation_type; + } + PwlDesign16(activation_type, + &*ptr_pwl_segments.begin(), + static_cast(ptr_pwl_segments.size()), + input_scale_factor, + output_scale_factor); + } else { + PwlDesignOpt16(activation_type, + ptr_pwl_segments, + input_scale_factor, + output_scale_factor); + } + ptr_pwl_segments_target = reinterpret_cast(&ptr_pwl_segments_target); + } + + dnn.InitPiecewiseLinearComponent(currentComponent, + activation_type, + orientation, + num_rows, + num_columns, + inputs->precision.size(), + outputs->precision.size(), + ptr_pwl_segments.size(), + output_scale_factor, + ptr_inputs, + ptr_outputs, + ptr_pwl_segments_target); +#ifdef PLOT +#define GET_ACTIVATION_NAME(name)\ +case name:\ + actName = #name;\ + break; + string actName = "unknown"; + switch (activation_type) { + GET_ACTIVATION_NAME(kActSigmoid); + GET_ACTIVATION_NAME(kActTanh); + GET_ACTIVATION_NAME(kActRelu); + GET_ACTIVATION_NAME(kActLeakyRelu); + GET_ACTIVATION_NAME(kActKaldiLstmClipping); + GET_ACTIVATION_NAME(kActIdentity); + } + cout << "IR layer : " << std::left << std::setw(20) << layer->name << actName << "_" << dnnComponentsForLayer.size() - 1 <<"\n"; +#endif + + connectInput(layer, ptr_inputs, num_data_bytes_in); + connectOutput(layer, ptr_outputs, ptr_inputs, num_data_bytes_out); + + if (ptr_pwl_segments_target != nullptr) { + gnamem->readonly().push_local_ptr(ptr_pwl_segments_target, + &ptr_pwl_segments.front(), + ptr_pwl_segments.size() * sizeof(intel_pwl_segment_t), + 64); + } +} + + +void GNAPlugin::PermutePrimitive(InferenceEngine::CNNLayerPtr layer) { + auto layerOrder = layer->GetParamAsInts("order"); + + if (layerOrder != vector({0, 3, 2, 1})) { + THROW_IE_EXCEPTION << "[GNA plugin] Unsupported permute order: was " << layer->GetParamAsString("order") << + ", but only support 0,3,2,1"; + } +} + +class LayersBuilder { + using CreatorFnc = std::function; + + public: + LayersBuilder(const std::vector &types, CreatorFnc callback) { + for (auto && str : types) { + getStorage()[str] = callback; + } + } + static caseless_unordered_map &getStorage() { + static caseless_unordered_map LayerBuilder; + return LayerBuilder; + } +}; + +#define CREATE(name) [](GNAPlugin *p, CNNLayerPtr l) {p->name(l);} +void SKIP(GNAPlugin*, CNNLayerPtr) {} + +void GNAPlugin::CreateLayerPrimitive(CNNLayerPtr layer) { + static const LayersBuilder layersBuilder[] = { + {{"Input"}, [](GNAPlugin*, 
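// Illustrative sketch, not part of this patch and not the intel_pwl_segment_t encoding:
// the idea behind the segment tables produced by PwlDesign16 / PwlDesignOpt16 above. An
// activation such as sigmoid or tanh is approximated by segments, each with a start
// abscissa, a start ordinate and a slope; evaluation finds the last segment whose start
// is <= x and extends it linearly. The real tables are integer-encoded with per-segment
// slope scales, which this float version deliberately omits.
#include <vector>

struct PwlSegment { float x_base; float y_base; float slope; };

static float pwl_eval(const std::vector<PwlSegment> &segments, float x) {
    // assumes at least one segment, sorted by x_base in ascending order
    const PwlSegment *active = &segments.front();
    for (const auto &s : segments)
        if (s.x_base <= x) active = &s;
    return active->y_base + active->slope * (x - active->x_base);
}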
CNNLayerPtr l) {}}, // skip input layers they are not used in GNA lib, only as a memory blobs + {{"FullyConnected", "InnerProduct"}, CREATE(AffinePrimitive)}, + {{"ScaleShift"}, CREATE(DiagonalPrimitive)}, + {{"Eltwise"}, + CREATE(EltwisePrimitive)}, // same as diagonal while weights are not taken from network, rather than from another output + {{"Split"}, SKIP}, // skip information about which part of prev layer need to consume handle during layer creation + {{"Slice"}, SKIP}, + {{"clamp", "sigmoid", "relu", "tanh", "identity"}, CREATE(PWLPrimitive)}, + {{"Convolution"}, CREATE(ConvolutionPrimitive)}, + {{"Permute"}, CREATE(PermutePrimitive)}, // permute of certain form (2D transpose) can be assimilated in followed FC layer + {{"Pooling"}, CREATE(PoolingPrimitive)}, + {{"Power"} , CREATE(PowerPrimitive)}, + {{"Concat"}, CREATE(ConcatPrimitive)}, + {{"Reshape"}, SKIP}, // TODO: handled not in GNA but rather in GNA plugin + {{"Crop"}, CREATE(CropPrimitive)}, + {{"Copy"}, CREATE(CopyPrimitive)}, + }; + auto it = LayersBuilder::getStorage().find(layer->type); + if (it != LayersBuilder::getStorage().end()) { + it->second(this, layer); + } else { + THROW_GNA_EXCEPTION << "Unsupported layer: " << layer->name << ":" << layer->type; + } +} + + +GNAPlugin::GNAPlugin(const std::map& configMap) { + // holds actual value of a found key + std::string value; + auto if_set = [&](std::string key, const std::function & handler) { + auto keyInMap = configMap.find(key); + if (keyInMap != configMap.end()) { + value = keyInMap->second; + handler(); + } + }; + + if_set(GNA_CONFIG_KEY(SCALE_FACTOR), [&] { + input_scale_factor = std::stod(value); + }); + + if_set(GNA_CONFIG_KEY(FIRMWARE_MODEL_IMAGE), [&] { + dumpXNNPath = value; + }); + + if_set(GNA_CONFIG_KEY(DEVICE_MODE), [&] { + static caseless_unordered_map supported_values = { + {GNAConfigParams::GNA_AUTO, GNA_AUTO}, + {GNAConfigParams::GNA_HW, GNA_HARDWARE}, + {GNAConfigParams::GNA_SW, GNA_SOFTWARE}, + {GNAConfigParams::GNA_SW_EXACT, GNA_SOFTWARE & GNA_HARDWARE} + }; + auto procType = supported_values.find(value); + if (procType == supported_values.end()) { + THROW_GNA_EXCEPTION << "GNA device mode unsupported: " << value; + } + gna_proc_type = static_cast(procType->second); + }); + + if_set(GNA_CONFIG_KEY(COMPACT_MODE), [&] { + if (value == PluginConfigParams::YES) { + compact_mode = true; + } else if (value == PluginConfigParams::NO) { + compact_mode = false; + } else { + THROW_GNA_EXCEPTION << "GNA compact mode should be YES/NO, but not" << value; + } + }); + + if_set(CONFIG_KEY(EXCLUSIVE_ASYNC_REQUESTS), [&] { + if (value == PluginConfigParams::YES) { + exclusive_async_requests = true; + } else if (value == PluginConfigParams::NO) { + exclusive_async_requests = false; + } else { + THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; + } + }); + + if_set(GNA_CONFIG_KEY(PRECISION), [&] { + auto precision = Precision::FromStr(value); + if (precision != Precision::I8 && precision != Precision::I16) { + THROW_GNA_EXCEPTION << "Unsupported precision of GNA hardware, should be Int16 or Int8, but was: " << value; + } + gnaPrecision = precision; + }); + + if_set(GNA_CONFIG_KEY(PWL_UNIFORM_DESIGN), [&] { + if (value == PluginConfigParams::YES) { + uniformPwlDesign = true; + } else if (value == PluginConfigParams::NO) { + uniformPwlDesign = false; + } else { + THROW_GNA_EXCEPTION << "GNA pwl uniform algorithm parameter " + << "should be equal to YES/NO, but not" << value; + } + }); + + if_set(CONFIG_KEY(PERF_COUNT), [&] { + if 
(value == PluginConfigParams::YES) { + performance_counting = true; + } else if (value == PluginConfigParams::NO) { + performance_counting = false; + } else { + THROW_GNA_EXCEPTION << "GNA performance counter enabling parameter " + << "should be equal to YES/NO, but not" << value; + } + }); + + if_set(GNA_CONFIG_KEY(LIB_N_THREADS), [&] { + uint64_t lib_threads = std::stoul(value, NULL, 10); + if (lib_threads == 0 || lib_threads > std::numeric_limits::max()/2-1) { + THROW_GNA_EXCEPTION << "Unsupported accelerator lib number of threads: " << value + << ", should be greateer than 0 and less than 127"; + } + gna_lib_async_threads_num = lib_threads; + }); + + if_set(CONFIG_KEY(SINGLE_THREAD), [&] { + if (value == PluginConfigParams::YES) { + gna_openmp_multithreading = false; + } else if (value == PluginConfigParams::NO) { + gna_openmp_multithreading = true; + } else { + THROW_GNA_EXCEPTION << "EXCLUSIVE_ASYNC_REQUESTS should be YES/NO, but not" << value; + } + }); +} + +GNAPluginNS::GNAPlugin::LayerType GNAPlugin::LayerTypeFromStr(const std::string &str) { + static const caseless_map LayerNameToType = { + { "Input" , Input }, + { "Convolution" , Convolution }, + { "ReLU" , ReLU }, + { "Sigmoid" , Sigmoid }, + { "TanH" , TanH }, + { "Pooling" , Pooling }, + { "FullyConnected" , FullyConnected }, + { "InnerProduct" , InnerProduct}, + { "Split" , Split }, + { "Slice" , Slice }, + { "Eltwise" , Eltwise }, + { "Reshape" , Reshape }, + { "ScaleShift" , ScaleShift }, + { "Clamp" , Clamp }, + { "Concat" , Concat }, + { "Copy", Copy }, + { "Permute" , Permute }, + { "Power" , Power}, + { "Memory" , Memory }, + { "Crop" , Crop } + }; + auto it = LayerNameToType.find(str); + if (it != LayerNameToType.end()) + return it->second; + else + return NO_TYPE; } -GNAPlugin::GNAPlugin() { - Init(); - UpdateFieldsFromConfig(); -} +bool GNAPlugin::AreLayersSupported(ICNNNetwork& network, std::string& errMessage) { + CNNLayerSet inputLayers; + InferenceEngine::InputsDataMap inputs; + std::unordered_set allLayers; + auto specifiedDevice = network.getTargetDevice(); + auto network_precision = network.getPrecision(); + network.getInputsInfo(inputs); + auto network_input_precision = inputs.begin()->second->getInputPrecision(); + auto batch_sise = network.getBatchSize(); + if (network_precision != Precision::FP32) { + errMessage = "The plugin does not support networks with " + std::string(network_precision.name()) + " format.\n"; + return false; + } + if (network_input_precision != Precision::FP32 && + network_input_precision != Precision::I16) { + errMessage = "The plugin does not support input precision with " + std::string(network_input_precision.name()) + " format.\n"; + return false; + } + if (specifiedDevice != InferenceEngine::TargetDevice::eCPU && + specifiedDevice != InferenceEngine::TargetDevice::eGNA && + specifiedDevice != InferenceEngine::TargetDevice::eDefault) { + errMessage = "The plugin does not support target device: " + std::string(getDeviceName(specifiedDevice)) + ".\n"; + return false; + } -GNAPlugin::GNAPlugin(const std::map& configMap) { - Init(); - SetConfig(configMap); -} + if (inputs.empty()) { + errMessage = "Network is empty (GNA)\n"; + return false; + } -void GNAPlugin::Init() { - dnn = std::make_shared(backend::AMIntelDNN()); - inputsDesc = std::make_shared(GNAPluginNS::InputDesc()); - gnaFlags = std::make_shared(GNAPluginNS::GNAFlags()); + auto & secondLayers = inputs.begin()->second->getInputData()->getInputTo(); + if (secondLayers.empty()) { + errMessage = "Network consists of input layer 
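// Illustrative sketch, not part of this patch (hypothetical keys and fields): the if_set
// pattern used by the GNAPlugin constructor above. Each known key is looked up in the
// incoming string map; when present, a small lambda parses and validates the value and
// writes the typed field, and malformed values raise an error instead of being ignored.
#include <functional>
#include <map>
#include <stdexcept>
#include <string>

struct DemoConfig {
    float scale_factor = 1.0f;
    bool compact_mode = true;

    void parse(const std::map<std::string, std::string> &options) {
        std::string value;
        auto if_set = [&](const std::string &key, const std::function<void()> &handler) {
            auto it = options.find(key);
            if (it != options.end()) { value = it->second; handler(); }
        };
        if_set("SCALE_FACTOR", [&] { scale_factor = std::stof(value); });
        if_set("COMPACT_MODE", [&] {
            if (value == "YES") compact_mode = true;
            else if (value == "NO") compact_mode = false;
            else throw std::runtime_error("COMPACT_MODE should be YES/NO, but was " + value);
        });
    }
};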
only (GNA)\n"; + return false; + } - graphCompiler.setDNNPtr(dnn); - graphCompiler.setInputDescPtr(inputsDesc); - graphCompiler.setGNAFlagsPtr(gnaFlags); -} + bool check_result = true; + InferenceEngine::details::UnorderedDFS(allLayers, + secondLayers.begin()->second, + [&](const CNNLayerPtr layer) { + if (LayerTypeFromStr(layer->type) == NO_TYPE) { + errMessage = "Layer is unsupported by GNA: " + layer->name + ":" + layer->type + "\n"; + check_result = false; + } + if (batch_sise != 1 && LayerInfo::isBatchSizeConstrained(layer->type)) { + check_result = false; + } + }, false); -void GNAPlugin::InitGNADevice() { -#if GNA_LIB_VER == 1 - gnadevice = std::make_shared(config.gna_proc_type, - gnaFlags->gna_lib_async_threads_num, - gnaFlags->gna_openmp_multithreading, - gnaFlags->performance_counting); -#else - gnadevice = std::make_shared(config.pluginGna2AccMode, - config.pluginGna2DeviceConsistent, - gnaFlags->gna_lib_async_threads_num, - gnaFlags->gna_openmp_multithreading, - gnaFlags->performance_counting); -#endif - size_t page_size_bytes = 4096; - gnamem = std::make_shared(memory::make_polymorph(gnadevice), page_size_bytes); - graphCompiler.setGNAMemoryPtr(gnamem); + return check_result; } void GNAPlugin::LoadNetwork(ICNNNetwork &network) { - // move blobs from Constant layers to Convolution, Deconvolution, FullyConnected layers attributes - BlobTransformation blobsTransformation; - blobsTransformation.transform(network, true); - // Check the input network std::string error; if (!AreLayersSupported(network, error)) { @@ -351,87 +1485,68 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { } // network optimisation phases - int passIdx = 0; - auto run_passes = [&] (const CNNNetPtr& network, bool runBeforeCopy) { - auto passes = make_shared(policy, network, runBeforeCopy); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - - passes->registerPass(); - passes->registerPass(); - - passes->registerPass(); - passes->registerPass(); - - passes->registerPass(); - passes->registerPass(); - if (policy.PermutePolicy != Policy::Permute::DISABLED) { - passes->registerPass(); - } - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passes->registerPass(); - passIdx = passes->run(passIdx); + auto run_passes = [&] (CNNNetPtr network) { + auto layers = CNNNetSortTopologically(*network.get()); + substitutePRelu(layers); + layers = CNNNetSortTopologically(*network.get()); + reorderMaxPool(layers); + applyOrientations(layers); + insertIdentityLayer(layers); + insertDiagonalLayer(layers); }; - ICNNNetwork::Ptr newNet; - if (gnaFlags->sw_fp32) { - auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { - transformLayer(lp, WeightsConverter()); - return lp; - }; - newNet = InferenceEngine::CNNNetCopy(network, visitor); - // to run all passes need to have two calls to pass manager - run_passes(newNet, true); - run_passes(newNet, false); - } else { - switch (config.gnaPrecision) { - case Precision::I16: - ModelQuantizer q16; - newNet = q16.quantize(network, run_passes, inputsDesc->inputScaleFactors); - break; - case Precision::I8: - ModelQuantizer q8; - newNet = q8.quantize(network, run_passes, inputsDesc->inputScaleFactors); - break; - default: - THROW_GNA_EXCEPTION << "no mans land for GNA precision"; - break; - } - } - - auto inputLayers = CNNNetGetAllInputLayers(*newNet); + Config supported = Config({ + {TargetDevice::eGNA, Precision::FP32, [&](InferenceEngine::ICNNNetwork &network) -> CNNNetworkPtr { + 
if (gnaPrecision == Precision::I16) { + ModelQuantizer q; + return q.quantize(network, run_passes, input_scale_factor); + } -#ifdef PLOT - std::ofstream file("gna_passes.dot"); - saveGraphToDot(*newNet, file, [](const CNNLayerPtr layer, - ordered_properties &printed_properties, - ordered_properties &node_properties) { - // printing quantized params - auto quantized = InferenceEngine::getInjectedData(layer); - if (!quantized) { - return; + if (gnaPrecision == Precision::I8) { + ModelQuantizer q; + return q.quantize(network, run_passes, input_scale_factor); + } + THROW_GNA_EXCEPTION << "no mans land for GNA precision"; + }}, + // TODO: need to have advanced precision matcher based on layers/biases + {TargetDevice::eGNA, Precision::MIXED}, + {TargetDevice::eGNA, Precision::I16}, + {TargetDevice::eCPU, Precision::FP32 +#define EMULATE_GNA_API_LAYERS +#ifdef EMULATE_GNA_API_LAYERS + , [&](InferenceEngine::ICNNNetwork & network) { + auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { + return lp; + }; + auto copiedNet = InferenceEngine::CNNNetCopy(network, visitor); + run_passes(copiedNet); + + return copiedNet; } - printed_properties.emplace_back( - "scale factor", std::to_string(quantized->_dst_quant.scale)); - }); #endif + } + }); - auto sortedNet = CNNNetSortTopologicallyEx(*newNet, make_fuzed_order); - - // passing policy to compiler - graphCompiler.setPolicy(policy); - - if (sortedNet.empty()) { - THROW_GNA_EXCEPTION << "Sorted network is empty"; + supported.setDefaultDevice(TargetDevice::eGNA); + auto newNet = supported.find_configuration(network).convert(network); + auto networkPrecision = newNet->getPrecision(); + + if (!networkPrecision.is_float()) { + gnadevice.reset(new GNADeviceHelper(gna_proc_type, + gna_lib_async_threads_num, + gna_openmp_multithreading, + performance_counting)); + gnamem.reset(new gna_memory_type( + make_polymorph(*gnadevice.get()), PAGE_SIZE_BYTES)); + } else { + gnamem.reset(new gna_memory_type(make_polymorph>())); } + // creating intel dnn_t structures from network + auto sortedNet = CNNNetSortTopologically(*newNet); std::vector sortedNoMem; - std::unordered_map> memoryPairs; + std::map> memoryPairs; // find all memory layers pairs and mark which one used as outputs for (auto &layer : sortedNet) { auto generic = dynamic_cast(layer.get()); @@ -447,199 +1562,88 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { memoryPairs[id][generic->GetParamAsInt("index")] = layer; continue; } else if (layerInfo.isConcat()) { - graphCompiler.fillConcatConnections(layer); + fillConcatConnections(layer); } else if (layerInfo.isSplit() || layerInfo.isSlice()) { - graphCompiler.fillSplitConnections(layer); + fillSplitConnections(layer); } sortedNoMem.push_back(layer); } // fill in extra storage with memory layers - graphCompiler.fillMemoryConnections(memoryPairs); - - if (!graphCompiler.memory_connection.empty()) { - gnaFlags->gna_lib_async_threads_num = 1; - } - - if (gnaFlags->sw_fp32) { - gnamem.reset(new gna_memory_type(memory::make_polymorph>())); - graphCompiler.setGNAMemoryPtr(gnamem); - } else { - InitGNADevice(); - } + fillMemoryConnections(memoryPairs); // keep inputs information and create input primitives newNet->getInputsInfo(inputsDataMap); if (inputsDataMap.empty()) { THROW_GNA_EXCEPTION << " No inputs for the topology"; } + if (inputsDataMap.size() != 1) { + THROW_GNA_EXCEPTION << " cannot infer topologies with more than one inputs"; + } + + inputDims = inputsDataMap.begin()->second->getDims(); // keep output dims newNet->getOutputsInfo(outputsDataMap); if 
(outputsDataMap.empty()) { THROW_GNA_EXCEPTION << "No outputs for the topology"; } - - for (auto && input : inputsDataMap) { - inputsDesc->getPtrInputsGlobal(input.first).resize(gnaFlags->gna_lib_async_threads_num); + if (outputsDataMap.size() != 1) { + THROW_GNA_EXCEPTION << "cannot infer topologies with more than one output"; } + outputDims = outputsDataMap.begin()->second->dims; + ptr_inputs_global.resize(gna_lib_async_threads_num); + ptr_outputs_global.resize(gna_lib_async_threads_num); // CreatingLayer primitives - for (auto & layer : sortedNoMem) { - graphCompiler.CreateLayerPrimitive(layer); - } - for (auto& inputLayer : inputLayers) { - auto layerInfo = LayerInfo(inputLayer); - if (layerInfo.isInput() && 0 == inputsDesc->bytes_allocated_for_input[inputLayer->name]) { - graphCompiler.connectOutput(inputLayer, &inputsDesc->getPtrInputsGlobal(inputLayer->name).front(), 0); - } - } - // TODO: graph might be static - should we support that - if (graphCompiler.dnnComponents.components.empty()) { - THROW_GNA_EXCEPTION << "No GNA primitives created based on topology. This might indicate trivial topology"; - } - - /// setting-up output layers information - outputsDesc.resize(outputsDataMap.size()); - - auto initOutput = [this] - (int idx, const intel_dnn_component_t & component, CNNLayerPtr layer) { - // auto idx = std::distance(outputsDataMap.begin(), outputPort); - auto & desc = outputsDesc[idx]; - auto quantized = InferenceEngine::getInjectedData(layer); - - desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num); - desc.orientation = component.orientation_out; - desc.num_bytes_per_element = component.num_bytes_per_output; - desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; - // TODO: this need to be fixed - desc.num_elements = component.num_rows_out; - - // binding ptr for first infer request - then others will be setup during relocation - gnamem->bind_ptr(&desc.ptrs.front(), &component.ptr_outputs); - }; - - int portId = 0; - for (auto && outPort : outputsDataMap) { - // gets output layer pointer in original topology not in cloned - auto outLayer = outPort.second->getCreatorLayer().lock(); - - // Memory layers are not dnnComponents hence we need to make switch with identity layer - if (outLayer->type == "Memory") { - // traverse memory connection to find corresponding output_memory - for (auto && memConnection : graphCompiler.memory_connection) { - if (memConnection.second.getInput()->name == outLayer->name) { - // if connection is found, replace memory input layer with memory output layer - outLayer = memConnection.second.getOutput(); - break; - } - } - } - - // searching for outData represented in GNA blob - // using ufs - upper first search - gnalog() << "[UFS] searching for : "<< outPort.first << " representation in GNA\n"; - bool stopSearching = false; - - CNNNetDFS(outLayer, [this, &outPort, portId, &stopSearching, &initOutput](CNNLayerPtr layer) { - auto irLayerAvatar = std::find_if( - graphCompiler.dnnComponents.components.begin(), - graphCompiler.dnnComponents.components.end(), - [&layer](std::pair & value) { - return value.first == layer->name; - }); - - gnalog() << "[UFS] from : "<< outPort.first <<" reached: " << layer->name << "\n"; - - // probing gna_primitives - if (irLayerAvatar != graphCompiler.dnnComponents.components.end()) { - initOutput(portId, irLayerAvatar->second, layer); - stopSearching = true; - } - - // probing concatInfo - if (!stopSearching && LayerInfo(layer).isConcat()) { - auto concatConnection = 
graphCompiler.concat_connection.find(layer->name); - if (concatConnection != graphCompiler.concat_connection.end()) { - //initOutput(portId, irLayerAvatar->second, layer); - - auto &desc = outputsDesc[portId]; - auto quantized = InferenceEngine::getInjectedData(layer); - - desc.ptrs.resize(gnaFlags->gna_lib_async_threads_num); - // TODO: what is orientation for concat - desc.orientation = kDnnInterleavedOrientation; - desc.num_bytes_per_element = layer->outData.front()->getPrecision().size(); - desc.scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; - desc.num_elements = concatConnection->second.reserved_size / desc.num_bytes_per_element; - - // binding ptr for first infer request - then others will be setup during relocation - gnamem->bind_ptr(&desc.ptrs.front(), &concatConnection->second.gna_ptr); - stopSearching = true; - } - } - }, true, [&stopSearching](InferenceEngine::CNNLayer* from) { - return make_upstream_order(!stopSearching ? from : nullptr); - }); - if (!stopSearching) { - THROW_GNA_EXCEPTION << "unsupported topology: cannot locate " << outPort.first - << " after compiling GNA graph"; - } - portId++; + // TODO: solely gna_example convolution hack + num_feature_maps = 1; + for (auto layer = sortedNoMem.begin(); layer != sortedNoMem.end(); ++layer) { + CreateLayerPrimitive(*layer); } + gnamem->bind_ptr(&ptr_outputs_global.front(), &dnnComponentsForLayer.back().second.ptr_outputs); - // TODO: how active list will work in multioutput case // make room for active list - gnamem->reserve_ptr(nullptr, - ALIGN64(outputsDesc.front().num_bytes_per_element * outputsDesc.front().num_elements), 64); + auto &last_component = dnnComponentsForLayer.back().second; + gnamem->reserve_ptr(nullptr, ALIGN64(last_component.num_bytes_per_output * last_component.num_rows_out)); void *pParallelExecutionData = nullptr; - // reserving more bytes for intermediate data in parallel case - TODO: this works incorrectly in compact mode at lest + // reserving more bytes for intermidiate data in parallel case - TODO: this works incorrectly in compact mode at lest rwSegmentSize = gnamem->getRWBytes(); - if (gnaFlags->gna_lib_async_threads_num > 1) { - gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gnaFlags->gna_lib_async_threads_num - 1), 64); + if (gna_lib_async_threads_num > 1) { + gnamem->reserve_ptr(&pParallelExecutionData, gnamem->getRWBytes() * (gna_lib_async_threads_num - 1)); } gnamem->commit(); - dnn->Init(gnamem->getBasePtr(), + dnn.Init(gnamem->getBasePtr(), gnamem->getTotalBytes(), - gnaFlags->sw_fp32 ? kDnnFloat : kDnnInt, + networkPrecision.is_float() ? 
kDnnFloat : kDnnInt, 1); - // TODO: this copy is unneeded; in fact, we can directly create gna structs from list - for (auto &element : graphCompiler.dnnComponents.components) { - dnn->component.push_back(element.second); + // TODO: this copy unneed infact we can directly create gna structs from list + for (auto &element : dnnComponentsForLayer) { + dnn.component.push_back(element.second); } // in fp32 mode last PWL cannot be computed without that - dnn->InitActiveList(NULL); + dnn.InitActiveList(NULL); -#if GNA_LIB_VER == 2 - gnaModels.push_back(std::make_tuple(make_shared>())); -#else - nnets.emplace_back(make_shared>(), -1, InferenceEngine::BlobMap()); -#endif - if (!gnaFlags->sw_fp32) { + nnets.push_back(std::make_tuple(make_shared>(0), -1, InferenceEngine::BlobMap())); + + if (!networkPrecision.is_float()) { // number of layer gets calculated inside that InitGNAStruct function -#if GNA_LIB_VER == 2 - dnn->InitGNAStruct(&std::get<0>(gnaModels.front())->obj); -#else - dnn->InitGNAStruct(&std::get<0>(nnets.front())->obj); -#endif + dnn.InitGNAStruct(&std::get<0>(nnets.front())->obj); } - // creating same gna RW segment for parallel infer requests - for (int i = 1; i != gnaFlags->gna_lib_async_threads_num; i++) { -#if GNA_LIB_VER == 2 - gnaModels.push_back(std::make_tuple(make_shared>())); + // creating same gna RW segment for paralle infer requests + for (int i = 1; i != gna_lib_async_threads_num; i++) { + nnets.push_back(std::make_tuple(make_shared>(0), -1, InferenceEngine::BlobMap())); + // this can be improved by just copy all structures, but we are too lazy - dnn->InitGNAStruct(&std::get<0>(gnaModels.back())->obj); -#else - nnets.emplace_back(make_shared>(), -1, InferenceEngine::BlobMap()); - dnn->InitGNAStruct(&std::get<0>(nnets.back())->obj); -#endif + dnn.InitGNAStruct(&std::get<0>(nnets.back())->obj); + // relocate rw pointers to new offset auto basePtr = reinterpret_cast(pParallelExecutionData) + rwSegmentSize * (i - 1); @@ -652,167 +1656,48 @@ void GNAPlugin::LoadNetwork(ICNNNetwork &network) { } }; - for (auto &&input : inputsDesc->ptr_inputs_global_storage) { - relocate(input[i], input[0]); - } - - // relocating all output pointers - for (int j = 0; j < outputsDesc.size(); ++j) { - relocate(outputsDesc[j].ptrs[i], outputsDesc[j].ptrs[0]); - } - -#if GNA_LIB_VER == 2 - for (int j = 0; j != std::get<0>(gnaModels.front())->obj.NumberOfOperations; j++) { - auto & gnaOperation = std::get<0>(gnaModels[i])->obj.Operations[j]; - relocate(const_cast(gnaOperation.Operands[0])->Data, gnaOperation.Operands[0]->Data); - relocate(const_cast(gnaOperation.Operands[1])->Data, gnaOperation.Operands[1]->Data); -#else + relocate(ptr_inputs_global[i], ptr_inputs_global[0]); + relocate(ptr_outputs_global[i], ptr_outputs_global[0]); for (int j = 0; j != std::get<0>(nnets.front())->obj.nLayers; j++) { auto & layer = std::get<0>(nnets[i])->obj.pLayers[j]; + relocate(layer.pInputs, layer.pInputs); relocate(layer.pOutputs, layer.pOutputs); relocate(layer.pOutputsIntermediate, layer.pOutputsIntermediate); -#endif - } - } - - // calculating input orientation without memory layers, since their orientation not changed during infer right now - std::unordered_map skippedLayers; - - bool withConv = false; - for (auto &layer : sortedNet) { - auto layerInfo = LayerInfo(layer); - if (layerInfo.isConvolution()) { - withConv = true; - break; } } - if (withConv) { - for (auto &layer : sortedNet) { - for (int i = 0; CNNNetHasPrevLayer(layer.get(), i); i++) { - auto prevLayer = CNNNetPrevLayer(layer.get(), i); - if 
(!skippedLayers.count(prevLayer->name)) { - if (CNNNetHasPrevLayer(prevLayer.get())) { - continue; - } - - // we are in the one of input layers - if (LayerInfo(prevLayer).isMemory()) { - continue; - } - } - - auto dnnLayer = graphCompiler.dnnComponents.findComponent(layer); - string inputName = prevLayer->name; - if (skippedLayers.count(prevLayer->name)) { - inputName = skippedLayers[prevLayer->name]; - } - - // non functional layer - skipped by gna - if (nullptr == dnnLayer) { - // storing input name for skipped layer - skippedLayers[layer->name] = inputName; - continue; - } + orientation_in = dnn.component[0].orientation_in; + orientation_out = dnn.component[dnn.num_components()-1].orientation_out; + num_bytes_per_output = dnn.component[dnn.num_components()-1].num_bytes_per_output; - // input orientation might be already initialized, thus verify that it matches - if (!inputsDesc->orientation_in.count(inputName)) { - inputsDesc->orientation_in[inputName] = dnnLayer->orientation_in; - } else { - if (inputsDesc->orientation_in[inputName] != dnnLayer->orientation_in) { - THROW_GNA_EXCEPTION << "orientation for input layer: " << inputName << "cannot be calculated"; - } - } - } - } - } else { - for (auto& inputLayer : inputLayers) { - inputsDesc->orientation_in[inputLayer->name] = kDnnInterleavedOrientation; - } - } + auto quantized = InferenceEngine::getInjectedData(sortedNoMem.back()); + output_scale_factor = quantized != nullptr ? quantized->_dst_quant.scale : 1.0f; - num_rotate_rows = dnn->num_rotate_rows; - num_rotate_columns = dnn->num_rotate_columns; + num_rotate_rows = dnn.num_rotate_rows; + num_rotate_columns = dnn.num_rotate_columns; DumpXNNToFile(); #ifdef PLOT - dnn->WriteGraphWizModel("gna-blob.dot"); -#endif -#if GNA_LIB_VER == 2 - createRequestConfigsForGnaModels(); -#endif -} - -#if GNA_LIB_VER == 2 -void GNAPlugin::createRequestConfigsForGnaModels() { - if (!gnadevice) { - gnaRequestConfigToRequestIdMap.push_back(std::make_tuple(FAKE_REQUEST_CONFIG_ID, -1, InferenceEngine::BlobMap())); - return; - } - for (auto& model : gnaModels) { - const auto& gnaNnet = std::get<0>(model).get()->obj; - const auto modelId = gnadevice->createModel(gnaNnet); - const auto requestConfigId = gnadevice->createRequestConfig(modelId); - gnaRequestConfigToRequestIdMap.push_back(std::make_tuple(requestConfigId, -1, InferenceEngine::BlobMap())); - } -} - + dnn.WriteGraphWizModel("graph.dot"); + // ExportGnaNetworkAndrzej("layers/loaded_from_ir", &nnet->obj); #endif - -int GNAPlugin::GetDeviceVersionFromString(const std::string deviceString) { - constexpr uint32_t embeddedSuffix = 0xE; - if (deviceString.empty()) - return 0x100 + embeddedSuffix; - if (deviceString.size() == 4 && deviceString.substr(0, 3) == "GNA") { - int version = deviceString[3] - '0'; - if (version > 0) { - version <<= 8; - version += embeddedSuffix; - return version; - } - } - THROW_GNA_EXCEPTION << "Wrong GNA generation for embedded model dump: " << deviceString; } - void GNAPlugin::DumpXNNToFile() const { // TODO: output precision as well as pointer might be incorrect, LSTM for sure // gna looks automatically set layer 0 as output and adjust it's pointer / precision/ size respectively - if (config.dumpXNNPath.empty()) { - return; - } - - const auto versionInt = GetDeviceVersionFromString(config.dumpXNNGeneration); - - if (!gnadevice) { - THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network"; - } - std::ofstream dumpStream(config.dumpXNNPath, std::ios::out | std::ios::binary); -#if GNA_LIB_VER == 1 - if (versionInt != 
0x10E) - THROW_GNA_EXCEPTION << "Wrong GNA version for embedded model dump: " << config.dumpXNNGeneration; - auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices); - dump.header.rw_region_size = gnamem->getRWBytes(); - dump.header.input_scaling_factor = inputsDesc->inputScaleFactors.front(); - dump.header.output_scaling_factor = outputsDesc.front().scale_factor; - dumpStream.write(reinterpret_cast(&dump.header), sizeof(intel_gna_model_header)); - dumpStream.write(reinterpret_cast(dump.model.get()), dump.header.model_size); -#else - auto const modelId = gnadevice->createModel(std::get<0>(gnaModels.front())->obj); - if (versionInt == Gna2DeviceVersionEmbedded1_0) { - auto dump = gnadevice->dumpXnn(modelId); - dump.header.RwRegionSize = gnamem->getRWBytes(); - dump.header.InputScalingFactor = inputsDesc->inputScaleFactors.front(); - dump.header.OutputScalingFactor = outputsDesc.front().scale_factor; - dumpStream.write(reinterpret_cast(&dump.header), sizeof(Gna2ModelSueCreekHeader)); - dumpStream.write(reinterpret_cast(dump.model.get()), dump.header.ModelSize); - } else { - static_assert(sizeof(versionInt) >= sizeof(Gna2DeviceVersion), ""); - gnadevice->dumpXnnForDeviceVersion(modelId, dumpStream, - *reinterpret_cast(&versionInt)); + if (!dumpXNNPath.empty()) { + if (!gnadevice) { + THROW_GNA_EXCEPTION << "Cannot generate XNNDump for float network"; + } + auto dump = gnadevice->dumpXnn(&std::get<0>(nnets.front())->obj, ptr_active_indices, num_active_indices); + dump.header.rw_region_size = gnamem->getRWBytes(); + dump.header.input_scaling_factor = input_scale_factor; + dump.header.output_scaling_factor = output_scale_factor; + std::ofstream dumpStream(dumpXNNPath, std::ios::out | std::ios::binary); + dumpStream.write(reinterpret_cast(&dump.header), sizeof(intel_gna_model_header)); + dumpStream.write(reinterpret_cast(dump.model.get()), dump.header.model_size); } - gnadevice->releseModel(modelId); -#endif } void RotateFeatures(uint8_t *ptr_feat, @@ -833,8 +1718,7 @@ void RotateFeatures(uint8_t *ptr_feat, element_size); } } - ie_memcpy(ptr_in, num_feature_vector_elements * element_size, - &temp.front(), num_feature_vector_elements * element_size); + memcpy(ptr_in, &temp.front(), num_feature_vector_elements * element_size); } } else { THROW_GNA_EXCEPTION << "Rotate dimensions (" << num_rotate_rows << "," << num_rotate_columns @@ -842,182 +1726,128 @@ void RotateFeatures(uint8_t *ptr_feat, } } -uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &inputs, InferenceEngine::BlobMap &result) { -#if GNA_LIB_VER == 2 - auto& nnets = gnaRequestConfigToRequestIdMap; -#endif +uint32_t GNAPlugin::QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) { + return QueueInference(*input.begin()->second.get(), result); + + /*if (!syncPoints.empty()) { + syncPoints.back().second = result; + }*/ +} + +uint32_t GNAPlugin::QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result) { + auto inputLayout = input.layout(); + if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) { + THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " << input.layout(); + } + if (inputLayout == NCHW) { + inputLayout = NC; + } + auto is2D = input.layout() == Layout::NC || input.layout() == Layout ::CN; + auto freeNnet = std::find_if(std::begin(nnets), std::end(nnets), [](decltype(nnets.front()) & item) { return std::get<1>(item) == -1; }); if (freeNnet 
== nnets.end()) { - if (!graphCompiler.memory_connection.empty()) { - Wait(0); - freeNnet = nnets.begin(); - } else { - THROW_IE_EXCEPTION << as_status << REQUEST_BUSY - << "GNA executable network has max of " - << static_cast(gnaFlags->gna_lib_async_threads_num) - << " parallel infer requests, please sync one of already running"; - } + THROW_IE_EXCEPTION << as_status << REQUEST_BUSY + << "GNA executable network has max of " << static_cast(gna_lib_async_threads_num) + << " parallel infer requests, please sync one of already running"; } + auto nnet = std::get<0>(*freeNnet).get(); auto idx = static_cast(std::distance(std::begin(nnets), freeNnet)); - int inputNum = 0; - for (auto &input : inputs) { - auto inputLayout = input.second->getTensorDesc().getLayout(); - if (inputLayout != Layout::NC && inputLayout != Layout::CN && inputLayout != NCHW) { - THROW_GNA_EXCEPTION << "Expected input blob to have Layout::NC or Layout::CN, but was: " - << input.second->getTensorDesc().getLayout(); - } - if (inputLayout == NCHW) { - inputLayout = NC; - } - auto is2D = input.second->getTensorDesc().getLayout() == Layout::NC || input.second->getTensorDesc().getLayout() == Layout::CN; - - if (!inputsDesc->ptr_inputs_global_id.count(input.first)) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input pointer for " << input.first << " not set"; - } - - if (inputsDesc->getPtrInputsGlobal(input.first)[idx] == nullptr) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input pointer for (" << input.first << " at inferRequest #" - << idx << " not set"; - } + if (ptr_inputs_global[idx] == nullptr) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : global input pointer not set"; + } - if (inputsDesc->getOrientation(input.first) == kDnnUnknownOrientation) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : input orientation for " << input.first << " not set"; - } + if (orientation_in == kDnnUnknownOrientation) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : input orientation not set"; + } - for (auto& outputDesc : outputsDesc) { - if (outputDesc.orientation == kDnnUnknownOrientation) { - // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance - THROW_GNA_EXCEPTION << "network not loaded : output orientation not set"; - } - } + if (orientation_out == kDnnUnknownOrientation) { + // should not happen in user code however might happen if there any non executable network based integration of GNAPlugin instance + THROW_GNA_EXCEPTION << "network not loaded : output orientation not set"; + } - auto dims = input.second->getTensorDesc().getDims(); - - ImportFrames(inputsDesc->getPtrInputsGlobal(input.first)[idx], - input.second->cbuffer().as(), - input.second->getTensorDesc().getPrecision(), - gnaFlags->sw_fp32 ? 
1.0f : inputsDesc->getScaleFactor(inputNum), - inputsDesc->getOrientation(input.first), - dims[0], - is2D ? dims[dims.size() - 2] : dims[0], - is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 2] * dims[dims.size() - 3], - is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 2] * dims[dims.size() - 3]); - - bool isOneChannel = input.second->getTensorDesc().getDims()[1] == 1; - if (((inputLayout == Layout::NC || inputLayout == Layout::NCHW) - != (inputsDesc->getOrientation(input.first) == kDnnInterleavedOrientation)) - && !isOneChannel) { - RotateFeatures(reinterpret_cast(inputsDesc->getPtrInputsGlobal(input.first)[idx]), - gnadevice ? 2 : 4, - // TODO: only works for cnn4a and google command so far - dims[0], - is2D ? dims[dims.size() - 1] : dims[dims.size() - 1] * dims[dims.size() - 3], // num_feature_vectors looks batch should be there - num_rotate_rows, - num_rotate_columns); - } - ++inputNum; + ImportFrames(ptr_inputs_global[idx], + input.cbuffer().as(), + input.precision(), + orientation_in, + input.dims()[input.dims().size() - 1], + is2D ? input.dims()[1] : input.dims()[input.dims().size() - 1], + is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2], + is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2]); + + if ((inputLayout == Layout::NC || inputLayout == Layout::NCHW) != (orientation_in == kDnnInterleavedOrientation)) { + RotateFeatures(reinterpret_cast(ptr_inputs_global[idx]), + gnadevice ? 2 : 4, + // TODO: only works for cnn4a and google command so far + input.dims()[input.dims().size() - 1], + is2D ? input.dims()[0] :input.dims()[0]*input.dims()[2], // num_feature_vectors looks batch should be there + num_rotate_rows, + num_rotate_columns); } if (!gnadevice) { - dnn->Propagate(); - if (freeNnet != nnets.end()) { - std::get<1>(*freeNnet) = 1; - } + dnn.Propagate(); + std::get<1>(*freeNnet) = 1; } else { -#if GNA_LIB_VER == 1 - auto nnet = std::get<0>(*freeNnet).get(); std::get<1>(*freeNnet) = gnadevice->propagate(&nnet->obj, ptr_active_indices, num_active_indices); -#else - const auto reqConfigId = std::get<0>(*freeNnet); - if (ptr_active_indices != nullptr && num_active_indices > 0 && activeLayerIndex != 0xffffffff) - gnadevice->setUpActiveList(reqConfigId, activeLayerIndex, ptr_active_indices, num_active_indices); - std::get<1>(*freeNnet) = gnadevice->propagate(reqConfigId); -#endif - } - -#ifdef PLOT - dnn->BeginNewWrite(dnn_dump_write_index); - if (dnn->num_components() != 0) { - dnn->WriteDnnText("Net_.txt", kDnnFloat); - } - dnn_dump_write_index++; -#endif - if (freeNnet != nnets.end()) { - // TODO: GNA2: Substitute properly when using GNA 2.0 Library setting and CPU - std::get<2>(*freeNnet) = result; } + std::get<2>(*freeNnet) = result; return idx; } -void GNAPlugin::Wait(uint32_t request_idx) { -#if GNA_LIB_VER == 2 - auto& nnets = gnaRequestConfigToRequestIdMap; -#endif - if (nnets.size() <= request_idx) return; // TODO: GNA2: check whether necessary +void GNAPlugin::Wait(uint32_t idx) { // already synced TODO: might be copy required ??? 
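(Editor's note, not part of the patch: the RotateFeatures call used in QueueInference above converts between the row-major, deinterleaved frame layout of the Inference Engine blob and the interleaved orientation the GNA primitives expect. A minimal sketch of that per-frame transpose is shown below; rotate_frame is an illustrative name, not the plugin's implementation.)

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

// Transpose one frame of features from row-major (rows x cols) to
// column-major (cols x rows) through a temporary copy, element_size bytes
// per element - the same effect RotateFeatures has on the input buffer.
static void rotate_frame(uint8_t *ptr, uint32_t rows, uint32_t cols, uint32_t element_size) {
    std::vector<uint8_t> temp(static_cast<size_t>(rows) * cols * element_size);
    for (uint32_t r = 0; r < rows; ++r) {
        for (uint32_t c = 0; c < cols; ++c) {
            std::memcpy(&temp[(static_cast<size_t>(c) * rows + r) * element_size],
                        ptr + (static_cast<size_t>(r) * cols + c) * element_size,
                        element_size);
        }
    }
    std::memcpy(ptr, temp.data(), temp.size());
}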
- if (std::get<1>(nnets[request_idx]) == -1) return; + if (std::get<1>(nnets[idx]) == -1) return; if (gnadevice) { - gnadevice->wait(std::get<1>(nnets[request_idx])); + gnadevice->wait(std::get<1>(nnets[idx])); } - std::get<1>(nnets[request_idx]) = -1; - auto &request = std::get<2>(nnets[request_idx]); + std::get<1>(nnets[idx]) = -1; + auto & output = *std::get<2>(nnets[idx]).begin()->second; #ifdef PLOT - if (dnn->num_components() != 0) { - dnn->WriteInputAndOutputText(); + dnn.BeginNewWrite(); + if (dnn.num_components() != 0) { + dnn.WriteDnnText("Net_.txt", kDnnFloat); + dnn.WriteInputAndOutputText(); } -#if GNA_LIB_VER == 1 - dnn->WriteInputAndOutputTextGNA(&std::get<0>(nnets[request_idx])->obj); -#else - dnn->WriteInputAndOutputTextGNA(std::get<0>(gnaModels[request_idx])->obj); + dnn.WriteInputAndOutputTextGNA(&std::get<0>(nnets.front())->obj); #endif -#endif - int output_idx = 0; - for (auto && outputBlobIt : request) { - auto & outputBlob = outputBlobIt.second; - auto & outputDesc = outputsDesc[output_idx]; - if (outputBlob->getTensorDesc().getLayout() == Layout::NC) { - // TODO: rotate can be incorporated with exporting - used only in unit tests so far - // TODO: restore: + + if (output.layout() == Layout::NC) { + // TODO: rotate can be incorporated with exporting - used only in unit tests so far + // TODO: restore: // if (orientation_out != kDnnInterleavedOrientation) { -// if (inputs.size() != 1) { -// THROW_GNA_EXCEPTION << "Invalid number of inputs for for deinterleave " << inputs.size() -// << ", only 1 supported"; -// } -// auto dims = inputs.begin()->second->dims(); // RotateFeatures(reinterpret_cast(ptr_outputs_global), // gnadevice ? 2 : 4, -// dims[dims.size() - 1], -// dims[0], // num_feature_vectors looks batch should be there -// dims[0], -// dims[dims.size() - 1]); +// input.dims()[input.dims().size() - 1], +// input.dims()[0], // num_feature_vectors looks batch should be there +// input.dims()[0], +// input.dims()[input.dims().size() - 1]); // } - auto& exportOutputDims = outputBlob->getTensorDesc().getDims(); - ExportScores(outputBlob->buffer(), - outputDesc.ptrs[request_idx], - outputDesc.orientation, - exportOutputDims[0], - exportOutputDims[exportOutputDims.size() - 2], - exportOutputDims[exportOutputDims.size() - 1], - exportOutputDims[exportOutputDims.size() - 1], - exportOutputDims[exportOutputDims.size() - 1], - outputDesc.num_bytes_per_element, - sizeof(float)); - } else if (outputBlob->getTensorDesc().getLayout() != Layout::CN) { - THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. But was " - << outputBlob->getTensorDesc().getLayout(); - } - if (gnadevice) { + ExportScores(output.buffer(), + ptr_outputs_global[idx], + orientation_out, + output.dims()[output.dims().size() - 1], + output.dims()[1], + output.dims()[0], + output.dims()[0], + output.dims()[0], + // TODO: create better getter consider multiple outputs case + gnadevice ? std::get<0>(nnets[idx])->obj.pLayers[std::get<0>(nnets[idx])->obj.nLayers - 1].nBytesPerOutput : sizeof(float), + sizeof(float)); + } else if (output.layout() != Layout::CN) { + THROW_GNA_EXCEPTION << "Expected output blob to have Layout::NC or Layout::CN. 
But was " << output.layout(); + } + + if (gnadevice) { #ifdef PLOT FILE *f = nullptr; static int num_infers = 0; @@ -1026,93 +1856,79 @@ void GNAPlugin::Wait(uint32_t request_idx) { } num_infers++; if (f) { - auto dims = outputBlob->getTensorDesc().getDims(); - for (int i = 0; i < dims[dims.size() - 2]; i++) { - for (int j = 0; j < dims[dims.size() - 1]; j++) { - fprintf(f, "%d ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + for (int i = 0; i < output.dims()[1]; i++) { + for (int j = 0; j < output.dims()[0]; j++) { + fprintf(f, "%d ", output.cbuffer().as()[output.dims()[0] * i + j]); } fprintf(f, "\n"); - } - fprintf(f, "\n\n"); } + fprintf(f, "\n\n"); + } #endif - ConvertToFloat(outputBlob->buffer(), - outputBlob->buffer(), - outputBlob->getTensorDesc().getDims()[outputBlob->getTensorDesc().getDims().size() - 1], - outputBlob->getTensorDesc().getDims()[outputBlob->getTensorDesc().getDims().size() - 2], - outputDesc.scale_factor); + ConvertToFloat(output.buffer(), + output.buffer(), + output.dims()[0], + output.dims()[1], + output_scale_factor); #ifdef PLOT if (f) { - auto dims = outputBlob->getTensorDesc().getDims(); - for (int i = 0; i < dims[dims.size() - 2]; i++) { - for (int j = 0; j < dims[dims.size() - 1]; j++) { - fprintf(f, "%.2f ", outputBlob->cbuffer().as()[dims[dims.size() - 1] * i + j]); + for (int i = 0; i < output.dims()[1]; i++) { + for (int j = 0; j < output.dims()[0]; j++) { + fprintf(f, "%.2f ", output.cbuffer().as()[output.dims()[0] * i + j]); } fprintf(f, "\n"); - } - fclose(f); } -#endif + fclose(f); } - output_idx++; +#endif } } -void GNAPlugin::Reset() { - graphCompiler.Reset(); -} void GNAPlugin::Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &output) { - BlobMap bmInput; - BlobMap bmOutput; - if (inputsDataMap.size() != 1) { - THROW_GNA_EXCEPTION << "cannot infer using Infer(Blob&, Blob&)"<< "model accepts " << inputsDataMap.size() << " inputs"; - } + BlobMap result; + result["output"] = std::shared_ptr(&output, [](Blob*){}); + Wait(QueueInference(input, result)); +} - IE_ASSERT(!inputsDataMap.empty()); - bmInput[inputsDataMap.begin()->first] = std::shared_ptr(const_cast(&input), [](Blob*){}); - IE_ASSERT(!outputsDataMap.empty()); - bmOutput[outputsDataMap.begin()->first] = std::shared_ptr(&output, [](Blob*){}); - Infer(bmInput, bmOutput); +void GNAPlugin::Reset() { + for (auto && memLayer : memory_connection) { + std::memset(memLayer.second.gna_ptr, 0, memLayer.second.reserved_size); + } + for (auto && concatLayer : concat_connection) { + std::memset(concatLayer.second.gna_ptr, 0, concatLayer.second.reserved_size); + } } -void GNAPlugin::Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) { - Wait(QueueInference(input, result)); +void GNAPlugin::Infer(const BlobMap &inputs, BlobMap &result) { + auto &input = *inputs.begin()->second.get(); + auto &output = *result.begin()->second.get(); + Infer(input, output); } -Blob::Ptr GNAPlugin::GetOutputBlob(const std::string& name, InferenceEngine::Precision precision) { +Blob::Ptr GNAPlugin::GetOutputBlob(InferenceEngine::Precision precision) { // need to have intermediate blob for interleave conversion InferenceEngine::Blob::Ptr outputBlob; - auto outputDims = outputsDataMap[name]->getTensorDesc().getDims(); - outputBlob = make_blob_with_precision(TensorDesc(precision, outputDims, outputDims.size() == 2 ? 
NC : NCHW)); + outputBlob = make_blob_with_precision(precision, NC, outputDims); outputBlob->allocate(); return outputBlob; } -Blob::Ptr GNAPlugin::GetInputBlob(const std::string& name, InferenceEngine::Precision precision) { +Blob::Ptr GNAPlugin::GetInputBlob(InferenceEngine::Precision precision) { InferenceEngine::Blob::Ptr inputBlob; // need to have intermediate blob for interleave conversion // TODO: NCHW format support is experimental = c++ MO did insert reshape, while TF mo - not - auto inputDims = inputsDataMap[name]->getTensorDesc().getDims(); - inputBlob = make_blob_with_precision(TensorDesc(precision, inputDims, inputDims.size() == 2 ? NC : NCHW)); + inputBlob = make_blob_with_precision(precision, inputDims.size() == 2 ? NC : NCHW, inputDims); inputBlob->allocate(); return inputBlob; } std::vector GNAPlugin::QueryState() { - if (graphCompiler.memory_connection.empty()) { + if (memory_connection.empty()) { return {}; } - return {std::make_shared(shared_from_this())}; -} - -std::string GNAPlugin::GetName() const noexcept { - return _pluginName; -} - -void GNAPlugin::SetName(const std::string & pluginName) noexcept { - _pluginName = pluginName; + return {std::make_shared(shared_from_this())}; } InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::string &modelFileName) { @@ -1124,128 +1940,96 @@ InferenceEngine::IExecutableNetwork::Ptr GNAPlugin::ImportNetwork(const std::str auto header = GNAModelSerial::ReadHeader(inputStream); - InitGNADevice(); + gnadevice.reset(new GNADeviceHelper(gna_proc_type, + gna_lib_async_threads_num, + gna_openmp_multithreading)); + gnamem.reset(new gna_memory_type(make_polymorph(*gnadevice.get()), PAGE_SIZE_BYTES)); - graphCompiler.setGNAMemoryPtr(gnamem); void *basePtr = nullptr; gnamem->reserve_ptr(&basePtr, header.gnaMemSize); gnamem->commit(); -#if GNA_LIB_VER == 2 - gnaModels.push_back(std::make_tuple(make_shared>(header.layersCount))); -#else - nnets.emplace_back(make_shared>(header.layersCount), -1, InferenceEngine::BlobMap()); + + nnets.push_back(std::make_tuple(make_shared>(header.layersCount), -1, InferenceEngine::BlobMap())); std::get<0>(nnets.back())->obj.nGroup = header.nGroup; -#endif GNAModelSerial::MemoryType mt; -#if GNA_LIB_VER == 2 - auto serial = GNAModelSerial(&std::get<0>(gnaModels.back())->obj, mt); -#else auto serial = GNAModelSerial(&std::get<0>(nnets.back())->obj, mt); -#endif serial.Import(basePtr, header.gnaMemSize, inputStream); - inputsDesc->getPtrInputsGlobal("input").push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.input.descriptor_offset)); - // TODO: import of multioutput network not supported - outputsDesc.resize(1); - auto &outputDesc = outputsDesc.front(); - outputDesc.ptrs.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.output.descriptor_offset)); + ptr_inputs_global.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.input.descriptor_offset)); + ptr_outputs_global.push_back(reinterpret_cast(reinterpret_cast (basePtr) + header.output.descriptor_offset)); -#if GNA_LIB_VER == 2 - auto getOrientation = [](Gna2Operation & gnaOperation) { - return gnaOperation.Type == Gna2OperationTypeConvolution ? - kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; - }; -#else auto getOrientation = [](intel_nnet_layer_t & layer) { return layer.nLayerKind == INTEL_CONVOLUTIONAL ? 
kDnnNonInterleavedOrientation : kDnnInterleavedOrientation; }; -#endif -#if GNA_LIB_VER == 2 - inputsDesc->orientation_in["input"] = getOrientation(std::get<0>(gnaModels.back())->obj.Operations[0]); - outputDesc.orientation = getOrientation(std::get<0>(gnaModels.back())->obj.Operations[std::get<0>(gnaModels.back())->obj.NumberOfOperations - 1]); -#else - inputsDesc->orientation_in["input"] = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]); - outputDesc.orientation = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers - 1]); -#endif - outputDesc.num_bytes_per_element = header.output.element_size; + orientation_in = getOrientation(std::get<0>(nnets.back())->obj.pLayers[0]); + orientation_out = getOrientation(std::get<0>(nnets.back())->obj.pLayers[std::get<0>(nnets.back())->obj.nLayers-1]); + + num_bytes_per_output = header.output.element_size; - auto outputDims = SizeVector({header.nGroup, header.output.elements_count / header.nGroup}); - auto inputDims = SizeVector({header.nGroup, header.input.elements_count / header.nGroup}); + + outputDims = SizeVector({header.output.elements_count / header.nGroup, header.nGroup}); + inputDims = SizeVector({header.input.elements_count / header.nGroup, header.nGroup}); inputsDataMap["input"] = std::make_shared(); inputsDataMap["input"]->setInputData(make_shared("input", - TensorDesc( - Precision::FP32, - inputDims, - Layout::NC))); + inputDims, + Precision::FP32, + Layout::NC)); outputsDataMap["output"] = make_shared("output", - TensorDesc( - Precision::FP32, - outputDims, - Layout::NC)); + outputDims, + Precision::FP32, + Layout::NC); - outputDesc.scale_factor = header.output.scaleFactor; - inputsDesc->inputScaleFactors.push_back(header.input.scaleFactor); + output_scale_factor = header.output.scaleFactor; + input_scale_factor = header.input.scaleFactor; num_rotate_rows = header.nRotateRows; num_rotate_columns = header.nRotateColumns; for (auto && memory : mt) { - GNAMemoryLayer memoryLayer(nullptr, nullptr, gnaFlags->sw_fp32 ? 4 : 2); + GNAMemoryLayer memoryLayer(nullptr, nullptr); memoryLayer.gna_ptr = memory.first; memoryLayer.reserved_size = memory.second; - graphCompiler.memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer)); + memory_connection.emplace_back(make_pair(std::string("noname"), memoryLayer)); } DumpXNNToFile(); #ifdef PLOT - dnn->WriteGraphWizModel("gna-blob-imported.dot"); -#endif -#if GNA_LIB_VER == 2 - createRequestConfigsForGnaModels(); + dnn.WriteGraphWizModel("graph.dot"); + // ExportGnaNetworkAndrzej("layers/loaded_from_aot_file", &nnet->obj); #endif + return nullptr; } void GNAPlugin::Export(const std::string &fileName) { - if (inputsDesc->ptr_inputs_global_id.empty() || outputsDesc.empty()) { + if (ptr_inputs_global.empty() || ptr_outputs_global.empty()) { THROW_GNA_EXCEPTION << " network not loaded"; } - if (inputsDesc->ptr_inputs_global_id.size() != 1) { - THROW_GNA_EXCEPTION << " exporting network with multiple inputs not supported"; - } - std::fstream outStream(fileName, ios_base::out | ios_base::binary); // TODO: nnet group parameter looks only used in application - so can we move this line into load network. 
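(Editor's note, not part of the patch: both the import path above and the export path below carry the input and output scale factors in the serialized model header, and Wait() uses the output scale factor in ConvertToFloat to map the int32 GNA scores back to float. A minimal sketch of that dequantization step follows; dequantize_scores is an illustrative name, not the plugin's ConvertToFloat.)

#include <cstddef>
#include <cstdint>

// Map quantized int32 scores back to float by dividing by the scale factor
// that was applied when the network was quantized.
static void dequantize_scores(float *dst, const int32_t *src, size_t count, float output_scale_factor) {
    for (size_t i = 0; i < count; ++i) {
        dst[i] = static_cast<float>(src[i]) / output_scale_factor;
    }
}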
- IE_ASSERT(!inputsDataMap.empty()); - auto inputDims = inputsDataMap.begin()->second->getTensorDesc().getDims(); if (inputDims.size() == 2) { -#if GNA_LIB_VER == 1 - std::get<0>(nnets.front())->obj.nGroup = inputDims[0]; -#endif + std::get<0>(nnets.front())->obj.nGroup = inputDims[1]; } -#if GNA_LIB_VER == 2 - auto serial = GNAModelSerial(&std::get<0>(gnaModels.front())->obj, -#else + auto serial = GNAModelSerial(&std::get<0>(nnets.front())->obj, -#endif - {inputsDesc->inputScaleFactors.front(), - inputsDesc->ptr_inputs_global_storage.front()[0], + {input_scale_factor, + ptr_inputs_global[0], 2, - static_cast(InferenceEngine::details::product(inputsDataMap.begin()->second->getTensorDesc().getDims()))}, - {outputsDesc.front().scale_factor, - outputsDesc.front().ptrs.front(), - outputsDesc.front().num_bytes_per_element, - static_cast(InferenceEngine::details::product(outputsDataMap.begin()->second->getTensorDesc().getDims()))}) - .SetInputRotation(dnn->num_rotate_rows, dnn->num_rotate_columns); - - for (auto && memoryConnection : graphCompiler.memory_connection) { + static_cast(InferenceEngine::details::product(inputsDataMap.begin()->second->getDims()))}, + {output_scale_factor, + ptr_outputs_global[0], + num_bytes_per_output, + static_cast(InferenceEngine::details::product(outputsDataMap.begin()->second->getDims()))}) + .SetInputRotation(dnn.num_rotate_rows, dnn.num_rotate_columns); + + for (auto && memoryConnection : memory_connection) { serial.AddState(memoryConnection.second.gna_ptr, memoryConnection.second.reserved_size); } @@ -1253,46 +2037,238 @@ void GNAPlugin::Export(const std::string &fileName) { } void GNAPlugin::GetPerformanceCounts(std::map &perfMap) { - if (gnaFlags->performance_counting) { + if (performance_counting) { gnadevice->getGnaPerfCounters(perfMap); } } void GNAPlugin::AddExtension(InferenceEngine::IExtensionPtr extension) {} +void GNAPlugin::SetConfig(const std::map &config) {} + +intel_dnn_component_t * GNAPlugin::find_first_unused_input(InferenceEngine::CNNLayerPtr current) { + if (current->insData.empty()) return nullptr; -void GNAPlugin::SetConfig(const std::map &config_map) { - config.UpdateFromMap(config_map); - UpdateFieldsFromConfig(); + auto prev_layer = current->insData.front().lock()->creatorLayer.lock(); + + return findDnnLayer(prev_layer); } +void GNAPlugin::connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr, void *ptr_inputs, size_t num_data_bytes_out) { + gnalog() << "Connecting output " << layer->name << " ...\n"; + // in case of Memory Layer it's input allocated in meminput layer + if (layer->outData.size() == 1) { + for (auto &&outLayer : layer->outData.front()->getInputTo()) { + auto& nextLayer = outLayer.second; + auto nextMemoryLayerIt = + std::find_if(begin(memory_connection), end(memory_connection), + [&](MemoryConnection::value_type &comp) { + return comp.second.getOutput()->name + == nextLayer->name; + }); + if (nextMemoryLayerIt != memory_connection.end()) { + auto &nextMemoryLayer = nextMemoryLayerIt->second; + // memory layer not yet initialized + if (nextMemoryLayer.reserved_size == 0) { + gnamem->reserve_ptr(&nextMemoryLayer.gna_ptr, ALIGN64(num_data_bytes_out)); + gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, 0); + + nextMemoryLayer.reserved_offset = 0; + nextMemoryLayer.reserved_size = ALIGN64(num_data_bytes_out); + } else { + IE_ASSERT(nextMemoryLayer.reserved_size == ALIGN64(num_data_bytes_out)); + // same offsets + gnamem->bind_ptr(ptr, &nextMemoryLayer.gna_ptr, nextMemoryLayer.reserved_offset); + } + return; + } + } + + 
// if one of next layers is concat... + for (auto &&outLayer : layer->outData.front()->getInputTo()) { + auto nextLayer = outLayer.second; + if ( LayerInfo(nextLayer).isConcat() ) { + auto& name = layer->name; + // we look for this concat layer pointer in extra concat map + auto concatLayerInfo = concat_connection.find( + nextLayer->name); + + if (concatLayerInfo != concat_connection.end()) { + auto &concatLayerInfoItem = concatLayerInfo->second; + + // find this input in vector sum all outputs in primitive + auto it = std::find_if(concatLayerInfoItem.concatInputLayers.begin(), + concatLayerInfoItem.concatInputLayers.end(), + [&name](GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) { + return item.name == name; + }); + // reserve full size for concat + if (!concatLayerInfoItem.output_allocation_flag) { + // check if this concat is being included by other one + // by going thru each concat and checking inputs + auto included = + std::find_if(concat_connection.begin(), + concat_connection.end(), + [&concatLayerInfo] + (const std::pair &concatItem) -> bool { + auto it = std::find_if(concatItem.second.concatInputLayers.begin(), + concatItem.second.concatInputLayers.end(), + [&concatLayerInfo] + (const GNAPlugin::GNAConcatLayer::ConcatConnectedLayerInfo &item) -> bool { + return item.name == concatLayerInfo->first; + }); + return it != concatItem.second.concatInputLayers.end(); + }); + if (included == concat_connection.end()) { + gnamem->reserve_ptr(&concatLayerInfoItem.gna_ptr, ALIGN64(concatLayerInfoItem.reserved_size)); + } + concatLayerInfo->second.output_allocation_flag = true; + } + gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, it->offset); + } else { + // error + } + return; + } + } + } -void GNAPlugin::UpdateFieldsFromConfig() { - inputsDesc->inputScaleFactors = config.inputScaleFactors; - *gnaFlags = config.gnaFlags; + intel_dnn_component_t * unused_input = nullptr; + if (compact_mode) { + unused_input = find_first_unused_input(layer); + if (unused_input != nullptr) { + gnamem->bind_ptr(ptr, &unused_input->ptr_inputs, 0, ALIGN64(num_data_bytes_out)); + } + } + // cannot reuse suitable input + if (unused_input == nullptr) { + gnamem->reserve_ptr(ptr, ALIGN64(num_data_bytes_out)); + } } -void GNAPlugin::QueryNetwork(const InferenceEngine::ICNNNetwork& network, - const std::map& config, - InferenceEngine::QueryNetworkResult& res) const { - std::unordered_set allLayers; - InferenceEngine::InputsDataMap inputs; +intel_dnn_component_t * GNAPlugin::findDnnLayer(CNNLayerPtr __layer) { + auto component = std::find_if(begin(dnnComponentsForLayer), + end(dnnComponentsForLayer), + [&](DnnComponentsForLayer::value_type &comp) { + return comp.first == __layer->name; + }); + // check for generic prev layer + if (component != dnnComponentsForLayer.end()) { + return &component->second; + } - network.getInputsInfo(inputs); - std::vector sortedLayers = CNNNetSortTopologically(network); + return nullptr; +} - if (inputs.empty()) { - THROW_GNA_EXCEPTION << "Network is empty (GNA)\n"; +GNAPlugin::ConnectionDetails GNAPlugin::connectInput(CNNLayerPtr layer, void *ptr, size_t num_data_bytes_in, size_t offset, int idx) { + // selecting particular input layers + auto prevLayer = CNNNetPrevLayer(layer, idx); + + gnalog() << "Connecting input " << layer->name << " to " << prevLayer->name << " ...\n"; + + // real input not a memory input + if (LayerInfo(prevLayer).isInput()) { + if (0 == bytes_alllocated_for_input) { + gnamem->push_value(&ptr_inputs_global.front(), static_cast(0), 
num_data_bytes_in, 64); + bytes_alllocated_for_input = num_data_bytes_in; + } + if (ALIGN(num_data_bytes_in, 64) > ALIGN(bytes_alllocated_for_input, 64)) { + THROW_IE_EXCEPTION << "Layer: " << layer->name << " Cannot bind pointer to already allocated input, due to size_allocated=" + << bytes_alllocated_for_input << ", and size_requested=" << num_data_bytes_in; + } + gnamem->bind_ptr(ptr, &ptr_inputs_global.front(), offset); + return prevLayer; } - auto const & secondLayers = inputs.begin()->second->getInputData()->getInputTo(); - if (secondLayers.empty()) { - THROW_GNA_EXCEPTION << "Network consists of input layer only (GNA)\n"; + LayerInfo layerInfoObj(prevLayer); + LayerInfo thisLayerInfoObj(layer); + // connecting to split/slice splitiing layers + if (layerInfoObj.isSplit() || layerInfoObj.isSlice()) { + auto& splittingLayer = prevLayer; + auto& splitName = splittingLayer->name; + auto& name = layer->name; + + // we look for this concat layer pointer in extra concat map + auto splitLayerInfo = split_connection.find(splitName); + + if (splitLayerInfo != split_connection.end()) { + auto &splitLayerInfoItem = splitLayerInfo->second; + // find this input in vector sum all outputs in primitive + auto it = std::find_if(splitLayerInfoItem.splitOutputLayers.begin(), + splitLayerInfoItem.splitOutputLayers.end(), + [&name](GNAPlugin::GNASplitLayer::SplitConnectedLayerInfo &item) { + return item.name == name; + }); + + if (it != splitLayerInfoItem.splitOutputLayers.end()) { + gnalog() << "Connecting split/slice input \n"; + auto res = connectInput(splittingLayer, ptr, + splitLayerInfoItem.reserved_size, it->offset, 0); + gnalog() << "Connected \n"; + return res; + } + } + THROW_GNA_EXCEPTION << "Split/Slice layer: " << splitName + << " is not included in extra map. 
Something wrong happened"; + } else if (layerInfoObj.isConcat()) { + auto concatLayerInfo = concat_connection.find( + prevLayer->name); + if (concatLayerInfo != concat_connection.end()) { + auto & concatLayerInfoItem = concatLayerInfo->second; + // dnnLayer that is input for concat output layer + gnamem->bind_ptr(ptr, &concatLayerInfoItem.gna_ptr, offset); + // return layer over concat + return CNNNetPrevLayer(prevLayer); + } + } else if (layerInfoObj.isCrop()) { + auto cropLayerInfo = crop_connection.find( + prevLayer->name); + if (cropLayerInfo != crop_connection.end()) { + auto & cropLayerInfoItem = cropLayerInfo->second; + gnamem->bind_ptr(ptr, &cropLayerInfoItem.gna_ptr, offset); + return CNNNetPrevLayer(prevLayer); + } } + auto prevDnnLayer = findDnnLayer(prevLayer); - InferenceEngine::details::UnorderedDFS(allLayers, - secondLayers.begin()->second, - [&](CNNLayerPtr const& layer) { - if (LayerTypeFromStr(layer->type) != LayerType::NO_TYPE) { - res.supportedLayersMap.insert({ layer->name, GetName() }); - } - }, false); + // check for generic prev layer + if (prevDnnLayer != nullptr) { + gnamem->bind_ptr(ptr, &prevDnnLayer->ptr_outputs, offset); + return prevLayer; + } + + auto prevMemoryLayer = + std::find_if(begin(memory_connection), end(memory_connection), [&](MemoryConnection::value_type &comp) { + return comp.second.getInput()->name == prevLayer->name; + }); + if (prevMemoryLayer != memory_connection.end()) { + // dnnLayer that is input for memory output layer + auto& memoryLayer = prevMemoryLayer->second; + if (memoryLayer.reserved_size == 0) { + gnamem->reserve_ptr(&memoryLayer.gna_ptr, ALIGN64(num_data_bytes_in)); + gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, offset); + + memoryLayer.reserved_offset = offset; + memoryLayer.reserved_size = ALIGN64(num_data_bytes_in); + } else { + IE_ASSERT(memoryLayer.reserved_size == ALIGN64(num_data_bytes_in)); + // same offsets + gnamem->bind_ptr(ptr, &memoryLayer.gna_ptr, memoryLayer.reserved_offset); + } + + return prevLayer; } + + // several layers are to be skipped right now + if (LayerInfo(prevLayer).isReshape()) { + gnalog() << "Skipping reshape layer: " << prevLayer->name << "\n"; + return connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0); + } + + if (LayerInfo(prevLayer).isPermute()) { + gnalog() << "Skipping permute layer: " << prevLayer->name << "\n"; + return {connectInput(prevLayer, ptr, num_data_bytes_in, offset, 0).input, true, prevLayer}; + } + + + THROW_GNA_EXCEPTION << "Cannot connect input for: " << layer->name; +} + diff --git a/inference-engine/src/gna_plugin/gna_plugin.hpp b/inference-engine/src/gna_plugin/gna_plugin.hpp index 64a47467431288..53365d7a659e71 100644 --- a/inference-engine/src/gna_plugin/gna_plugin.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin.hpp @@ -1,153 +1,170 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once +#include "cpp_interfaces/base/ie_plugin_base.hpp" +#include "dnn.h" +#include "gna_memory.hpp" +#include "gna_device.hpp" #include -#include #include #include #include #include #include #include +#include +#include #include -#include -#include "descriptions/gna_flags.hpp" -#include "descriptions/gna_input_desc.hpp" -#include "descriptions/gna_output_desc.hpp" -#include "backend/am_intel_dnn.hpp" -#include "gna_data_types.hpp" -#include "gna_graph_compiler.hpp" -#include "gna_plugin_policy.hpp" -#include "gna_plugin_log.hpp" -#include "gna_plugin_config.hpp" - -#if GNA_LIB_VER == 2 
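(Editor's note, not part of the patch: the connectInput / connectOutput logic above never allocates memory directly; it queues reserve_ptr and bind_ptr requests against the GNAMemory arena declared in this header, and addresses only become real when gnamem->commit() runs in LoadNetwork. A minimal sketch of that two-phase scheme is below; TinyArena and MemRequest are hypothetical names, not the plugin's GNAMemory.)

#include <cstddef>
#include <cstdint>
#include <vector>

struct MemRequest {
    void **where;   // location that will receive the final address
    size_t size;    // bytes requested
    size_t offset;  // offset inside the future arena
};

class TinyArena {
    std::vector<MemRequest> requests_;
    std::vector<uint8_t> arena_;
    size_t total_ = 0;

 public:
    // phase 1: only record the request, no memory exists yet
    void reserve_ptr(void **where, size_t size) {
        requests_.push_back({where, size, total_});
        total_ += size;
    }
    // phase 2: allocate the flat region once and patch every recorded pointer
    void commit() {
        arena_.resize(total_);
        for (auto &r : requests_) {
            if (r.where != nullptr) {
                *r.where = arena_.data() + r.offset;
            }
        }
    }
    size_t getTotalBytes() const { return total_; }
};

(The real gnamem additionally supports bind_ptr, which lets a pointer alias an already reserved region at an offset; that is how concat inputs and memory layers in the code above end up sharing storage.)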
-#include -#endif +#include +#include +#include +#include "gna_allocator.hpp" +#include "gna_api_wrapper.hpp" namespace GNAPluginNS { -class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this { - protected: - std::string _pluginName = "GNA"; - Config config; - std::shared_ptr dnn; - std::shared_ptr gnaFlags; - std::shared_ptr gnamem; - std::shared_ptr inputsDesc; +void ConvertToInt16(int16_t *ptr_dst, + const float *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); +void ConvertToFloat(float *ptr_dst, + int32_t *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); + +int16_t ConvertFloatToInt16(float src); - GNAPluginNS::GNAGraphCompiler graphCompiler; +class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this { + protected: + AmIntelDnn dnn; + using dnn_ptr = std::shared_ptr>; /** * @brief - copy of nnet structure and indicator that related infer request not yet synced */ -#if GNA_LIB_VER == 1 std::vector> nnets; -#else - static constexpr uint32_t FAKE_REQUEST_CONFIG_ID = 0xffffffff; - std::vector> gnaModels; - std::vector> gnaRequestConfigToRequestIdMap; -#endif - -#if GNA_LIB_VER == 2 - uint32_t activeLayerIndex = 0xffffffff; -#endif + + intel_dnn_orientation_t orientation_in = kDnnUnknownOrientation; + intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation; + double input_scale_factor = 1.0; + double output_scale_factor = 1.0; uint32_t num_rotate_rows = 0; uint32_t num_rotate_columns = 0; - uint32_t *ptr_active_indices = nullptr; - uint32_t num_active_indices = 0; - uint32_t num_group_in = 0; - uint32_t dnn_dump_write_index = 0; - - // index matches iterating order of cnnnetwork outputs info - std::vector outputsDesc = std::vector(); - - intel_dnn_number_type_t output_type = kDnnInt; - - GNAPluginNS::Policy policy; -#if GNA_LIB_VER == 2 - void createRequestConfigsForGnaModels(); -#endif - static int GetDeviceVersionFromString(const std::string deviceString); + uint32_t num_feature_maps = 1; + uint32_t num_memory_bytes; - std::shared_ptr gnadevice; - /** - * @brief size of RW segment without extra memory for parallel execution - */ - uint32_t rwSegmentSize = 0; + std::vector ptr_inputs_global; + std::vector ptr_outputs_global; - InferenceEngine::InputsDataMap inputsDataMap; - InferenceEngine::OutputsDataMap outputsDataMap; + int16_t *ptr_int_inputs = NULL; + int32_t *ptr_int_outputs = NULL; + uint32_t *ptr_active_indices = NULL; + uint32_t num_active_indices = 0; + uint32_t num_group_in = 0; + uint32_t num_bytes_weight; + uint32_t num_bytes_per_output = 0; + + bool use_dynamic_quantization = false; + bool compact_mode = true; + bool exclusive_async_requests = false; + bool uniformPwlDesign = false; + uint8_t gna_lib_async_threads_num = 1; + bool gna_openmp_multithreading = false; + // precision of GNA hardware model + InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; + + bool performance_counting = false; + int bytes_alllocated_for_input = 0; + intel_dnn_number_type_t output_type = kDnnInt; + std::string utterance_name; + + // internal types + enum LayerType { + Input, + Convolution, + ReLU, + LeakyReLU, + Sigmoid, + TanH, + Activation, + Pooling, + FullyConnected, + InnerProduct, + Reshape, + Split, + Slice, + Eltwise, + ScaleShift, + Clamp, + Concat, + Copy, + Permute, + Memory, + Power, + Crop, + NO_TYPE + }; public: explicit GNAPlugin(const std::map& configMap); /** * 
@brief construct from aot rather then from cnn network */ - GNAPlugin(); - - std::string GetName() const noexcept override; - void SetName(const std::string & pluginName) noexcept override; + GNAPlugin() = default; - void LoadNetwork(InferenceEngine::ICNNNetwork &network); + void LoadNetwork(InferenceEngine::ICNNNetwork &network) override; + using InferenceEngine::IInferencePluginInternal::Infer; - void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result); - void GetPerformanceCounts(std::map &perfMap); + void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override; + void GetPerformanceCounts(std::map &perfMap) override; void AddExtension(InferenceEngine::IExtensionPtr extension) override; - void SetConfig(const std::map &config) override; void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork, - const InferenceEngine::ICNNNetwork &network, - const std::map &config_map) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::ExecutableNetwork LoadNetwork(const InferenceEngine::ICNNNetwork &network, - const std::map &config_map, - InferenceEngine::RemoteContext::Ptr context) override { THROW_GNA_EXCEPTION << "Not implemented"; } - void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result); - void SetCore(InferenceEngine::ICore*) noexcept override {} - InferenceEngine::ICore* GetCore() const noexcept override {return nullptr;} + InferenceEngine::ICNNNetwork &network, + const std::map &config) override { THROW_GNA_EXCEPTION << "Not implemented"; } + void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override; + void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}; void Reset(); + /** + * @deprecated Use the version with config parameter + */ + void QueryNetwork(const InferenceEngine::ICNNNetwork &network, + InferenceEngine::QueryNetworkResult &res) const override { } void QueryNetwork(const InferenceEngine::ICNNNetwork &network, const std::map& config, - InferenceEngine::QueryNetworkResult &res) const override; + InferenceEngine::QueryNetworkResult &res) const override { } uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result); void Wait(uint32_t idx = 0); - InferenceEngine::Parameter GetConfig(const std::string& name, - const std::map & options) const override; - InferenceEngine::Parameter GetMetric(const std::string& name, - const std::map & options) const override; - InferenceEngine::RemoteContext::Ptr CreateContext(const InferenceEngine::ParamMap& params) override { THROW_GNA_EXCEPTION << "Not implemented"; } - InferenceEngine::RemoteContext::Ptr GetDefaultContext() override { THROW_GNA_EXCEPTION << "Not implemented"; } - - void Wait(uint32_t sync, InferenceEngine::Blob &result) { THROW_GNA_EXCEPTION << "Not implemented"; } + uint32_t QueueInference(const InferenceEngine::Blob &input, InferenceEngine::BlobMap &result); + /** + * + * @param sync - points to gna sync point + * @param idx - points to + * @param result + */ + void Wait(uint32_t sync, InferenceEngine::Blob &result); void Export(const std::string &fileName); - InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName, - const std::map &config) override { - THROW_GNA_EXCEPTION << "Not implemented"; - } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, - const InferenceEngine::RemoteContext::Ptr& context, - const std::map &config) override { - THROW_GNA_EXCEPTION << 
"Not implemented"; - } - InferenceEngine::ExecutableNetwork ImportNetwork(std::istream& networkModel, - const std::map &config) override { - THROW_GNA_EXCEPTION << "Not implemented"; - } - + InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName + , const std::map &config) override { THROW_GNA_EXCEPTION << "Not implemented"; } InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName); + + bool IsExclusiveAsyncRequests() { return exclusive_async_requests; } + /** * utility to provide input and output blobs externally to be used by InferenceEngine request API clients */ - InferenceEngine::Blob::Ptr GetInputBlob(const std::string& name, InferenceEngine::Precision precision); - InferenceEngine::Blob::Ptr GetOutputBlob(const std::string& name, InferenceEngine::Precision precision); + InferenceEngine::Blob::Ptr GetInputBlob(InferenceEngine::Precision precision); + InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision); /** * helpers to provide inputs info on AOT network */ @@ -159,28 +176,223 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: */ std::vector QueryState(); - /** - * test-wise API - */ - void SetPolicy(GNAPluginNS::Policy p) {policy = p;} + protected: + uint32_t num_cnn_rows_out = 0; + bool done = false; + std::string dumpXNNPath; + intel_gna_proc_t gna_proc_type = static_cast(GNA_SOFTWARE & GNA_HARDWARE); - /** - * QueryMetrics API - */ + void DumpXNNToFile() const; + void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr); + void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false); + void DiagonalPrimitive(InferenceEngine::CNNLayerPtr); + void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr); + void PermutePrimitive(InferenceEngine::CNNLayerPtr); + void PoolingPrimitive(InferenceEngine::CNNLayerPtr); + void PowerPrimitive(InferenceEngine::CNNLayerPtr); + void ConcatPrimitive(InferenceEngine::CNNLayerPtr); + void CropPrimitive(InferenceEngine::CNNLayerPtr); + void EltwisePrimitive(InferenceEngine::CNNLayerPtr); + void SplitPrimitive(InferenceEngine::CNNLayerPtr); + void SlicePrimitive(InferenceEngine::CNNLayerPtr); + void PWLPrimitive(InferenceEngine::CNNLayerPtr); + void CopyPrimitive(InferenceEngine::CNNLayerPtr); + bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage); + LayerType LayerTypeFromStr(std::string const &str); + /** + * maps tpe of connection to input and output layers also stores gna_pointer for alloc request + */ + class GNAMemoryLayer { + InferenceEngine::CNNLayerPtr inputLayer; + InferenceEngine::CNNLayerPtr outputLayer; + public: + GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) : + inputLayer(inLayer), outputLayer(outLayer) { + } + + InferenceEngine::CNNLayerPtr getInput() { return inputLayer; } + InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; } + + /** + * pointer to gna memory request + */ + void *gna_ptr = nullptr; + /** + * gna memory of this size is reserved + */ + size_t reserved_size = 0; + /** + * gna memory of this offset from gna_ptr + */ + size_t reserved_offset = 0; + }; + + class GNAConcatLayer { + InferenceEngine::CNNLayerPtr concatLayer; + + public: + explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) : + concatLayer(layer) + {} + + InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; } + /** + * pointer to gna memory request + */ + void *gna_ptr = nullptr; + /** + * gna memory of this size is 
reserved for concat + */ + size_t reserved_size = 0; + bool output_allocation_flag = false; + /** + * gna memory of this offset from gna_ptr + */ + struct ConcatConnectedLayerInfo { + ConcatConnectedLayerInfo(const std::string& n, + size_t o) : + name(n), + offset(o) {} + std::string name = ""; + size_t offset = 0; + }; + + std::vector concatInputLayers; + }; + + // Split, Slice + class GNASplitLayer { + InferenceEngine::CNNLayerPtr splitLayer; + + public: + explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) : + splitLayer(layer), + splitInputLayer() + {} + + InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; } + /** + * gna memory of this size is reserved for concat + */ + size_t reserved_size = 0; + bool output_allocation_flag = false; + /** + * gna memory of this offset from gna_ptr + */ + struct SplitConnectedLayerInfo { + SplitConnectedLayerInfo() {} + SplitConnectedLayerInfo(std::string& n, + size_t o, + size_t p) : + name(n), + offset(o), + pure_size(p) {} + + SplitConnectedLayerInfo& operator= + (SplitConnectedLayerInfo const& layerInfo) { + this->name = layerInfo.name; + this->offset = layerInfo.offset; + this->pure_size = layerInfo.pure_size; + return *this; + } + std::string name = ""; + size_t offset = 0; + size_t pure_size = 0; + }; + SplitConnectedLayerInfo splitInputLayer; + std::vector splitOutputLayers; + }; + + class GNACropLayer { + InferenceEngine::CNNLayerPtr cropLayer; + + public: + explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) : + cropLayer(layer) + {} + + InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; } + /** + * pointer to gna croped memory beginning + */ + void *gna_ptr = nullptr; + }; + using MemoryConnection = std::list>; + using ConcatConnection = std::map; + using SplitConnection = std::map; + using CropConnection = std::map; + // layers with extra storage for connections and additional + // non trivial processing + MemoryConnection memory_connection; + ConcatConnection concat_connection; + SplitConnection split_connection; + CropConnection crop_connection; + void fillMemoryConnections(std::map> &memoryPairs); + + void fillConcatConnections(InferenceEngine::CNNLayerPtr layer); + void fillSplitConnections(InferenceEngine::CNNLayerPtr layer); + /** + * maps layer name to dnn.component, in topological sort prev nodes will be initialized + */ + using DnnComponentsForLayer = std::list>; + std::list> dnnComponentsForLayer; - InferenceEngine::Parameter GetAvailableDevices() const; + /** + * @brief returns corresponding dnn layer for topology layer + * @param __layer + * @return + */ + intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer); - protected: - void Init(); + using allocator_type = PolymorphAllocator; + using gna_memory_type = GNAMemory; - void InitGNADevice(); + std::unique_ptr gnadevice; + /** + * @brief size of RW segment without extra memory for parallel execution + */ + uint32_t rwSegmentSize = 0; + std::unique_ptr gnamem; - void DumpXNNToFile() const; + /** + * Connects either memory output, or generic output to a layer + * @param layer - layer pointer + * @param ptr - pointer to pointer where to store output layer information + * @param sz - sizeof output blob + * @param ptr_inputs - sizeof output blob + */ + void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz); + /** + * Connects certain input to this layer + * @param layer - layer that we connect input to + * @param pVoid - pointer that holds current layer pointer in gna_mem request + * 
@param num_data_bytes_in - size + * @param offset - num bytes to advance in buffer + * @param idx - index of input port that we are connecting + * @return layer used as input + */ + struct ConnectionDetails { + InferenceEngine::CNNLayerPtr input; + bool needTransposeWeights = false; + InferenceEngine::CNNLayerPtr permute; + ConnectionDetails(InferenceEngine::CNNLayerPtr input, + bool bTranspose = false, + InferenceEngine::CNNLayerPtr permute = nullptr) + : input(input) + , needTransposeWeights(bTranspose) + , permute(permute) { + } + }; + ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer, + void *pVoid, + size_t num_data_bytes_in, + size_t offset = 0, + int idx = 0); void ImportFrames(void *ptr_dst, const void *ptr_src, InferenceEngine::Precision input_precision, - float scaleFactor, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, @@ -188,7 +400,7 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_vector_stride); void ExportScores(void *ptr_dst, - const void *ptr_src, + void *ptr_src, intel_dnn_orientation_t orientation, uint32_t num_frames, uint32_t num_group, @@ -198,6 +410,19 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_bytes_per_element_input, uint32_t num_bytes_per_element); + friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst, + const float *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); + friend void GNAPluginNS::ConvertToFloat(float *ptr_dst, + int32_t *ptr_src, + const uint32_t num_rows, + const uint32_t num_columns, + const float scale_factor); + + friend int16_t GNAPluginNS::ConvertFloatToInt16(float src); + template void copyInputData(T *dst, const U *src, @@ -205,17 +430,59 @@ class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std:: uint32_t num_group, uint32_t num_vector_elements, uint32_t num_vector_stride, - intel_dnn_orientation_t orientation, - float scaleFactor); + intel_dnn_orientation_t orientation); template void copyInputDataWithSplit(T *const dst, const U *src, const GNASplitLayer& splitInfo, - size_t precision_size, - int idx = 0); + size_t precision_size); + /** + * @brief GNA affine layers are always have activation atatched, while IR not + * @param net - copied net ready for quantisation + */ + void insertIdentityLayer(std::vector &layers); - void UpdateFieldsFromConfig(); -}; + /** + * @brief GNA convolution layers have deinterleaved oriantations, while affine one doesn't + * so between convolution and affine layers permute layers need to be inserted, + * or removed if they are present in topology + * @param layers + */ + void applyOrientations(std::vector &layers); + + /** + * brief @search for specific patter in the graph (6 layers are replaced by single one) + * @param layers + */ + void substitutePRelu(std::vector &layers); + + std::vector getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer); + + /** + * diagonal layer insertion required in cases where activation followed by split layers, or any other + * topology changing layers + */ + void insertDiagonalLayer(std::vector & layers); + + /** + * @brief MaxPool can be reordered with activation, on GNA there is a strategy to have conv->maxpool->activation + * it means maxpool receives 4 bytes, and produces 4 bytes + */ + void reorderMaxPool(std::vector & layers); + + /** + * copy layer insertion required in cases where input layer does not have output memory + */ + 
void insertCopyLayer(std::vector & layers); + + intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current); + + InferenceEngine::SizeVector inputDims; + InferenceEngine::InputsDataMap inputsDataMap; + + InferenceEngine::SizeVector outputDims; + InferenceEngine::OutputsDataMap outputsDataMap; +}; } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_plugin_config.hpp b/inference-engine/src/gna_plugin/gna_plugin_config.hpp index 4bc24bd5c465f3..f82e4434e31dc5 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_config.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_config.hpp @@ -1,48 +1,67 @@ -// Copyright (C) 2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once - -#if GNA_LIB_VER == 1 -#include -#else -#include -#include -#endif -#include "ie_precision.hpp" -#include "descriptions/gna_flags.hpp" #include -#include +#include +#include +#include +#include "ie_common.h" +#include "gna_plugin_log.hpp" namespace GNAPluginNS { -struct Config { - Config() { - AdjustKeyMapValues(); +using CNNNetworkPtr = std::shared_ptr; + +struct Endpoint { + InferenceEngine::TargetDevice device; + InferenceEngine::Precision networkPrec; + std::function convert; + + Endpoint(InferenceEngine::TargetDevice device, + InferenceEngine::Precision networkPrec, + std::function converter = [](InferenceEngine::ICNNNetwork &network) { + return CNNNetworkPtr(&network, [](InferenceEngine::ICNNNetwork *nodelete) {}); + }) : device(device), networkPrec(networkPrec), convert(converter) { + } +}; + +class Config { + public: + using Desc = std::vector; + Desc supported; + InferenceEngine::TargetDevice _defaultDevice = InferenceEngine::TargetDevice::eDefault; + + public: + explicit Config(std::vector &&config) + : supported(std::move(config)) { } - void UpdateFromMap(const std::map& configMap); - void AdjustKeyMapValues(); - std::string GetParameter(const std::string& name) const; - std::vector GetSupportedKeys() const; - // precision of GNA hardware model - InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16; + /** + * @brief default device value is plugin dependent, so it should be also set, to allow fallback + */ + void setDefaultDevice(InferenceEngine::TargetDevice d) { + _defaultDevice = d; + } - std::string dumpXNNPath; - std::string dumpXNNGeneration; + inline Endpoint find_configuration(InferenceEngine::ICNNNetwork &network) { + auto device = network.getTargetDevice(); + auto targetDevice = device == InferenceEngine::TargetDevice::eDefault ? 
_defaultDevice : device; -#if GNA_LIB_VER == 1 - intel_gna_proc_t gna_proc_type = static_cast(GNA_SOFTWARE & GNA_HARDWARE); -#else - Gna2AccelerationMode pluginGna2AccMode = Gna2AccelerationModeSoftware; - Gna2DeviceVersion pluginGna2DeviceConsistent = Gna2DeviceVersion1_0; -#endif + auto res = std::find_if(std::begin(supported), std::end(supported), [&](Endpoint &e) { + return e.networkPrec == network.getPrecision() && ( + e.device == device || + e.device == targetDevice); + }); - std::vector inputScaleFactors; - GNAFlags gnaFlags; + if (res == std::end(supported)) { + THROW_GNA_EXCEPTION << "\"The plugin doesn't support target device: " + << InferenceEngine::TargetDeviceInfo::name(network.getTargetDevice()) + << ".\nSupported target device: " << InferenceEngine::TargetDeviceInfo::name(InferenceEngine::TargetDevice::eGNA); + } - std::map key_config_map; + return *res; + } }; - } // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp index e6c4cc3ad9e2ba..d2312741f3f073 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp +++ b/inference-engine/src/gna_plugin/gna_plugin_entry_points.cpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -11,21 +11,9 @@ using namespace InferenceEngine; using namespace std; using namespace GNAPluginNS; -IE_SUPPRESS_DEPRECATED_START - -static const Version gnaPluginDescription = { - {2, 1}, - CI_BUILD_NUMBER -#if GNA_LIB_VER == 2 - "_with_GNA_LIB_VER==2" -#endif - , - "GNAPlugin" -}; - INFERENCE_PLUGIN_API(StatusCode) CreatePluginEngine(IInferencePlugin *&plugin, ResponseDesc *resp) noexcept { try { - plugin = make_ie_compatible_plugin(gnaPluginDescription, make_shared()); + plugin = make_ie_compatible_plugin({1, 5, "GNAPlugin", "GNAPlugin"}, make_shared()); return OK; } catch (std::exception &ex) { diff --git a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp index 0f9ec354f8374d..3c2dcf02ab825d 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_internal.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_internal.hpp @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // @@ -10,71 +10,19 @@ #include #include #include "gna_executable_network.hpp" -#include "gna_plugin_config.hpp" namespace GNAPluginNS { class GNAPluginInternal : public InferenceEngine::InferencePluginInternal { -private: - Config defaultConfig; - std::weak_ptr plgPtr; - std::shared_ptr GetCurrentPlugin() const { - auto ptr = plgPtr.lock(); - if (ptr == nullptr) { - return std::make_shared(); - } else { - return ptr; - } - } - -public: - InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl( - const InferenceEngine::ICNNNetwork &network, - const std::map &config) override { - Config updated_config(defaultConfig); - updated_config.UpdateFromMap(config); - auto plg = std::make_shared(updated_config.key_config_map); - plgPtr = plg; - return std::make_shared(*cloneNet(network), plg); - } - - void SetConfig(const std::map &config) override { - defaultConfig.UpdateFromMap(config); - } - - InferenceEngine::IExecutableNetwork::Ptr ImportNetwork( - const std::string &modelFileName, - const std::map &config) override { - Config updated_config(defaultConfig); - updated_config.UpdateFromMap(config); - auto plg = 
std::make_shared(updated_config.key_config_map); - plgPtr = plg; - return make_executable_network(std::make_shared(modelFileName, plg)); - } - - using InferenceEngine::InferencePluginInternal::ImportNetwork; - - std::string GetName() const noexcept override { - return GetCurrentPlugin()->GetName(); - } - - void QueryNetwork(const InferenceEngine::ICNNNetwork& network, - const std::map& config, - InferenceEngine::QueryNetworkResult& res) const override { - auto plg = GetCurrentPlugin(); - try { - plg->SetConfig(config); - } catch (InferenceEngine::details::InferenceEngineException) {} - plg->QueryNetwork(network, config, res); - } - - InferenceEngine::Parameter GetMetric(const std::string& name, - const std::map & options) const override { - return GetCurrentPlugin()->GetMetric(name, options); - } - - InferenceEngine::Parameter GetConfig(const std::string& name, const std::map & options) const override { - return defaultConfig.GetParameter(name); + public: + InferenceEngine::ExecutableNetworkInternal::Ptr LoadExeNetworkImpl(InferenceEngine::ICNNNetwork &network, + const std::map &config) override { + return std::make_shared(network, config); + } + void SetConfig(const std::map &config) override {} + InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName, + const std::map &config) override { + return make_executable_network(std::make_shared(modelFileName, config)); } }; diff --git a/inference-engine/src/gna_plugin/gna_plugin_log.hpp b/inference-engine/src/gna_plugin/gna_plugin_log.hpp index b3d5dc249ed9e6..08f45ad78dac27 100644 --- a/inference-engine/src/gna_plugin/gna_plugin_log.hpp +++ b/inference-engine/src/gna_plugin/gna_plugin_log.hpp @@ -1,15 +1,13 @@ -// Copyright (C) 2018-2020 Intel Corporation +// Copyright (C) 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include #include
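// Editorial annotation, not part of the original patch: the GNA_DEBUG switch just below
// chooses between real console streams and the no-op GnaLog sink defined further down,
// so gnalog()/gnawarn() tracing compiles away unless debugging is enabled. Hedged usage
// sketch, mirroring how the graph passes in gna_plugin_passes.cpp call it:
//
//   gnalog() << "Inserted " << activationName << " between: " << prev->name << " and " << l->name << "\n";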
// #define GNA_DEBUG -#ifdef GNA_DEBUG -#include +#ifdef GNA_DEBUG /** * @brief used for creating graphviz charts, and layers dump */ @@ -18,18 +16,6 @@ # define gnawarn() std::cerr #else -#ifdef VERBOSE -#define VERBOSE_LEVEL (1) -#else -#define VERBOSE_LEVEL (0) -#endif - -#ifdef PLOT -#define PLOT_LEVEL (1) -#else -#define PLOT_LEVEL (0) -#endif - class GnaLog { public : template @@ -50,27 +36,19 @@ inline GnaLog & gnawarn() { return gnalog(); } -#endif - /** * @brief gna_plugin exception unification */ #ifdef __PRETTY_FUNCTION__ #undef __PRETTY_FUNCTION__ #endif -#ifdef _WIN32 +#if defined(_WIN32) || defined(__WIN32__) || defined(WIN32) # define __PRETTY_FUNCTION__ __FUNCSIG__ #else # define __PRETTY_FUNCTION__ __FUNCTION__ #endif +#endif -#define GNA_LAYER_ASSERT(layer, expr)\ -if (!(expr)) { \ - THROW_GNA_LAYER_EXCEPTION(layer) << ": " << #expr; \ -} #define THROW_GNA_EXCEPTION THROW_IE_EXCEPTION << "[GNAPlugin] in function " << __PRETTY_FUNCTION__<< ": " -#define THROW_GNA_LAYER_EXCEPTION(layer) THROW_GNA_EXCEPTION << LAYER_NAME(layer) -#define LAYER_NAME(layer) layer->type << " layer : \"" << layer->name << "\" " - diff --git a/inference-engine/src/gna_plugin/gna_plugin_passes.cpp b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp new file mode 100644 index 00000000000000..79d42d24036be9 --- /dev/null +++ b/inference-engine/src/gna_plugin/gna_plugin_passes.cpp @@ -0,0 +1,338 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include + +#include +#include "gna_plugin.hpp" +#include "gna_layer_info.hpp" + + +using namespace InferenceEngine; +using namespace std; +using namespace GNAPluginNS; + +void GNAPlugin::insertDiagonalLayer(std::vector & layers) { + int numOfDiagLayers = 0; + for (auto & l : layers) { + if (l->insData.empty()) continue; + auto prevLayer = CNNNetPrevLayer(l); + if (LayerInfo(l).isActivation()) { + if (LayerInfo(prevLayer).has32BOutput()) + continue; + } else { + auto eltwise = dynamic_cast(l.get()); + if (!eltwise) { + continue; + } + // in case of eltwise sum one of input would be 4 bytes one - 2 + // in case of eltwise mull one of input would be 2 bytes one - 2 + // for e sum if we have 4-4 inputs we will handle that by inserting identity activation + // for e sum if we have 4-2 - OK + // for e sum if we have 2-2 inputs we need to insert diagonal -- handling here + // for e mul if we have 2-2 - OK + // for e mul if we have 2-4 - inputs we need to insert identity to put 4 bytes input into weights + // for e mul if we have 4-4 - inputs we need to insert 2 identities to put both 4 bytes input into weights + + if (eltwise->_operation != EltwiseLayer::Sum) + continue; + + auto prevLayer1 = CNNNetPrevLayer(l, 1); + if (!LayerInfo(prevLayer).has16BOutput() || !LayerInfo(prevLayer1).has16BOutput()) + continue; + } + +#ifdef PLOT + std::cout << "Inserted Diagonal Layer between: " << prevLayer->name << " and " << l->name << "\n" << std::flush; +#endif + // actual insertion + auto diagName = std::string("SyntheticScaleShift_") + std::to_string(numOfDiagLayers++); + auto diagLayer = make_shared(LayerParams({diagName, "ScaleShift", Precision::FP32})); + + // TODO: diagonal size + std::vector arrayOf1(l->outData[0]->dims[0], 1.f); + diagLayer->_weights = make_shared_blob(l->outData[0]->precision, Layout::C, arrayOf1);; + auto newDims = l->outData[0]->dims; + auto dataPtr = std::make_shared(diagName, + newDims, + l->outData[0]->precision, + l->outData[0]->layout); + + auto diagonalWithQuant = 
InferenceEngine::injectData(diagLayer); + + dataPtr->creatorLayer = diagonalWithQuant; + diagonalWithQuant->outData.push_back(dataPtr); + CNNNetworkInsertLayer(prevLayer, l, diagonalWithQuant); + } +} + +void GNAPlugin::reorderMaxPool(std::vector & layers) { + // detecting following pattern + // conv->relu->maxpooling + // changing it to conv->mxpooling->relu + for (auto & l : layers) { + auto pool = LayerInfo(l); + if (!pool.isMaxPooling()) continue; + + // checking prev layer type + auto activation = LayerInfo(CNNNetPrevLayer(l)); + if (!activation.isActivation()) continue; + + // if activation came from convolution + auto convolution = LayerInfo(CNNNetPrevLayer(static_cast(activation))); + if (!convolution.isConvolution()) continue; + + gnalog() << "MaxPooling: " << pool << ", reordered with activation: " << activation << "\n"; + + CNNNetSwapLayers(activation, pool); + } +} + +std::vector GNAPlugin::getCandidatesForIdentityInsertion(const CNNLayerPtr l) { + vector prevLayers; + + // skipping memory inputs and true inputs layers + if (l->insData.empty()) return {}; + + auto eltwise = dynamic_cast(l.get()); + auto concat = dynamic_cast(l.get()); + + // eltwise + if (eltwise != nullptr) { + // eltwise layer has 2 inputs, so depends on situation identity should or should not be inserted + + // for sum if we have 4-4 inputs we will handle that by inserting identity activation case (1) + // for sum if we have 4-2 - OK + // for sum if we have 2-2 inputs we need to insert diagonal + + // for mul if we have 2-2 - OK + // for mul if we have 2-4 - inputs we need to insert identity activation to make 2 bytes input + // for mul if we have 4-4 - inputs we need to insert 2 identities activations to put 2 bytes input and weights + auto prev0 = CNNNetPrevLayer(l, 0); + auto prev1 = CNNNetPrevLayer(l, 1); + switch (eltwise->_operation) { + case EltwiseLayer::Sum: + if (!LayerInfo(prev0).has32BOutput() || !LayerInfo(prev1).has32BOutput()) { + return prevLayers; + } + // TODO: wether there - are possibility to select what layer to quantize + prevLayers.push_back(prev0); + break; + case EltwiseLayer::Prod: + if (LayerInfo(prev0).has16BOutput() && LayerInfo(prev1).has16BOutput()) { + return prevLayers; + } + + if (LayerInfo(prev0).has32BOutput()) { + prevLayers.push_back(prev0); + } + + if (LayerInfo(prev1).has32BOutput()) { + prevLayers.push_back(prev1); + } + + break; + default : + THROW_GNA_EXCEPTION << "Eltwise Layer of type: " << eltwise->_operation << " not supported"; + } + } else if (concat != nullptr) { + for (int i = 0; CNNNetHasPrevLayer(l.get(), i); ++i) { + auto prev = CNNNetPrevLayer(l, i); + if (LayerInfo(prev).has32BOutput()) { + prevLayers.push_back(prev); + } + } + } else { // not eltwise or concat + // other layers has 1 inputs - situation is easier + // ex. activation or pooling - no need to insert identity activation. + if (LayerInfo(l).has32BInput()) + return prevLayers; + + auto prevLayer = CNNNetPrevLayer(l); + if (!LayerInfo(prevLayer).has32BOutput()) + return prevLayers; + + prevLayers.push_back(prevLayer); + } + return prevLayers; +} + +void GNAPlugin::substitutePRelu(std::vector &layers) { + auto getScale = [](CNNLayer* layer) { + auto powerCandidate = LayerInfo(layer); + if (!powerCandidate.isPower()) return 0.0f; + auto power = powerCandidate.as(); + + return power->power == 1 && power->offset == 0.0f ? 
power->scale : 0.0f; + }; + + auto isScale = [getScale](CNNLayer* layer) { + return getScale(layer) != 0.0f; + }; + + auto isNegate = [getScale](CNNLayer* layer) { + return getScale(layer) == -1.0f; + }; + + auto getNext = [](CNNLayer* layer) { + CNNLayer* next = nullptr; + if (layer == nullptr) return next; + if (layer->outData.size() != 1) return next; + return layer->outData[0]->inputTo.begin()->second.get(); + }; + + // TODO: unit tests for bad cases + for (auto & l : layers) { + // assume l is starting layer, that is followed by eltwise_sum(relu, negate/relu/scale/negate) + if (l->outData.size() != 1) continue; + auto &outputLayers = l->outData[0]->inputTo; + if (outputLayers.size() != 2) continue; + + // one of followed layers need to be generic relu + auto first = LayerInfo(outputLayers.begin()->second); + auto second = LayerInfo((++outputLayers.begin())->second); + + auto relu1 = outputLayers.begin()->second; + auto neg1 = (++outputLayers.begin())->second; + if (second.isRelu()) { + swap(first, second); + swap(relu1, neg1); + } + if (!first.isRelu()) continue; + // now we have relu as first layer, lets check second + // negate + if (!isNegate(neg1.get())) continue; + + // relu + auto relu2 = getNext(second); + if (!LayerInfo(relu2).isRelu()) continue; + + // scale + auto scale = getNext(relu2); + if (!isScale(scale)) continue; + + // negate2 + auto negate = getNext(scale); + if (!isNegate(negate)) continue; + + // sum + auto sum = getNext(negate); + if (!LayerInfo(sum).isEltwiseSum()) continue; + if (sum->insData.size() != 2) continue; + + auto s1 = sum->insData[0].lock()->creatorLayer.lock().get(); + auto s2 = sum->insData[1].lock()->creatorLayer.lock().get(); + + if (s1 != static_cast(first) && + s2 != static_cast(first)) { + continue; + } + + // hurray we found parametric relu group - dont know what to do with it though + gnalog() << "PRelu with negative slope of " << -LayerInfo(scale).as()->scale << " found" << std::endl; + + // removing all layers references except of relu layer + outputLayers.clear(); + outputLayers[relu1->name] = relu1; + // pointing relu to output of eltwise_summ + relu1->outData = sum->outData; + // changing creator layer + relu1->outData[0]->creatorLayer = relu1; + // pointing back to relu if any + if (!relu1->outData[0]->inputTo.empty()) { + auto summOutputLayer = relu1->outData[0]->inputTo.begin()->second; + summOutputLayer->insData.clear(); + summOutputLayer->insData.push_back(relu1->outData[0]); + } + + // changing negative slope + first.as()->negative_slope = LayerInfo(scale).as()->scale; + } +} + +void GNAPlugin::applyOrientations(std::vector & layers) { +} + +void GNAPlugin::insertIdentityLayer(std::vector &layers) { + int numOfIdentityLayers = 0; + for (auto & l : layers) { + for (auto && prev : getCandidatesForIdentityInsertion(l)) { + // actual insertion + auto activationName = std::string("identity_") + std::to_string(numOfIdentityLayers++); + + gnalog() << "Inserted "<< activationName << " between: " << prev->name << " and " << l->name << "\n" << std::flush; + + CNNLayerPtr activationLayer = + make_shared(LayerParams({activationName, "identity", Precision::FP32})); + auto inputData = l->insData[0].lock(); + auto newDims = inputData->dims; + std::reverse(begin(newDims), end(newDims)); + + auto dataPtr = std::make_shared("FullyConnected", + TensorDesc(inputData->precision, + newDims, + inputData->layout)); + + auto activationLayerWithQuant = InferenceEngine::injectData(activationLayer); + dataPtr->creatorLayer = activationLayerWithQuant; + 
activationLayerWithQuant->outData.push_back(dataPtr); + // wether 1 identity or all outputs TODO possible grouping here, need to implement special groupped inserter + bool notAll = false; + for (auto && nextData : prev->outData) { + for (auto && nextLayer : nextData->inputTo) { + if (nextLayer.second.get() == l.get()) + continue; + if (getCandidatesForIdentityInsertion(nextLayer.second).empty()) { + notAll = true; + } + } + } + + CNNNetworkInsertLayer(prev, notAll ? l : CNNLayerPtr(nullptr), activationLayerWithQuant); + } + } +} + +void GNAPlugin::insertCopyLayer(std::vector & layers) { + int numCopyLayers = 0; + for (auto & l : layers) { + if (l->insData.empty()) continue; + auto prevLayer = CNNNetPrevLayer(l); + if ((LayerInfo(l).isMemory() && LayerInfo(prevLayer).isConcat()) || + (LayerInfo(l).isConcat() && LayerInfo(prevLayer).isCrop())) { + if (LayerInfo(prevLayer).isCrop()) { + auto cropLayer = dynamic_cast (prevLayer.get()); + size_t cropOffset = cropLayer->offset.back() * cropLayer->precision.size(); + if (ALIGN(cropOffset, 8) != cropOffset) { + // The crop will be replced by affine. + // Copy layer insertion is not required + continue; + } + } + std::string copyName = std::string("copy_") + std::to_string(numCopyLayers++); + gnalog() << "Inserted "<< copyName << " between: " << l->name << " and " << prevLayer->name << "\n" << std::flush; + + CNNLayerPtr copyLayer = + make_shared(LayerParams({copyName, "Copy", Precision::FP32})); + + auto inputData = l->insData[0].lock(); + auto newDims = inputData->dims; + + std::reverse(begin(newDims), end(newDims)); + + auto dataPtr = std::make_shared(copyName, + TensorDesc(inputData->precision, + newDims, + inputData->layout)); + + auto copyWithQuant = InferenceEngine::injectData(copyLayer); + dataPtr->creatorLayer = copyWithQuant; + copyWithQuant->outData.push_back(dataPtr); + CNNNetworkInsertLayer(prevLayer, l, copyWithQuant); + } + } +} diff --git a/inference-engine/src/gna_plugin/lstm.cpp b/inference-engine/src/gna_plugin/lstm.cpp new file mode 100644 index 00000000000000..53906e64325597 --- /dev/null +++ b/inference-engine/src/gna_plugin/lstm.cpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "lstm.hpp" + +const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS] = { + "combined input transform", + "combined recurrent transform", + "input gate", + "forget gate", + "cell gate input part 1", + "cell gate input part 2", + "cell gate output part 1", + "cell gate output part 2", + "output gate", + "hidden gated output", + "projected output" +}; + +const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS] = { + "combined input transform", + "deinterleave", + "interleave 1", + "interleave 2", + "interleave 3", + "interleave 4", + "combined recurrent transform - 1", + "input gate - 1", + "forget gate - 1", + "cell gate input part 1 - 1", + "cell gate input part 2 - 1", + "cell gate output part 1 - 1", + "cell gate output part 2 - 1", + "output gate - 1", + "hidden gated output - 1", + "projected output - 1", + "combined recurrent transform - 2", + "input gate - 2", + "forget gate - 2", + "cell gate input part 1 - 2", + "cell gate input part 2 - 2", + "cell gate output part 1 - 2", + "cell gate output part 2 - 2", + "output gate - 2", + "hidden gated output - 2", + "projected output - 2", + "combined recurrent transform - 3", + "input gate - 3", + "forget gate - 3", + "cell gate input part 1 - 3", + "cell gate input part 2 - 3", + "cell gate output part 1 - 3", + "cell 
gate output part 2 - 3", + "output gate - 3", + "hidden gated output - 3", + "projected output - 3", + "combined recurrent transform - 4", + "input gate - 4", + "forget gate - 4", + "cell gate input part 1 - 4", + "cell gate input part 2 - 4", + "cell gate output part 1 - 4", + "cell gate output part 2 - 4", + "output gate - 4", + "hidden gated output - 4", + "projected output - 4", + "interleave" +}; \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/lstm.hpp b/inference-engine/src/gna_plugin/lstm.hpp new file mode 100644 index 00000000000000..6ce8f10940e186 --- /dev/null +++ b/inference-engine/src/gna_plugin/lstm.hpp @@ -0,0 +1,209 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#define LSTM_GIFO_X_C (component_index) +#define LSTM_GIFO_R_C (component_index+1) +#define LSTM_INPUT_GATE_C (component_index+2) +#define LSTM_INPUT_SIGMOID_C (component_index+3) +#define LSTM_FORGET_GATE_C (component_index+4) +#define LSTM_FORGET_SIGMOID_C (component_index+5) +#define LSTM_CELL_INPUT1_C (component_index+6) +#define LSTM_CELL_INPUT1_TANH_C (component_index+7) +#define LSTM_CELL_INPUT2_C (component_index+8) +#define LSTM_CELL_OUTPUT1_C (component_index+9) +#define LSTM_CELL_TANH_C (component_index+10) +#define LSTM_CELL_OUTPUT2_C (component_index+11) +#define LSTM_CELL_CLIPPING_C (component_index+12) +#define LSTM_OUTPUT_GATE_C (component_index+13) +#define LSTM_OUTPUT_SIGMOID_C (component_index+14) +#define LSTM_HIDDEN_C (component_index+15) +#define LSTM_HIDDEN_IDENTITY_C (component_index+16) +#define LSTM_PROJECTED_C (component_index+17) +#define LSTM_PROJECTED_IDENTITY_C (component_index+18) +#define NUM_LSTM_COMPONENTS 19 + +#define BILSTM_GIFO_X_FW_C (component_index) +#define BILSTM_GIFO_R_FW_C (component_index+1) +#define BILSTM_INPUT_GATE_FW_C (component_index+2) +#define BILSTM_INPUT_SIGMOID_FW_C (component_index+3) +#define BILSTM_FORGET_GATE_FW_C (component_index+4) +#define BILSTM_FORGET_SIGMOID_FW_C (component_index+5) +#define BILSTM_CELL_INPUT1_FW_C (component_index+6) +#define BILSTM_CELL_INPUT1_TANH_FW_C (component_index+7) +#define BILSTM_CELL_INPUT2_FW_C (component_index+8) +#define BILSTM_CELL_GATE_FW_C (component_index+9) +#define BILSTM_CELL_OUTPUT1_FW_C (component_index+10) +#define BILSTM_CELL_TANH_FW_C (component_index+11) +#define BILSTM_CELL_COPY_FW_C (component_index+12) +#define BILSTM_OUTPUT_GATE_FW_C (component_index+13) +#define BILSTM_OUTPUT_SIGMOID_FW_C (component_index+14) +#define BILSTM_HIDDEN_FW_C (component_index+15) +#define BILSTM_HIDDEN_IDENTITY_FW_C (component_index+16) +#define BILSTM_GIFO_X_BW_C (component_index+17) +#define BILSTM_GIFO_R_BW_C (component_index+18) +#define BILSTM_INPUT_GATE_BW_C (component_index+19) +#define BILSTM_INPUT_SIGMOID_BW_C (component_index+20) +#define BILSTM_FORGET_GATE_BW_C (component_index+21) +#define BILSTM_FORGET_SIGMOID_BW_C (component_index+22) +#define BILSTM_CELL_INPUT1_BW_C (component_index+23) +#define BILSTM_CELL_INPUT1_TANH_BW_C (component_index+24) +#define BILSTM_CELL_INPUT2_BW_C (component_index+25) +#define BILSTM_CELL_GATE_BW_C (component_index+26) +#define BILSTM_CELL_OUTPUT1_BW_C (component_index+27) +#define BILSTM_CELL_TANH_BW_C (component_index+28) +#define BILSTM_CELL_COPY_BW_C (component_index+29) +#define BILSTM_OUTPUT_GATE_BW_C (component_index+30) +#define BILSTM_OUTPUT_SIGMOID_BW_C (component_index+31) +#define BILSTM_HIDDEN_BW_C (component_index+32) +#define BILSTM_HIDDEN_IDENTITY_BW_C (component_index+33) 
+#define NUM_BILSTM_COMPONENTS 34 + +#include "gna-api.h" + +#define ACTIVATION_SCALE_IG 1024.0f +#define ACTIVATION_SCALE_CI1 1024.0f +#define ACTIVATION_SCALE_CO1 2048.0f +#define ACTIVATION_SCALE_OG 2048.0f +#define ACTIVATION_SCALE_HID 2048.0f +#define MAX_WEIGHT_IFO_GATE 1024.0f +#define NUM_WEIGHT_BYTES_IN 2 +#define NUM_WEIGHT_BYTES_PROJ 2 + +typedef struct { + float min; + float max; + float sum; + float sum_squared; + uint32_t num_saturations; + uint32_t num_elements; +} intel_buffer_stats_t; + +typedef struct { + intel_nnet_layer_t in; // combined input transform + intel_nnet_layer_t rec; // combined recurrent transform + intel_nnet_layer_t ig; // input gate + intel_nnet_layer_t fg; // forget gate + intel_nnet_layer_t ci1; // cell gate input part 1 + intel_nnet_layer_t ci2; // cell gate input part 2 + intel_nnet_layer_t co1; // cell gate output part 1 + intel_nnet_layer_t co2; // cell gate output part 2 + intel_nnet_layer_t og; // output gate + intel_nnet_layer_t hid; // hidden gated output + intel_nnet_layer_t proj; // projected output +} intel_lstm_projected_layer_t; + +typedef struct { + intel_affine_layer_t *in; // combined input transform + intel_affine_layer_t *rec; // combined recurrent transform + intel_affine_layer_t *ig; // input gate + intel_affine_layer_t *fg; // forget gate + intel_affine_layer_t *ci1; // cell gate input part 1 + intel_affine_layer_t *ci2; // cell gate input part 2 + intel_affine_layer_t *co1; // cell gate output part 1 + intel_affine_layer_t *co2; // cell gate output part 2 + intel_affine_layer_t *og; // output gate + intel_affine_layer_t *hid; // hidden gated output + intel_affine_layer_t *proj; // projected output +} intel_lstm_projected_transform_t; + +typedef struct { + intel_buffer_stats_t in; // combined input transform + intel_buffer_stats_t rec; // combined recurrent transform + intel_buffer_stats_t ig; // input gate + intel_buffer_stats_t fg; // forget gate + intel_buffer_stats_t ci1; // cell gate input part 1 + intel_buffer_stats_t ci2; // cell gate input part 2 + intel_buffer_stats_t co1; // cell gate output part 1 + intel_buffer_stats_t co2; // cell gate output part 2 + intel_buffer_stats_t og; // output gate + intel_buffer_stats_t hid; // hidden gated output + intel_buffer_stats_t proj; // projected output +} intel_lstm_projected_stats_t; + +typedef struct { + intel_nnet_layer_t rec; // combined recurrent transform + intel_nnet_layer_t ig; // input gate + intel_nnet_layer_t fg; // forget gate + intel_nnet_layer_t ci1; // cell gate input part 1 + intel_nnet_layer_t ci2; // cell gate input part 2 + intel_nnet_layer_t co1; // cell gate output part 1 + intel_nnet_layer_t co2; // cell gate output part 2 + intel_nnet_layer_t og; // output gate + intel_nnet_layer_t hid; // hidden gated output + intel_nnet_layer_t proj; // projected output +} intel_lstm_partial_layer_t; + +typedef struct { + intel_affine_layer_t *rec; // combined recurrent transform + intel_affine_layer_t *ig; // input gate + intel_affine_layer_t *fg; // forget gate + intel_affine_layer_t *ci1; // cell gate input part 1 + intel_affine_layer_t *ci2; // cell gate input part 2 + intel_affine_layer_t *co1; // cell gate output part 1 + intel_affine_layer_t *co2; // cell gate output part 2 + intel_affine_layer_t *og; // output gate + intel_affine_layer_t *hid; // hidden gated output + intel_affine_layer_t *proj; // projected output +} intel_lstm_partial_transform_t; + +typedef struct { + intel_buffer_stats_t rec; // combined recurrent transform + intel_buffer_stats_t ig; // input 
gate + intel_buffer_stats_t fg; // forget gate + intel_buffer_stats_t ci1; // cell gate input part 1 + intel_buffer_stats_t ci2; // cell gate input part 2 + intel_buffer_stats_t co1; // cell gate output part 1 + intel_buffer_stats_t co2; // cell gate output part 2 + intel_buffer_stats_t og; // output gate + intel_buffer_stats_t hid; // hidden gated output + intel_buffer_stats_t proj; // projected output +} intel_lstm_partial_stats_t; + +typedef struct { + intel_nnet_layer_t in; // combined input transform + intel_nnet_layer_t dintl; // interleave x8 + intel_nnet_layer_t intl1; // deinterleave x2 + intel_nnet_layer_t intl2; // deinterleave x2 + intel_nnet_layer_t intl3; // deinterleave x2 + intel_nnet_layer_t intl4; // deinterleave x2 + intel_lstm_partial_layer_t part[4]; // unrolled part + intel_nnet_layer_t intl; // interleave x4 +} intel_lstm_projected_layer_g4_t; + +typedef struct { + intel_affine_layer_t *in; // combined input transform + intel_lstm_partial_transform_t part[4]; // unrolled part +} intel_lstm_projected_transform_g4_t; + +typedef struct { + intel_buffer_stats_t in; // combined input transform + intel_lstm_partial_stats_t part[4]; // unrolled part +} intel_lstm_projected_stats_g4_t; + +#define NUM_LSTM_LAYERS 11 +#define NUM_LSTM_G4_LAYERS 47 + +extern const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS]; +extern const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS]; +/* +void GetLstmBufferStats(intel_lstm_projected_layer_t *ptr_layer, std::vector &stats); +void UpdateLstmBufferStats(std::vector &accum, std::vector stats); +void ClearLstmBufferStats(std::vector &stats); +void PrintLstmBufferStats(std::string preamble, std::vector stats); +uint32_t NumBytesLstmMacroLayer(uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells, uint32_t num_group_size, uint32_t layer_num, bool is_compact); +void InitLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells); +void InitLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells); +void AllocateLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact); +void AllocateLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact); +void ConnectLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform); +void ConnectLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform); +void QuantizeLstmMacroLayerG1(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_transform_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j); +void QuantizeLstmMacroLayerG4(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_transform_g4_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j); +void ReQuantizeLstmMacroLayerG1(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t 
j); +void ReQuantizeLstmMacroLayerG4(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_layer_g4_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j); +void IntegrityCheckLstmMacroLayer(std::vector *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, gna_scale_factor_t *scale, uint32_t j); + +*/ \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/polymorh_allocator.hpp b/inference-engine/src/gna_plugin/polymorh_allocator.hpp new file mode 100644 index 00000000000000..d50d8a3a7e5245 --- /dev/null +++ b/inference-engine/src/gna_plugin/polymorh_allocator.hpp @@ -0,0 +1,68 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include + +/** + * @brief c++17 concept simulation + */ + +template +class IPolymorhAllocator { + public: + virtual T *allocate(std::size_t n) = 0; + virtual void deallocate(T *p, std::size_t n) = 0; +}; + +template +class allocator_polymorph; + +template +class PolymorphAllocator { + std::shared_ptr> _impl; + public: + explicit PolymorphAllocator(const std::shared_ptr> &impl) : _impl(impl) {} + + T *allocate(std::size_t n) { + return _impl->allocate(n); + } + + void deallocate(T *p, std::size_t n) { + _impl->deallocate(p, n); + } +}; + +/** + * transform any allocator into polymorph type + * @tparam origin + */ + +template +class polymorph_adapter : public IPolymorhAllocator { + origin _impl; + using T = typename origin::value_type; + + public: + template + explicit polymorph_adapter(Args &&... args) + :_impl(std::forward(args)...) { + } + T *allocate(std::size_t n) override { + return _impl.allocate(n); + } + void deallocate(T *p, std::size_t n) override { + _impl.deallocate(p, n); + } +}; + +template +inline PolymorphAllocator make_polymorph(Args &&... 
args) { + auto sp = std::make_shared>(std::forward(args)...); + auto ipoly = std::static_pointer_cast>(sp); + + return PolymorphAllocator(ipoly); +} \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/pwl.h b/inference-engine/src/gna_plugin/pwl.h new file mode 100644 index 00000000000000..fd45903fcb4a73 --- /dev/null +++ b/inference-engine/src/gna_plugin/pwl.h @@ -0,0 +1,70 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "dnn.h" +#include + +#define SIGMOID_NUM_SEGMENTS 65 +#define SIGMOID_DOMAIN 10.0f // portion of input to be approximated (-10,10) +#define TANH_NUM_SEGMENTS 65 +#define TANH_DOMAIN 5.0f // portion of input to be approximated (-5,5) +#define RELU_NUM_SEGMENTS 2 +#define LEAKYRELU_SLOPE 0.01 +#define IDENTITY_NUM_SEGMENTS 3 +#define IDENTITY_DOMAIN 10.0f +#define PWL_MAX_ERR_PERCENT 1.0f +#define PWL_MAX_ITERATIONS 2000 +#define PWL_MAX_NUM_SEGMENTS 128 +#define PWL_DESIGN_THRESHOLD 0.1f +#define PWL_DESIGN_SAMPLES 500 +#define ACTIVATION_SCALE_FACTOR 2048.0f +#define IDENTITY_SCALE_FACTOR 2049.0f +#define XBASEMASK 0xFFFFFFFC // only top 30 bits are used +#define KALDI_LSTM_CLIP_LOWER (-50.0) +#define KALDI_LSTM_CLIP_UPPER (50.0) + +typedef struct { + double t; + double alpha; + double beta; + double m; + double b; +} pwl_t; + +typedef struct { + double slope; + uint64_t slope_scale = 0; + uint32_t slope_scale_index; +} pwl_gna_slope_scale_t; + +double first_deriv_tanh(const double x); +double sigmoid(const double x); +double first_deriv_sigmoid(const double x); +double relu(const double x); +double leaky_relu(const double x); + +double clipping(const double x, const double lbound, const double ubound); +void PwlApply16(intel_dnn_component_t *component, const uint32_t num_subset_size); +void PwlApply16(intel_dnn_component_t *component, + const uint32_t num_row_start, + const uint32_t num_row_end, + const uint32_t num_col_start, + const uint32_t num_col_end); +void PwlApply32(intel_dnn_component_t *component, const uint32_t num_subset_size); +void PwlApply32(intel_dnn_component_t *component, + const uint32_t num_row_start, + const uint32_t num_row_end, + const uint32_t num_col_start, + const uint32_t num_col_end); +void PwlDesign16(const DnnActivation activation_type, + intel_pwl_segment_t *ptr_segment, + const uint32_t num_segments, + const float scale_in, + const float scale_out); +void PwlDesignOpt16(const DnnActivation activation_type, + std::vector &ptr_segment, + const float scale_in, + const float scale_out); diff --git a/inference-engine/src/gna_plugin/pwl_design.cpp b/inference-engine/src/gna_plugin/pwl_design.cpp new file mode 100644 index 00000000000000..1f325bac7fe4b9 --- /dev/null +++ b/inference-engine/src/gna_plugin/pwl_design.cpp @@ -0,0 +1,681 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "pwl.h" +#include "gna_plugin_log.hpp" +#include +#include +#include + +#define FLOAT_TO_INT16(a) static_cast(((a) < 0)?((a) - 0.5):((a) + 0.5)) +#define FLOAT_TO_INT32(a) static_cast(((a) < 0)?((a)-0.5):((a)+0.5)) +#ifdef _NO_MKL_ +#include +#include
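// Editorial annotation, not part of the original patch: when _NO_MKL_ is defined, the SCOPY,
// SSCAL and TANH macros below are plain scalar stand-ins for MKL's scopy/sscal/vsTanh, so the
// PWL design code builds without an MKL dependency. The FLOAT_TO_INT16/FLOAT_TO_INT32 macros
// above round half away from zero; a minimal sketch of that rounding rule (hypothetical helper,
// for illustration only, not used by the plugin):
static inline int16_t float_to_int16_round(float a) {
    // same convention as FLOAT_TO_INT16: 2.5f -> 3, -2.5f -> -3
    return static_cast<int16_t>(a < 0.0f ? a - 0.5f : a + 0.5f);
}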
+#define SCOPY(num, in, inci, out, inco) for (int i_ = 0; i_ < *(num); i_++) *(out + i_ * *(inco)) = *(in + i_ * *(inci)); +#define SSCAL(num, scale, inout, inco) for (int i_ = 0; i_ < *(num); i_++) *(inout + i_ * *(inco)) = *(scale) * *(inout + i_ * *(inco)); +#define TANH(num, in, out) for (int i_ = 0; i_ < num; i_++) *(out+i_) = tanh(*(in+i_)) +#else +#include +#define SCOPY(num, in, incx, out, incy) scopy(num, in, incx, out, incy) +#define SSCAL(num, scale, inout, incx) sscal(num, scale, inout, incx) +#define TANH(num, in, out) vsTanh(num, in, out) +#endif + +double first_deriv_tanh(const double x) { return(1.0 - tanh(x) * tanh(x)); } + +double sigmoid(const double x) { return(0.5 * (1.0 + tanh(x / 2))); } +double first_deriv_sigmoid(const double x) { return(sigmoid(x) * (1.0 - sigmoid(x))); } +double relu(const double x) { if (x < 0) { return(0.0); } else { return(x); } } +double leaky_relu(const double x) { if (x < 0.0) { return(LEAKYRELU_SLOPE*x); } else { return(x); } } +double clipping(const double x, const double lbound, const double ubound) { return((x < lbound)?lbound:((x > ubound)?ubound:x)); } + +double pivot_search(std::vector& result, double(*f)(const double), + double(*first_deriv_f)(const double), + const uint32_t N, + const double alpha_0, + const double alpha_N, + const double threshold, + const bool negative) { + std::vector> t(N + 1); + std::vector> alpha(N + 1); + std::vector> epsilon(N + 1); + std::vector> d(N + 1); + bool same_epsilon = false; + double Delta; + double epsilon_final = 0.0; + double max_epsilon = 0.0; + double max_epsilon_prev; + double min_epsilon; + double sgn = (negative) ? -1.0 : 1.0; + int j; + + if ( f == nullptr || + first_deriv_f == nullptr || + threshold < 0) { + return epsilon_final; + } + // Figure 4: Box #1 + j = 0; + Delta = 1.0; + + for (int i = 0; i < N; i++) { + t[i].push_back(alpha_0 + (static_cast((i + 1)) / static_cast((N + 1))) * (alpha_N - alpha_0)); + } + + while (true) { + // Figure 4: Box #2 + alpha[0].resize(j + 1); + alpha[0][j] = alpha_0; + for (int i = 1; i < N; i++) { + alpha[i].resize(j + 1); + alpha[i][j] = (f(t[i - 1][j]) - f(t[i][j]) + first_deriv_f(t[i][j]) * t[i][j] - first_deriv_f(t[i - 1][j]) * t[i - 1][j]) + / (first_deriv_f(t[i][j]) - first_deriv_f(t[i - 1][j])); + } + alpha[N].resize(j + 1); + alpha[N][j] = alpha_N; + + // Figure 4: Box #3 + for (int i = 0; i < N; i++) { + epsilon[i].resize(j + 1); + epsilon[i][j] = sgn * (first_deriv_f(t[i][j]) * (alpha[i][j] - t[i][j]) + f(t[i][j]) - f(alpha[i][j])); + } + epsilon[N].resize(j + 1); + epsilon[N][j] = sgn * (first_deriv_f(t[N - 1][j]) * (alpha[N][j] - t[N - 1][j]) + f(t[N - 1][j]) - f(alpha[N][j])); + + // Figure 4: Test for completion + max_epsilon_prev = max_epsilon; + max_epsilon = fabs(epsilon[0][j]); + min_epsilon = fabs(epsilon[0][j]); + for (int i = 1; i < N + 1; i++) { + if (fabs(epsilon[i][j]) > max_epsilon) max_epsilon = fabs(epsilon[i][j]); + if (fabs(epsilon[i][j]) < min_epsilon) min_epsilon = fabs(epsilon[i][j]); + } + if ((j == PWL_MAX_ITERATIONS) || (max_epsilon - min_epsilon < threshold * min_epsilon)) { + pwl_t value; + result.resize(0); + epsilon_final = (max_epsilon + min_epsilon) / 4.0; // Andrzej's modification + for (int i = 0; i < N; i++) { + double val, val_next; + value.t = t[i][j]; + value.alpha = alpha[i][j]; + val = sgn * first_deriv_f(value.t) * (value.alpha - value.t) + sgn * f(value.t) - epsilon_final; + val_next = sgn * first_deriv_f(value.t) * (alpha[i + 1][j] - value.t) + sgn * f(value.t) - epsilon_final; + value.beta = val; + 
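// Editorial annotation, not part of the original patch: val and val_next are the tangent of
// sgn*f at the pivot t_i, shifted down by epsilon_final and evaluated at the segment ends
// alpha_i and alpha_i+1, so the m and b computed next are simply that segment's slope and
// intercept; the emitted pwl_t entries together form the equal-error piecewise-linear fit
// that the "Figure 4" boxes above construct.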
value.m = (val_next - val) / (alpha[i + 1][j] - value.alpha); + value.b = (val - value.m * value.alpha); + result.push_back(value); + } + value.t = value.m = value.b = 0.0; + value.alpha = alpha[N][j]; + value.beta = sgn * first_deriv_f(t[N - 1][j]) * (alpha[N][j] - t[N - 1][j]) + sgn * f(t[N - 1][j]) - epsilon_final; + result.push_back(value); + if (j == PWL_MAX_ITERATIONS) { + std::cerr << "Error: failed to converge in pivot_search!" << std::endl; + } + return(epsilon_final); + } + + if (j > 0) { + if (max_epsilon > max_epsilon_prev) { + j = j - 1; + Delta = Delta / 2; + } else if (max_epsilon == max_epsilon_prev) { + if (!same_epsilon) { + same_epsilon = true; + } else { + j = j - 1; + Delta = Delta / 2; + same_epsilon = false; + } + } + } + + // Figure 4: Box #4 + for (int i = 0; i < N; i++) { + d[i].resize(j + 1); + d[i][j] = Delta * (epsilon[i + 1][j] - epsilon[i][j]) / + ((epsilon[i + 1][j] / (alpha[i + 1][j] - t[i][j])) + (epsilon[i][j] / (t[i][j] - alpha[i][j]))); + } + + // Figure 4: Box #5 + for (int i = 0; i < N; i++) { + t[i].resize(j + 2); + t[i][j + 1] = t[i][j] + d[i][j]; + } + t[N].resize(j + 2); + + j = j + 1; + } +} + +double calculate_error_pct(const DnnActivationType fun, + const double l_bound, + const double u_bound, + const double offset, + const int samples) { + double delta = (u_bound - l_bound) / (samples + 1); + double min_val = 0.0; + double max_val = 0.0; + + if ( delta < 0 ) { + return 0.0; + } + + switch (fun) { + case kActSigmoid: min_val = max_val = sigmoid(l_bound); break; + case kActTanh: min_val = max_val = tanh(l_bound); break; + } + + for (int i = 0; i < samples; i++) { + double arg = l_bound + i * delta; + double val = 0.0; + switch (fun) { + case kActSigmoid: val = sigmoid(arg); break; + case kActTanh: val = tanh(arg); break; + } + if (val > max_val) max_val = val; + if (val < min_val) min_val = val; + } + + return(100.0 * fabs(offset) / (max_val - min_val)); +} + +bool split_search(const DnnActivationType fun, + const double l_bound, + const double u_bound) { + bool is_split = false; + if (l_bound > u_bound) { + return is_split; + } + + switch (fun) { + case kActSigmoid: + case kActTanh: + if ((l_bound < 0.0) && (u_bound > 0.0)) { + is_split = true; + } + break; + default: + is_split = false; + } + return(is_split); +} + +inline std::vector negative_pwl(const std::vector& pwl) { + std::vector new_pwl; + new_pwl = pwl; + for (uint32_t i = 0; i < pwl.size(); i++) { + new_pwl[i].m = -pwl[i].m; + new_pwl[i].b = -pwl[i].b; + new_pwl[i].beta = -pwl[i].beta; + } + + return(new_pwl); +} + +std::vector pwl_search(const DnnActivationType fun, + const double l_bound, + const double u_bound, + const double threshold, + const double allowed_err_pct, + const int samples, + double& err_pct) { + std::vector pwl; + double err = 0.0; + int n_segments = 1; + + if (l_bound > u_bound || + threshold < 0) { + return pwl; + } + + if (split_search(fun, l_bound, u_bound)) { + std::vector pwl2; + double err_pct1 = 0.0, err_pct2 = 0.0; + + pwl = pwl_search(fun, l_bound, 0.0, threshold, allowed_err_pct, samples, err_pct1); + pwl = negative_pwl(pwl); + pwl2 = pwl_search(fun, 0.0, u_bound, threshold, allowed_err_pct, samples, err_pct2); + + // merge + pwl.pop_back(); // remove final alpha and beta from first half + pwl.insert(pwl.end(), pwl2.begin(), pwl2.end()); // concatenate the two halves + err_pct = (err_pct1 + err_pct2) / 2; // this is not quite correct but should give an indication + + } else { + if (fun == kActIdentity) { + pwl.resize(2); + pwl[0].alpha = pwl[0].t = 
pwl[0].beta = -std::numeric_limits::infinity(); + pwl[0].m = 1.0; + pwl[0].b = 0.0; + pwl[1].alpha = std::numeric_limits::infinity(); + pwl[1].beta = std::numeric_limits::infinity(); + + } else if (fun == kActKaldiLstmClipping) { + pwl.resize(4); + pwl[0].alpha = pwl[0].t = pwl[0].beta = -std::numeric_limits::infinity(); + pwl[0].m = 0.0; + pwl[0].b = pwl[0].beta = KALDI_LSTM_CLIP_LOWER; + pwl[1].alpha = pwl[0].t = pwl[1].beta = KALDI_LSTM_CLIP_LOWER; + pwl[1].m = 1.0; + pwl[1].b = 0.0; + pwl[2].alpha = pwl[0].t = pwl[1].beta = KALDI_LSTM_CLIP_UPPER; + pwl[2].m = 0.0; + pwl[2].b = KALDI_LSTM_CLIP_UPPER; + pwl[3].alpha = pwl[3].beta = std::numeric_limits::infinity(); + + } else { + bool negative = false; + + switch (fun) { + case kActSigmoid: + if (u_bound == 0) negative = true; // make left half convex + err = pivot_search(pwl, sigmoid, first_deriv_sigmoid, n_segments, l_bound, u_bound, threshold, negative); + break; + case kActTanh: + if (u_bound == 0) negative = true; // make left half convex + err = pivot_search(pwl, tanh, first_deriv_tanh, n_segments, l_bound, u_bound, threshold, negative); + break; + } + err_pct = calculate_error_pct(fun, l_bound, u_bound, err, samples); + + while ((n_segments < PWL_MAX_ITERATIONS) && (allowed_err_pct < err_pct)) { + n_segments += 1; + switch (fun) { + case kActSigmoid: + err = pivot_search(pwl, sigmoid, first_deriv_sigmoid, n_segments, l_bound, u_bound, threshold, negative); + break; + case kActTanh: + err = pivot_search(pwl, tanh, first_deriv_tanh, n_segments, l_bound, u_bound, threshold, negative); + break; + } + err_pct = calculate_error_pct(fun, l_bound, u_bound, err, samples); + } + + if (n_segments >= PWL_MAX_ITERATIONS) { + std::cerr << "Error: failed to converge in pwl_search!" << std::endl; + } + } + } + return(pwl); +} + +pwl_gna_slope_scale_t gna_slope(const double slope, + const double in_scale, + const double out_scale) { + pwl_gna_slope_scale_t s; + s.slope = slope* out_scale / in_scale; + + for (s.slope_scale_index = 3; s.slope_scale_index > 0; --s.slope_scale_index) { + s.slope_scale = static_cast(1) << (8 * (1 + s.slope_scale_index)); + if (((s.slope * s.slope_scale) <= std::numeric_limits::max()) && + ((s.slope * s.slope_scale) >= std::numeric_limits::min())) + break; + } + s.slope_scale = static_cast(1) << (8 * (1 + s.slope_scale_index)); + + return(s); +} + +void make_gna_pwl(const DnnActivation fun, + const std::vector& pwl, + const double l_bound, + const double u_bound, + const double in_scale, + const double out_scale, + std::vector &gna_pwl) { + pwl_gna_slope_scale_t s; + uint32_t pwl_size = static_cast(pwl.size()); + switch (fun) { + case kActSigmoid: + case kActTanh: { + auto n_segments = static_cast (pwl_size) + 1; + gna_pwl.resize(n_segments); + // insert extra segment for x values < l_bound + gna_pwl[0].xBase = static_cast (INT32_MIN & XBASEMASK); // zero out the 2 lsb + if (fun == kActSigmoid) { + gnalog() << "=========================== Sigmoid Segments ===========================\n"; + gna_pwl[0].yBase = gna_pwl[1].yBase = 0; + gna_pwl[1].xBase = (static_cast (in_scale * (-pwl[0].b / pwl[0].m))) & XBASEMASK; + } else { + gnalog() << "=========================== Tanh Segments ===========================\n"; + gna_pwl[0].yBase = gna_pwl[1].yBase = static_cast(-1.0 * out_scale); + gna_pwl[1].xBase = (static_cast (in_scale * (-1.0 - pwl[0].b) / pwl[0].m)) & XBASEMASK; + } + gna_pwl[0].slope = 0; + + gnalog() << (gna_pwl[0].xBase) / in_scale + << " " << (gna_pwl[0].yBase) / out_scale + << " " << 0.0 + << "\n"; + + s = 
gna_slope(pwl[0].m, in_scale, out_scale); + gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index; + + gnalog() << (gna_pwl[1].xBase/in_scale) + << " " << (gna_pwl[1].yBase) / out_scale + << " " << pwl[0].m + << "\n"; + + for (uint32_t i = 1; i < pwl_size - 1; ++i) { + s = gna_slope(pwl[i].m, in_scale, out_scale); + gna_pwl[i + 1].xBase = (static_cast (in_scale * pwl[i].alpha)) & XBASEMASK; + gna_pwl[i + 1].yBase = FLOAT_TO_INT16(pwl[i].beta * out_scale); + gna_pwl[i + 1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[i + 1].xBase = gna_pwl[i + 1].xBase | s.slope_scale_index; + + gnalog() << (pwl[i].alpha) + << " " << pwl[i].beta + << " " << pwl[i].m + << "\n"; + } + // insert extra segment for xvalues > u_bound + gna_pwl[n_segments - 1].xBase = + ((uint32_t) (in_scale * (1.0 - pwl[pwl_size - 2].b) / pwl[pwl_size - 2].m)) & XBASEMASK; + gna_pwl[n_segments - 1].yBase = FLOAT_TO_INT16(1.0 * out_scale); + gna_pwl[n_segments - 1].slope = 0; + + gnalog() << (gna_pwl[n_segments - 1].xBase / in_scale) + << " " << 1.0 + << " " << 0.0 + << "\n"; + break; + } + case kActRelu: + case kActLeakyRelu: { + auto n_segments = 2; + gna_pwl.resize(n_segments); + + gnalog() << "=========================== ReLU Segments ===========================\n"; + int32_t x_lower = INT32_MIN; + int16_t y_lower = INT16_MIN; + if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + if (y_lower < x_lower * out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); + gna_pwl[0].yBase = y_lower * fun.negative_slope; + s = gna_slope(fun.negative_slope, in_scale, out_scale); + gna_pwl[0].xBase = (x_lower & XBASEMASK) | s.slope_scale_index; // zero out the 2 lsb + gna_pwl[0].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + + gnalog() << gna_pwl[0].xBase / in_scale + << " " << gna_pwl[0].yBase / out_scale + << " " << (gna_pwl[0].slope * in_scale) / (out_scale*s.slope_scale) + << "\n"; + gna_pwl[1].xBase = 0; + gna_pwl[1].yBase = 0; + s = gna_slope(1.0, in_scale, out_scale); + gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index; + gnalog() << 0.0 + << " " << 0.0 + << " " << (gna_pwl[1].slope * in_scale) / (out_scale*s.slope_scale) + << "\n"; + break; + } + case kActIdentity: + case kActKaldiLstmClipping: { + int32_t x_lower = INT32_MIN; + int32_t x_upper = INT32_MAX; + int16_t y_lower = INT16_MIN; + int16_t y_upper = INT16_MAX; + auto n_segments = 2; + if (fun == kActKaldiLstmClipping) { + gnalog() << "=========================== Clipping Segments ===========================\n"; + if (x_lower < l_bound * in_scale) { + if (y_lower < l_bound * out_scale) { + x_lower = FLOAT_TO_INT32(l_bound * in_scale); + y_lower = FLOAT_TO_INT16(l_bound * out_scale); + } else { + x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + } + } + if (x_upper > u_bound * in_scale) { + if (y_upper > u_bound * out_scale) { + x_upper = FLOAT_TO_INT32(u_bound * in_scale); + y_upper = FLOAT_TO_INT16(u_bound * out_scale); + } else { + x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); + } + } + } else { + gnalog() << "=========================== Identity Segments ===========================\n"; + if (x_lower < y_lower * in_scale / out_scale) x_lower = FLOAT_TO_INT32(y_lower * in_scale / out_scale); + if (x_upper > y_upper * in_scale / out_scale) x_upper = FLOAT_TO_INT32(y_upper * in_scale / out_scale); + if (y_lower < x_lower * 
out_scale / in_scale) y_lower = FLOAT_TO_INT16(x_lower * out_scale / in_scale); + if (y_upper > x_upper * out_scale / in_scale) y_upper = FLOAT_TO_INT16(x_upper * out_scale / in_scale); + } + gna_pwl.resize(n_segments); + gna_pwl[0].xBase = INT32_MIN & XBASEMASK; // zero out the 2 lsb + gna_pwl[0].yBase = y_lower; + gna_pwl[0].slope = 0; + gnalog() << gna_pwl[0].xBase / in_scale + << " " << gna_pwl[0].yBase / out_scale + << " " << 0 + << "\n"; + gna_pwl[1].xBase = x_lower & XBASEMASK; // zero out the 2 lsb + gna_pwl[1].yBase = y_lower; + s = gna_slope(1.0, in_scale, out_scale); + gna_pwl[1].slope = FLOAT_TO_INT16(s.slope * s.slope_scale); + gna_pwl[1].xBase = gna_pwl[1].xBase | s.slope_scale_index; + gnalog() << gna_pwl[1].xBase / in_scale + << " " << gna_pwl[1].yBase / out_scale + << " " << 1.0 + << "\n"; + if (INT32_MAX > x_upper) { // need a right segment + gna_pwl.push_back({ + static_cast(x_upper & XBASEMASK), // zero out the 2 lsb + y_upper, + 0 }); + + gnalog() << gna_pwl[n_segments].xBase / in_scale + << " " << gna_pwl[n_segments].yBase / out_scale + << " " << 0 + << "\n"; + n_segments += 1; + } + break; + } + default: + gnalog() << "Unexpected function activation!\n"; + std::cerr << "Unexpected function activation!\n"; + } +} + +void PwlDesignOpt16(const DnnActivation activation_type, + std::vector &ptr_segment, + const float scale_in, + const float scale_out) { + std::vector pwl; + double err_pct = 0.0; + switch (activation_type) { + case kActSigmoid: + pwl = pwl_search(kActSigmoid, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, -SIGMOID_DOMAIN, SIGMOID_DOMAIN, scale_in, scale_out, ptr_segment); + break; + case kActTanh: + pwl = pwl_search(kActTanh, -TANH_DOMAIN, TANH_DOMAIN, PWL_DESIGN_THRESHOLD, PWL_MAX_ERR_PERCENT, PWL_DESIGN_SAMPLES, err_pct); + make_gna_pwl(activation_type, pwl, -TANH_DOMAIN, TANH_DOMAIN, scale_in, scale_out, ptr_segment); + break; + case kActRelu: + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + break; + case kActLeakyRelu: + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + break; + case kActIdentity: + make_gna_pwl(activation_type, pwl, -1.0, 1.0, scale_in, scale_out, ptr_segment); + break; + case kActKaldiLstmClipping: + make_gna_pwl(activation_type, pwl, KALDI_LSTM_CLIP_LOWER, KALDI_LSTM_CLIP_UPPER, scale_in, scale_out, ptr_segment); + break; + default: + break; + } +} + +void PwlDesign16(const DnnActivation activation_type, + intel_pwl_segment_t *ptr_segment, + const uint32_t num_segments, + const float scale_in, + const float scale_out) { + switch (activation_type) { + case kActSigmoid: + { + gnalog() << "=========================== Sigmoid Segments===========================\n"; + uint32_t num_segment_size = 0; + int32_t offset = 0; + ptr_segment[0].xBase = static_cast(INT32_MIN & XBASEMASK); // zero out the 2 lsb + num_segment_size = static_cast(SIGMOID_DOMAIN * scale_in / ((num_segments-2) / 2) + 0.5); + offset = -static_cast(num_segment_size * (num_segments-2) / 2); + for (uint32_t i = 1; i < num_segments; i++) { + ptr_segment[i].xBase = static_cast(offset & XBASEMASK); // zero out the 2 lsb + offset += num_segment_size; + } + for (uint32_t i = 0; i < num_segments; i++) { + int32_t xbase = static_cast(ptr_segment[i].xBase & XBASEMASK); + int32_t xbasenext = (i < num_segments-1) ? 
static_cast(ptr_segment[i+1].xBase & XBASEMASK) : INT32_MAX; + float floatarg = static_cast(xbase / (2 * scale_in)); + float floatargnext = static_cast(xbasenext / (2 * scale_in)); + float floatval, floatvalnext, slope; + TANH(1, &floatarg, &floatval); + floatval = 0.5f * (1.0f + floatval); + TANH(1, &floatargnext, &floatvalnext); + floatvalnext = 0.5f * (1.0f + floatvalnext); + slope = scale_out*(floatvalnext - floatval) / static_cast(xbasenext - xbase); + { + // find best scale factor + uint64_t slope_scale; + uint32_t slope_scale_index; + for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) { + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + if (((slope * slope_scale) <= 32767.0) && ((slope * slope_scale) >= -32768.0)) + break; + } + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + ptr_segment[i].slope = FLOAT_TO_INT16(slope * slope_scale); + + ptr_segment[i].xBase = ptr_segment[i].xBase | slope_scale_index; + } + ptr_segment[i].yBase = FLOAT_TO_INT16(floatval * scale_out); + gnalog() << (static_cast((ptr_segment[i].xBase & XBASEMASK))/scale_out) + << " " + << (static_cast((ptr_segment[i].yBase))/scale_out) + << " " + << (slope/scale_out) + << "\n"; + } + } + break; + case kActTanh: + { + gnalog() << "=========================== Tanh Segments===========================\n"; + uint32_t num_segment_size = 0; + int32_t offset = 0; + ptr_segment[0].xBase = static_cast(INT32_MIN & XBASEMASK); // zero out the 2 lsb + num_segment_size = static_cast(TANH_DOMAIN * scale_in / ((num_segments-2) / 2) + 0.5); + offset = -static_cast(num_segment_size * (num_segments-2) / 2); + for (uint32_t i = 1; i < num_segments; i++) { + ptr_segment[i].xBase = static_cast(offset & XBASEMASK); // zero out the 2 lsb + offset += num_segment_size; + } + for (uint32_t i = 0; i < num_segments; i++) { + int32_t xbase = static_cast(ptr_segment[i].xBase & XBASEMASK); + int32_t xbasenext = (i < num_segments-1) ? + static_cast(ptr_segment[i+1].xBase & XBASEMASK) : + INT32_MAX; + float floatarg = static_cast(xbase / scale_in); + float floatargnext = static_cast(xbasenext / scale_in); + float floatval, floatvalnext, slope; + TANH(1, &floatarg, &floatval); + TANH(1, &floatargnext, &floatvalnext); + slope = scale_out * (floatvalnext - floatval) / + static_cast(xbasenext - xbase); + { + // find best scale factor + uint64_t slope_scale; + uint32_t slope_scale_index; + for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) { + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + if (((slope * slope_scale) <= 32767.0) && ((slope * slope_scale) >= -32768.0)) + break; + } + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + ptr_segment[i].slope = FLOAT_TO_INT16(slope * slope_scale); + ptr_segment[i].xBase = ptr_segment[i].xBase | slope_scale_index; + } + ptr_segment[i].yBase = FLOAT_TO_INT16(floatval * scale_out); + gnalog() << (static_cast((ptr_segment[i].xBase & XBASEMASK))/scale_out) + << " " + << (static_cast((ptr_segment[i].yBase))/scale_out) + << " " + << (slope/scale_out) + << "\n"; + } + } + break; + case kActRelu: + std::cerr << "Rectilinear activation function design not yet implemented!" 
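// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Reference semantics for reading a table built as above: each segment is
// approximately y = yBase + slope * (x - xBase) / 2^(8*(1+index)), with the index
// recovered from the 2 LSBs of xBase. This is only a host-side mental model (the
// real evaluation happens in the GNA library/hardware); PwlSeg and eval_pwl are
// names invented for the example.
#include <cstdint>
#include <vector>

struct PwlSeg { int32_t xBase; int16_t yBase; int16_t slope; };

inline int32_t eval_pwl(const std::vector<PwlSeg>& segs, int32_t x) {
    // Segments are sorted by xBase; pick the last one whose base is <= x.
    std::size_t i = 0;
    while (i + 1 < segs.size() && (segs[i + 1].xBase & ~3) <= x) ++i;
    const int32_t  xb    = segs[i].xBase & ~3;                 // clear scale-index bits
    const uint32_t idx   = static_cast<uint32_t>(segs[i].xBase) & 3u;
    const int64_t  scale = 1ll << (8 * (1 + idx));             // slope was pre-multiplied by this
    return segs[i].yBase +
           static_cast<int32_t>((static_cast<int64_t>(x) - xb) * segs[i].slope / scale);
}
// --------------------------------------------------------------------------------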
<< std::endl; + throw -1; + break; + case kActIdentity: + case kActKaldiLstmClipping: // clipping of IDENTITY is more aggressive than Kaldi + { + float slope = 0.0; + int64_t x_lower_limit = static_cast((INT16_MIN / scale_out) * scale_in - 0.5); + int64_t x_upper_limit = static_cast((INT16_MAX / scale_out) * scale_in + 0.5); + int16_t y_lower_limit = INT16_MIN; + int16_t y_upper_limit = INT16_MAX; + if (activation_type == kActKaldiLstmClipping) + gnalog() << "=========================== Clipping Segments ===========================\n"; + else + gnalog() << "=========================== Identity Segments ===========================\n"; + if (x_lower_limit < INT32_MIN) { + std::cerr << "Warning: saturation in PwlDesign16! " << x_lower_limit << " < INT32_MIN"<< std::endl; + x_lower_limit = INT32_MIN; + y_lower_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MIN) - 0.5); + } + if (x_upper_limit > INT32_MAX) { + std::cerr << "Warning: saturation in PwlDesign16! " << x_upper_limit << " > INT32_MAX"<< std::endl; + x_upper_limit = INT32_MAX; + y_upper_limit = static_cast((scale_out / scale_in)*static_cast(INT32_MAX) + 0.5); + } + slope = + static_cast(static_cast(y_upper_limit) - static_cast(y_lower_limit)) / + static_cast(static_cast(x_upper_limit) - static_cast(x_lower_limit)); + ptr_segment[0].xBase = static_cast(INT32_MIN & XBASEMASK); // zero out the 2 lsb + ptr_segment[0].yBase = y_lower_limit; + ptr_segment[0].slope = 0; + + gnalog() << ptr_segment[0].xBase / scale_in + << " " << ptr_segment[0].yBase / scale_out + << " " << 0 + << "\n"; + + ptr_segment[1].xBase = static_cast(x_lower_limit & XBASEMASK); + ptr_segment[1].yBase = y_lower_limit; + { + // find best scale factor + uint64_t slope_scale = 0; + uint32_t slope_scale_index = 0; + for (slope_scale_index = 3; slope_scale_index > 0; slope_scale_index--) { + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + if (((slope * slope_scale) <= std::numeric_limits::max()) && + ((slope * slope_scale) >= std::numeric_limits::min())) + break; + } + slope_scale = static_cast(1) << (8 * (1 + slope_scale_index)); + ptr_segment[1].slope = FLOAT_TO_INT16(slope * slope_scale); + ptr_segment[1].xBase = ptr_segment[1].xBase | slope_scale_index; + } + ptr_segment[2].xBase = static_cast(x_upper_limit & XBASEMASK); + ptr_segment[2].yBase = y_upper_limit; + ptr_segment[2].slope = 0; + } + break; + default: + fprintf(stderr, "Activation function design for %s not yet implemented!\n", intel_dnn_activation_name[activation_type]); + throw -1; + } +} diff --git a/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp new file mode 100644 index 00000000000000..6c42d9255bec58 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp @@ -0,0 +1,488 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include +#include "ie_layers.h" +#include "quantized_layer_params.hpp" +#include "quantization.h" +#include "details/caseless.hpp" +#include "graph_tools.hpp" +#include "blob_factory.hpp" +#include "precision_ex.hpp" +#include "pwl.h" +#include "gna_layer_info.hpp" + +namespace GNAPluginNS { +namespace details { + +/** + * @brief description of quantisation precision + * @tparam Ip - input precision + * @tparam Wp - weights precision + * @tparam Bp - biases precision + * @tparam Np - network precision - can be auto generated in future + */ +template +struct QuantDescTmpl { 
+ using WeightsPrecision = Wp; + using BiasesPrecision = Bp; + + InferenceEngine::TPrecision _Ip; + InferenceEngine::TPrecision _Op; + InferenceEngine::TPrecision _Wp; + InferenceEngine::TPrecision _Bp; + InferenceEngine::TPrecision _Np; + + QuantDescTmpl() = default; + QuantDescTmpl(InferenceEngine::TPrecision _Ip, + InferenceEngine::TPrecision _Op, + InferenceEngine::TPrecision _Wp, + InferenceEngine::TPrecision _Bp, + InferenceEngine::TPrecision _Np) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp), _Np(_Np) { + } + + InferenceEngine::Precision getInputPrecision() const { + return _Ip; + } + InferenceEngine::Precision getWeightsPrecision() const { + return _Wp; + } + InferenceEngine::Precision getBiasesPrecision() const { + return _Bp; + } + InferenceEngine::Precision getNetPrecision() const { + return _Np; + } + InferenceEngine::Precision getOutputPrecision() const { + return _Op; + } +}; + +#define P_TYPE(X)\ +typename InferenceEngine::PrecisionTrait::value_type + +#define PRECISION_TYPE(A, B, C, D, E)\ + P_TYPE(A), P_TYPE(B), P_TYPE(C), P_TYPE(D), P_TYPE(E) + + +struct QuantI16 : public QuantDescTmpl { + QuantI16() { + _Np = InferenceEngine::Precision::MIXED; + } +}; +struct QuantI8 : public QuantDescTmpl { + QuantI8() { + _Np = InferenceEngine::Precision::MIXED; + } +}; + +template +struct QuantPair { + using MandatoryType = A; + using OptionalType = B; + static A mandatory () { return A();} + static B optional () { return B();} +}; + +/** + * @brief should allocated blob for specific data type, in case of src blob is nullptr + * @tparam T + * @return + */ +template +inline bool shouldAlwaysAllocate() { + return false; +} + +template <> +inline bool shouldAlwaysAllocate() { + return true; +} + + +#undef P_TYPE +#undef PRECISION_TYPE + +/** + * @brief designate actual data quantisation functions trait + */ +template +class Quant { + public: + template + void operator()(Args && ... args) const { } +}; + +template<> +class Quant { + public: + template + void operator()(Args && ... args) const { + QuantizeAffine16(std::forward(args)...); + } +}; + +template<> +class Quant { + public: + template + void operator()(Args && ... 
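// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The Quant<Desc> functor above is a compile-time dispatcher: the quantisation
// descriptor selects the specialisation, which perfectly forwards its arguments to
// the precision-specific routine (QuantizeAffine16 or QuantizeAffine8). The toy
// types and FakeQuantize16/8 below are stand-ins used only to keep this example
// self-contained.
#include <cstdio>
#include <utility>

struct Int16Desc {};
struct Int8Desc  {};

inline void FakeQuantize16(int v) { std::printf("16-bit path: %d\n", v); }
inline void FakeQuantize8 (int v) { std::printf("8-bit path:  %d\n", v); }

template <class Desc>
struct QuantFn;                                  // primary template intentionally undefined

template <>
struct QuantFn<Int16Desc> {
    template <class... Args>
    void operator()(Args&&... args) const { FakeQuantize16(std::forward<Args>(args)...); }
};

template <>
struct QuantFn<Int8Desc> {
    template <class... Args>
    void operator()(Args&&... args) const { FakeQuantize8(std::forward<Args>(args)...); }
};

// Usage: QuantFn<Int16Desc>{}(42);  // resolves to the 16-bit routine at compile time
// --------------------------------------------------------------------------------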
args) const { + QuantizeAffine8(std::forward(args)...); + } +}; + +template +inline void quantizeWeightsBiases(const QuantDesc & quantDesc, + InferenceEngine::WeightableLayer *wl, + const QuantFunc &fnc, + bool isDiagonal = false) { // for diagonal layer number of weights and biases significatly smaller + // for quantized weights + auto intWeights = + make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({wl->_weights->size()})); + intWeights->allocate(); + if (intWeights->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy weights for layer :"<< wl->name << " of size" << intWeights->byteSize(); + } + + + auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) { + if (wl->_biases) { + return wl->_biases->size(); + } + // calculating biases len using weight dims + auto & dims = wl->outData.front()->getDims(); + return dims[1]; + }; + + using BiasesPrecision = typename QuantDesc::BiasesPrecision; + auto biasMaker = [&] () { + InferenceEngine::Blob::Ptr zero; + if (!wl->_biases && !shouldAlwaysAllocate()) { + return zero; + } + auto bias = make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({ + getBiasSizeForLayer(wl) + })); + bias->allocate(); + if (bias->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy bias for layer :"<< wl->name <<"of size" << bias->byteSize(); + } + + memset(bias->buffer(), 0, bias->byteSize()); + + return bias; + }; + auto intBiases = biasMaker(); + + float input_scale_factor = 1.f; + if (InferenceEngine::CNNNetHasPrevLayer(wl)) { + auto quantDataForInputLayer = + InferenceEngine::getInjectedData(*InferenceEngine::CNNNetPrevLayer(wl).get()); + input_scale_factor = quantDataForInputLayer->_dst_quant.scale; + if (std::isnan(input_scale_factor) || + std::isinf(input_scale_factor)) { + THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor; + } + } + if (wl->outData[0]->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported output dims size for " << wl->name <<", should be > 1, but " << wl->outData[0]->getDims().size(); + } + if (wl->insData[0].lock().get()->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported input dims size for " << wl->name << ", should be > 1, but " << wl->insData[0].lock().get()->getDims().size(); + } + uint32_t num_rows = isDiagonal ? 1 : wl->outData[0]->getDims()[1]; + uint32_t num_columns = wl->insData[0].lock().get()->getDims()[1]; + + if (isDiagonal) { + std::swap(num_rows, num_columns); + } + + uint32_t num_rows_padded = num_rows; + uint32_t num_columns_padded = num_columns; + + // TODO: replace this into fixed scale quantizer then + + auto quantData = InferenceEngine::getInjectedData(*wl); + { + fnc(wl->_weights->buffer().as(), + wl->_biases ? wl->_biases->buffer().as() : nullptr, + intWeights->buffer(), + intBiases ? 
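// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Geometry choice made by quantizeWeightsBiases above: for a dense affine layer
// rows = output channels and columns = input channels, while a "diagonal"
// (element-wise) layer has one weight per channel, modelled as a single row that
// is then swapped. WeightGeom/affine_geometry are names invented for the example.
#include <cstdint>
#include <utility>

struct WeightGeom { uint32_t rows; uint32_t cols; };

inline WeightGeom affine_geometry(uint32_t out_ch, uint32_t in_ch, bool is_diagonal) {
    uint32_t rows = is_diagonal ? 1u : out_ch;
    uint32_t cols = in_ch;
    if (is_diagonal) std::swap(rows, cols);      // mirrors the swap in quantizeWeightsBiases
    return { rows, cols };
}
// --------------------------------------------------------------------------------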
intBiases->buffer() : static_cast(nullptr), + input_scale_factor, + &quantData->_weights_quant.scale, + &quantData->_dst_quant.scale, + num_rows, + num_columns, + num_rows_padded, + num_columns_padded); + } + wl->_weights = intWeights; + wl->_biases = intBiases; + + /** + * correcting precision for outdata + */ + wl->precision = quantDesc.getWeightsPrecision(); + for (auto &&outData : wl->outData) { + outData->setPrecision(quantDesc.getOutputPrecision()); + } +} + + +template +inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc, + InferenceEngine::WeightableLayer *conv, + const QuantFunc &fnc) { + // for quantized weights + auto intWeights = make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({conv->_weights->size()})); + intWeights->allocate(); + if (intWeights->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy weights for layer :"<< conv->name << " of size" << intWeights->byteSize(); + } + + + auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) { + if (wl->_biases) { + return wl->_biases->size(); + } + // calculating biases len using weight dims + auto & dims = wl->outData.front()->getDims(); + return dims[1]; + }; + + using BiasesPrecision = typename QuantDesc::BiasesPrecision; + auto biasMaker = [&] () { + InferenceEngine::Blob::Ptr zero; + if (!conv->_biases && !shouldAlwaysAllocate()) { + return zero; + } + auto bias = make_custom_blob(InferenceEngine::C, InferenceEngine::SizeVector({ + getBiasSizeForLayer(conv) + })); + bias->allocate(); + if (bias->buffer() == nullptr) { + THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED + << "cannot copy bias for layer :"<< conv->name <<"of size" << bias->byteSize(); + } + memset(bias->buffer(), 0, bias->byteSize()); + + return bias; + }; + auto intBiases = biasMaker(); + + float input_scale_factor = 1.f; + if (InferenceEngine::CNNNetHasPrevLayer(conv)) { + auto quantDataForInputLayer = + InferenceEngine::getInjectedData(*InferenceEngine::CNNNetPrevLayer(conv).get()); + input_scale_factor = quantDataForInputLayer->_dst_quant.scale; + if (std::isnan(input_scale_factor) || + std::isinf(input_scale_factor)) { + THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor; + } + } + if (conv->outData[0]->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported output dims size for " << conv->name <<", should be > 1, but " << conv->outData[0]->getDims().size(); + } + if (conv->insData[0].lock().get()->getDims().size() < 2) { + THROW_IE_EXCEPTION << "Unsupported input dims size for " << conv->name << ", should be > 1, but " << conv->insData[0].lock().get()->getDims().size(); + } + auto inputData = conv->insData[0].lock(); + + uint32_t num_rows = getBiasSizeForLayer(conv); + uint32_t num_columns = conv->_weights->size() / num_rows; + + uint32_t num_rows_padded = num_rows; + uint32_t num_columns_padded = num_columns; + + // TODO: replace this into fixed scale quantizer then + + auto quantData = InferenceEngine::getInjectedData(*conv); + { + fnc(conv->_weights->buffer().as(), + conv->_biases ? conv->_biases->buffer().as() : nullptr, + intWeights->buffer(), + intBiases ? 
intBiases->buffer() : static_cast(nullptr), + input_scale_factor, + &quantData->_weights_quant.scale, + &quantData->_dst_quant.scale, + num_rows, + num_columns, + num_rows_padded, + num_columns_padded); + } + conv->_weights = intWeights; + conv->_biases = intBiases; + + /** + * correcting precision for outdata + */ + conv->precision = quantDesc.getWeightsPrecision(); + for (auto &&outData : conv->outData) { + outData->setPrecision(quantDesc.getOutputPrecision()); + } +} + + +class DataQuantizerBase { + public: + explicit DataQuantizerBase(float scaleFactor) : scaleFactor(scaleFactor) { + } + protected: + float scaleFactor = 1.0; +}; +/** + * Helper class to use partial specialisation of Layer type + * @tparam Desc + * @tparam Layer + */ +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(Layer cnnLayer) const { + return false; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + + bool operator()(InferenceEngine::CNNLayer *cnnLayer) const { + for (auto &&outData : cnnLayer->outData) { + outData->setPrecision(Desc::mandatory().getOutputPrecision()); + } + // set scale factor for input layers + auto quantData = InferenceEngine::getInjectedData(*cnnLayer); + if (cnnLayer->insData.empty()) { + for (auto &&outData : cnnLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + } else { + if (LayerInfo(*cnnLayer).isActivation() || + LayerInfo(*cnnLayer).isCopy()) { + // precision of activation layers is always equal input precision + for (auto &&outData : cnnLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + } + } + cnnLayer->precision = Desc::mandatory().getInputPrecision(); + + return true; + } +}; + + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::SplitLayer *splitLayer) const { + base::operator()(splitLayer); + // split layer doesnt change it's data at all + for (auto &&outData : splitLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::ConcatLayer *concatLayer) const { + base::operator()(concatLayer); + for (auto &&outData : concatLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::CropLayer *cropLayer) const { + base::operator()(cropLayer); + for (auto &&outData : cropLayer->outData) { + outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizer { + using base = DataQuantizer; + public: + explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {} + bool operator()(InferenceEngine::ReshapeLayer *reshapeLayer) const { + base::operator()(reshapeLayer); + // reshape layer doesnt change it's data at all + for (auto &&outData : reshapeLayer->outData) { + 
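// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The DataQuantizer machinery above uses partial specialisation as a per-layer-type
// dispatch table: the generic functor reports "not handled", while specialisations
// for concrete layer types implement the precision fix-ups. The toy layer types and
// Handler template below only demonstrate the pattern.
#include <iostream>

struct AnyLayer   {};
struct SplitLayer {};

template <class Desc, class Layer>
struct Handler {
    bool operator()(Layer*) const { return false; }         // default: layer not handled
};

template <class Desc>
struct Handler<Desc, SplitLayer> {
    bool operator()(SplitLayer*) const {                     // split: precision pass-through
        std::cout << "split layer keeps input precision\n";
        return true;
    }
};

// Usage: SplitLayer s; bool handled = Handler<int, SplitLayer>{}(&s);  // -> true
// --------------------------------------------------------------------------------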
outData->setPrecision(Desc::mandatory().getInputPrecision()); + } + return true; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(InferenceEngine::WeightableLayer *wl) const { + quantizeWeightsBiases(Desc::mandatory(), wl, Quant()); + return true; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(InferenceEngine::WeightableLayer *wl) const { + quantizeWeightsBiasesConv(Desc::optional(), wl, Quant()); + return true; + } +}; + +template +class DataQuantizer : public DataQuantizerBase { + public: + explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + bool operator()(InferenceEngine::ScaleShiftLayer *wl) const { + quantizeWeightsBiases(Desc::optional(), wl, Quant(), true); + return true; + } +}; + +} // namespace details + +template +class LayersQuantizer : public details::DataQuantizerBase { + public: + explicit LayersQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {} + template + bool operator()(T input) const { + return details::DataQuantizer(scaleFactor)(input); + } +}; + +using QuantI16 = details::QuantPair; +using QuantI8 = details::QuantPair; + +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp new file mode 100644 index 00000000000000..797c87c9c71818 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/model_quantizer.hpp @@ -0,0 +1,78 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#pragma once +#include +#include "gna_plugin_config.hpp" +#include "layer_transform.hpp" +#include "graph_tools.hpp" +#include "details/ie_cnn_network_tools.h" +#include "layer_quantizer.hpp" +#include "scale_factor_calc.hpp" + +namespace GNAPluginNS { +/** + * Quantize entire cnn - network + * @tparam T - type trait for weights and biases + */ +template +class ModelQuantizer { + public: + CNNNetworkPtr quantize(InferenceEngine::ICNNNetwork &model, float scaleFactor) const { + return quantize(model, [](InferenceEngine::CNNNetPtr &){}, scaleFactor); + } + + template + CNNNetworkPtr quantize(InferenceEngine::ICNNNetwork &model, const PreQuantisationCb &cb, float scaleFactor) const { + auto visitor = [&](InferenceEngine::CNNLayerPtr lp) { + return InferenceEngine::injectData(lp); + }; + auto copiedNet = InferenceEngine::CNNNetCopy(model, visitor); + + // TODO: probably not the best way of using dynamic cast in order to transform Precision + // one of solution is to create not copyNet overloads, that accepts 2 functors, one for layer copy + // and another one for net copy + auto rawNet = dynamic_cast(copiedNet.get()); + rawNet->setPrecision(T::mandatory().getNetPrecision()); + + // allow client code to access copied topology, to avoid copies if user would like to chain quantisation with + // another preprocessing + cb(copiedNet); + + LayersQuantizer lc(scaleFactor); + auto sortedNewNet = InferenceEngine::details::CNNNetSortTopologically(*copiedNet.get()); + gnalog() << "Sorted layers: " << std::endl; + for (auto &&layer : sortedNewNet) { + gnalog() << layer->name << std::endl; + } + + // weights scale is a hint, not all weightable layer preserve it in all possible precisions + propagateScaleFactor(sortedNewNet, 
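// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Simplified model of the propagate-and-restart strategy used by
// propagateScaleFactor / ScaleFactorCalculator: walk the topologically sorted
// layers, and whenever a downstream layer has to change an upstream output scale,
// restart the walk from that upstream layer. ToyLayer, visit and propagate are
// invented names; the real code works on CNNLayerPtr and transformLayer.
#include <cstddef>
#include <vector>

struct ToyLayer { float out_scale = 1.f; };

// visit() returns the index to restart from, or -1 when the layer is already consistent.
inline void propagate(std::vector<ToyLayer>& layers,
                      int (*visit)(std::vector<ToyLayer>&, std::size_t)) {
    std::size_t i = 0;
    while (i < layers.size()) {
        const int restart = visit(layers, i);
        i = (restart >= 0) ? static_cast<std::size_t>(restart) : i + 1;
    }
}
// --------------------------------------------------------------------------------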
T::mandatory().getWeightsPrecision().size(), scaleFactor); + + // sorted order gives possibility for propagate quantisation along depended layers + for (auto &&layer : sortedNewNet) { + transformLayer(layer, lc); + } + + return copiedNet; + } + + private : + void propagateScaleFactor(std::vector & net, int weightsBytesSize, float scaleFactor) const { + ScaleFactorCalculator sf(net, weightsBytesSize, scaleFactor); + + while (!sf.allLayersProcessed()) { + for (auto &&layer : sf.getStartLayers()) { + transformLayer(layer, sf); + // transforming until we reached cases where output scale updated due to situation in downstream layer + if (sf.needToRestart()) { + break; + } + } + } + } +}; +} // namespace GNAPluginNS diff --git a/inference-engine/src/gna_plugin/quantization/precision_ex.hpp b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp new file mode 100644 index 00000000000000..798345e9821545 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/precision_ex.hpp @@ -0,0 +1,95 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ie_precision.hpp" + +namespace InferenceEngine { + +/** + * @brief reverse trait for getting some precision from it's underlined memory type + * this might not work for certain precisions : for Q78, U16 + * @tparam T + */ +template +struct precision_from_media { + static const Precision::ePrecision type = Precision::CUSTOM; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::FP32; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::FP16; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::I16; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::U8; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::I8; +}; + +template<> +struct precision_from_media { + static const Precision::ePrecision type = Precision::I32; +}; + +/** + * @brief container for storing both precision and it's underlined media type + * @tparam TMedia + */ +template +class TPrecision : public Precision { + public: + typedef TMedia MediaType; + TPrecision() : Precision(precision_from_media::type) {} + explicit TPrecision(const Precision & that) : Precision(that) {} + TPrecision & operator = (const Precision & that) { + Precision::operator=(that); + return *this; + } + explicit TPrecision(const Precision::ePrecision value) : Precision(value) {} +}; + +template TPrecision createTPrecision() { + TPrecision cnt(InferenceEngine::Precision::fromType()); + return cnt; +} + +template +TPrecision::value_type> createTPrecision() { + TPrecision::value_type> cnt(T); + return cnt; +} + + +// special case for Mixed, or undefined precisions +template <> +class TPrecision : public Precision { + public: + typedef void MediaType; + TPrecision() = default; + explicit TPrecision(const Precision & that) : Precision(that) {} + TPrecision & operator = (const Precision & that) { + Precision::operator=(that); + return *this; + } + explicit TPrecision(const Precision::ePrecision value) : Precision(value) {} +}; + + +} // namespace InferenceEngine \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/quantization/quantization.cpp b/inference-engine/src/gna_plugin/quantization/quantization.cpp new file mode 100644 index 00000000000000..457bff9afed336 --- /dev/null +++ 
b/inference-engine/src/gna_plugin/quantization/quantization.cpp @@ -0,0 +1,699 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include "quantization.h" + +void QuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float *ptr_weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + uint32_t num_saturate = 0; + + if (*ptr_weight_scale_factor == 1.0) { + // scale factor for weights is not calculated yet + float mean_weight = 0.0; + float mean_weight_squared = 0.0; + float max_weight = -1e20f; + float var_weight; + float mean_plus_2stdev; + + for (uint32_t i = 0; i < num_rows; i++) { + for (uint32_t j = 0; j < num_columns; j++) { + float weight = ptr_float_weights[i * num_columns + j]; + mean_weight += weight; + mean_weight_squared += weight * weight; + if (fabs(weight) > max_weight) { + max_weight = fabs(weight); + } + } + } + + mean_weight /= static_cast(num_rows * num_columns); + mean_weight_squared /= static_cast(num_rows * num_columns); + var_weight = mean_weight_squared - mean_weight * mean_weight; + mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight)); + + *ptr_weight_scale_factor = static_cast(MAX_VAL_2B_WEIGHT) / max_weight; + *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor; + } + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * *ptr_weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (value > 32767.0) { + *ptr_weight_16 = 32767; + num_saturate++; + } else if (value < -32768.0) { + *ptr_weight_16 = -32768; + num_saturate++; + } else { + *ptr_weight_16 = (int16_t) value; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_16 = 0; + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_16 = 0; + } + } + + // case for element wise layer + if (ptr_float_biases != nullptr && ptr_int_biases != nullptr) { + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
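// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The scalar rule QuantizeAffine16 applies to every weight above: scale, round half
// away from zero, then saturate to int16. When the weight scale is not supplied,
// it is derived as MAX_VAL_2B_WEIGHT (16384) divided by the largest weight
// magnitude. quantize_q15 is a name invented for this example.
#include <cstdint>

inline int16_t quantize_q15(float w, float scale, uint32_t* saturations = nullptr) {
    const float v = w * scale + (w > 0 ? 0.5f : -0.5f);      // round half away from zero
    if (v > 32767.f)  { if (saturations) ++*saturations; return 32767;  }
    if (v < -32768.f) { if (saturations) ++*saturations; return -32768; }
    return static_cast<int16_t>(v);
}

// Example: with scale = 16384 / max|w|, the largest-magnitude weight maps to +/-16384.
// --------------------------------------------------------------------------------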
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j] = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j] = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j] = (int32_t) value; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + ptr_int_biases[j] = 0; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine16()\n", + num_saturate, + num_rows * num_columns + num_rows); + } +} + +void FixedQuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + uint32_t num_saturate = 0; + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (value > 32767.0) { + *ptr_weight_16 = 32767; + num_saturate++; + } else if (value < -32768.0) { + *ptr_weight_16 = -32768; + num_saturate++; + } else { + *ptr_weight_16 = (int16_t) value; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + *ptr_weight_16 = 0; + } + } + + *ptr_output_scale_factor = input_scale_factor * weight_scale_factor; + + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j] = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j] = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j] = (int32_t) value; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + ptr_int_biases[j] = 0; + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in FixedQuantizeAffine16()\n", + num_saturate, + num_rows * num_columns + num_rows); + } +} + +float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements) { + float *ptr_float_feat = reinterpret_cast(ptr_float_memory); + float max = 0.0; + float scale_factor; + + for (size_t i = 0; i < num_elements; i++) { + if (fabs(ptr_float_feat[i]) > max) { + max = fabs(ptr_float_feat[i]); + } + } + + if (max == 0) { + scale_factor = 1.0; + } else { + scale_factor = target_max / max; + } + + return (scale_factor); +} + +float ScaleFactorForQuantization(std::vector> &input_vectors, float target_max) { + float max = 0.0; + float scale_factor; + uint32_t num_vectors = (uint32_t) input_vectors.size(); + + for (uint32_t i = 0; i < num_vectors; i++) { + float *ptr_float_feat = input_vectors[i].data(); + uint32_t num_elements = (uint32_t) input_vectors[i].size(); + for (uint32_t j = 0; i < num_elements; i++) { + if (fabs(ptr_float_feat[j]) > max) { + max = fabs(ptr_float_feat[j]); + } + } + } + + if (max == 0) { + scale_factor = 1.0; + } else { + scale_factor = target_max / max; + } + + return (scale_factor); +} + +float ScaleFactorForQuantization(std::vector> &input_vectors, + int index, + int num_group_size, + float target_max) { + float max = 0.0; + float scale_factor; + uint32_t start_index = (uint32_t) index; + uint32_t end_index = + (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index + + num_group_size); + + for (uint32_t i = start_index; i < end_index; i++) { + float *ptr_float_feat = input_vectors[i].data(); + uint32_t num_elements = (uint32_t) input_vectors[i].size(); + for (uint32_t j = 0; j < num_elements; j++) { + if (fabs(ptr_float_feat[j]) > max) { + max = fabs(ptr_float_feat[j]); + } + } + } + + if (max == 0) { + scale_factor = 1.0; + } else { + scale_factor = target_max / max; + } + + return (scale_factor); +} + +void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor) { + float *ptr_float_feat = reinterpret_cast(ptr_float_memory); + uint32_t num_saturate = 0; + + int16_t *ptr_int_feat = reinterpret_cast(ptr_int_memory); + for (uint32_t i = 0; i < num_elements; i++) { + float rounding_value = (ptr_float_feat[i] > 0) ? 
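// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The rule shared by all ScaleFactorForQuantization overloads above: the scale
// factor maps the largest observed magnitude onto target_max (e.g. 16384 for int16
// features), and an all-zero input falls back to 1.0. scale_for is an invented
// name for a flat-array version of the same computation.
#include <algorithm>
#include <cmath>
#include <cstddef>

inline float scale_for(const float* data, std::size_t n, float target_max) {
    float max_abs = 0.f;
    for (std::size_t i = 0; i < n; ++i)
        max_abs = std::max(max_abs, std::fabs(data[i]));
    return (max_abs == 0.f) ? 1.f : target_max / max_abs;
}
// --------------------------------------------------------------------------------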
0.5f : -0.5f; + float value = ptr_float_feat[i] * scale_factor + rounding_value; + if (value > 32767.0) { + ptr_int_feat[i] = 32767; + num_saturate++; + } else if (value < -32768.0) { + ptr_int_feat[i] = -32768; + num_saturate++; + } else { + ptr_int_feat[i] = (int16_t) value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations during QuantizeVector16()\n", num_saturate, num_elements); + } +} + +void QuantizeVector16(std::vector> &input_vectors, + int16_t *ptr_int_memory, + uint32_t index, + uint32_t num_group_size, + float scale_factor) { + int16_t *ptr_int_feat = reinterpret_cast (ptr_int_memory); + uint32_t num_saturate = 0; + uint32_t num_elements = (uint32_t) input_vectors[0].size(); // assume all vector are same size + uint32_t start_index = (uint32_t) index; + uint32_t end_index = + (uint32_t) ((index + num_group_size > input_vectors.size()) ? input_vectors.size() - 1 : start_index + + num_group_size); + + if (end_index - start_index < num_group_size) { + memset(ptr_int_feat, 0, num_elements * num_group_size * sizeof(int16_t)); // for zero padding partial group + } + for (uint32_t j = start_index; j < end_index; j++) { + for (uint32_t i = 0; i < num_elements; i++) { + float *ptr_float_feat = input_vectors[j].data(); + float rounding_value = (ptr_float_feat[i] > 0) ? 0.5f : -0.5f; + float value = ptr_float_feat[i] * scale_factor + rounding_value; + if (value > 32767.0) { + ptr_int_feat[i * num_group_size + j - start_index] = 32767; + num_saturate++; + } else if (value < -32768.0) { + ptr_int_feat[i * num_group_size + j - start_index] = -32768; + num_saturate++; + } else { + ptr_int_feat[i * num_group_size + j - start_index] = (int16_t) value; + } + } + } + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations during QuantizeVector16()\n", + num_saturate, + num_elements * num_group_size); + } +} + +void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor) { + uint32_t num_saturate = 0; + + int16_t *ptr_int_feat = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < num_elements; i++) { + float float_value = ptr_int_feat[i] / prev_scale_factor; + float rounding_value = (float_value > 0) ? 0.5f : -0.5f; + float value = float_value * scale_factor + rounding_value; + if (value > 32767.0) { + ptr_int_feat[i] = 32767; + num_saturate++; + } else if (value < -32768.0) { + ptr_int_feat[i] = -32768; + num_saturate++; + } else { + ptr_int_feat[i] = (int16_t) value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations during ReQuantizeVector16()\n", num_saturate, num_elements); + } +} + +void QuantizeBias16(float *ptr_float_biases, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows) { + uint32_t num_saturate = 0; + + *ptr_output_scale_factor = input_scale_factor * weight_scale_factor; + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
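// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// ReQuantizeVector16 above rescales values that are already stored as int16: divide
// by the previous scale to get back to real units, then quantise again under the
// new scale with the same rounding and saturation rules. requantize_q15 is an
// invented single-value version of that loop.
#include <cstdint>

inline int16_t requantize_q15(int16_t v, float prev_scale, float new_scale) {
    const float f      = v / prev_scale;                               // back to real units
    const float scaled = f * new_scale + (f > 0 ? 0.5f : -0.5f);       // round half away from zero
    if (scaled > 32767.f)  return 32767;
    if (scaled < -32768.f) return -32768;
    return static_cast<int16_t>(scaled);
}
// --------------------------------------------------------------------------------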
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j] = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j] = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j] = (int32_t) value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeBias16()\n", num_saturate, num_rows); + } +} + +void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector &float_vector, float scale_factor) { + int16_t *int16_vector = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < float_vector.size(); i++) { + float_vector[i] = int16_vector[i] / scale_factor; + } +} + +void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector &float_vector, float scale_factor) { + int32_t *int32_vector = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < float_vector.size(); i++) { + float_vector[i] = int32_vector[i] / scale_factor; + } +} + +void DeQuantizeVector32(int32_t *ptr_int_memory, + std::vector &float_vector, + uint32_t index, + uint32_t num_group_size, + float scale_factor) { + int32_t *int32_vector = reinterpret_cast (ptr_int_memory); + for (uint32_t i = 0; i < float_vector.size(); i++) { + float_vector[i] = int32_vector[i * num_group_size + index] / scale_factor; + } +} +bool IntegrityCheckAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float weight_scale_factor, + float output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + bool model_ok = true; + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + int16_t int_value; + if (value > 32767.0) { + int_value = 32767; + } else if (value < -32768.0) { + int_value = -32768; + } else { + int_value = (int16_t) value; + } + if (int_value != *ptr_weight_16) { + model_ok = false; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
0.5f : -0.5f; + float value = ptr_float_biases[j] * output_scale_factor + rounding_value; + int32_t int_value; + if (value > 2147483647.0) { + int_value = 2147483647L; + } else if (value < -2147483648.0) { + int_value = -2147483648LL; + } else { + int_value = (int32_t) value; + } + if (int_value != ptr_int_biases[j]) { + model_ok = false; + } + } + for (uint32_t j = num_rows; j < num_rows_padded; j++) { + if (ptr_int_biases[j] != 0) { + model_ok = false; + } + } + + return (model_ok); +} + +bool IntegrityCheckAffineWeights16(float *ptr_float_weights, + int16_t *ptr_int_weights, + float weight_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded) { + bool model_ok = true; + + for (uint32_t row = 0; row < num_rows; row++) { + for (uint32_t col = 0; col < num_columns; col++) { + float rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + float value = ptr_float_weights[row * num_columns + col] * weight_scale_factor + rounding_value; + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + int16_t int_value; + if (value > 32767.0) { + int_value = 32767; + } else if (value < -32768.0) { + int_value = -32768; + } else { + int_value = (int16_t) value; + } + if (int_value != *ptr_weight_16) { + model_ok = false; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int16_t *ptr_weight_16 = ptr_int_weights + (row * num_columns_padded + col); + if (*ptr_weight_16 != 0) { + model_ok = false; + } + } + } + + return (model_ok); +} + + +void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, + int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float input_scale_factor, float *ptr_weight_scale_factor, + float *ptr_output_scale_factor, uint32_t num_rows, uint32_t num_columns, + uint32_t num_rows_padded, uint32_t num_columns_padded) { + uint32_t num_saturate = 0; + + if (*ptr_weight_scale_factor == 1.0) { + // scale factor for weights is not calculated yet + float mean_weight = 0.0; + float mean_weight_squared = 0.0; + float max_weight = -1e20f; + float var_weight; + float mean_plus_2stdev; + + for (uint32_t i = 0; i < num_rows; i++) { + for (uint32_t j = 0; j < num_columns; j++) { + float weight = ptr_float_weights[i*num_columns + j]; + mean_weight += weight; + mean_weight_squared += weight * weight; + if (fabs(weight) > max_weight) { + max_weight = fabs(weight); + } + } + } + + mean_weight /= static_cast(num_rows * num_columns); + mean_weight_squared /= static_cast(num_rows * num_columns); + var_weight = mean_weight_squared - mean_weight * mean_weight; + mean_plus_2stdev = mean_weight + 2.0f * static_cast(sqrtf(var_weight)); + + *ptr_weight_scale_factor = static_cast(MAX_VAL_1B_WEIGHT) / max_weight; + + // For 8 bit weights quantize as follows: + // 1. adjust scale factor to increase dynamic range of entire matrix by max multiplier + // 2. find maximum scaled weight for each row + // 3. find multiplier such that dividing by the multiplier brings row back within 8-bit dynamic range + // 4. 
quantize and store scaled row + *ptr_weight_scale_factor = MAX_OUT_MULTIPLIER * *ptr_weight_scale_factor; // increase dynamic range by max multiplier + *ptr_output_scale_factor = input_scale_factor * *ptr_weight_scale_factor; + } + float valueAcc = 0.0; + for (uint32_t row = 0; row < num_rows; row++) { + float scaled_row_max = 0; + float rounding_value, value; + for (uint32_t col = 0; col < num_columns; col++) { + value = ptr_float_weights[row*num_columns + col] * *ptr_weight_scale_factor; + valueAcc += value; + if (fabs(value) > scaled_row_max) { + scaled_row_max = fabs(value); + } + } + + value = scaled_row_max / static_cast(MAX_VAL_1B_WEIGHT); + ptr_int_biases[row].multiplier = (uint8_t) (value + 0.5); + for (uint32_t col = 0; col < num_columns; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + rounding_value = (ptr_float_weights[row * num_columns + col] > 0) ? 0.5f : -0.5f; + + + value = ptr_float_weights[row*num_columns + col] * (*ptr_weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value; + if (value > 127.0) { + *ptr_weight_8 = 127; + num_saturate++; + } else if (value < -128.0) { + *ptr_weight_8 = -128; + num_saturate++; + } else { + *ptr_weight_8 = (int8_t)value; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + *ptr_weight_8 = 0; + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + *ptr_weight_8 = 0; + } + ptr_int_biases[row].multiplier = 0; + } + + // bias value of the bas will be only used when input bias provided + if (ptr_float_biases != nullptr) { + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j].bias = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j].bias = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j].bias = (int32_t) value; + } + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeAffine8()\n", num_saturate, num_rows * num_columns + num_rows); + } +} + + +void QuantizeBias8(float *ptr_float_biases, + intel_compound_bias_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, uint32_t num_rows) { + uint32_t num_saturate = 0; + + *ptr_output_scale_factor = input_scale_factor * weight_scale_factor; + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 
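// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// The per-row 8-bit scheme used by QuantizeAffine8 above: the global weight scale
// is first inflated by MAX_OUT_MULTIPLIER, then each row receives an output
// multiplier that brings its scaled maximum back into int8 range, and the row is
// quantised with scale / multiplier. RowQuant stands in for the multiplier field of
// intel_compound_bias_t; the zero-multiplier guard is an addition for this sketch.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

struct RowQuant { uint8_t multiplier; std::vector<int8_t> weights; };

inline RowQuant quantize_row_q7(const float* row, std::size_t n, float weight_scale) {
    float row_max = 0.f;
    for (std::size_t i = 0; i < n; ++i)
        row_max = std::max(row_max, std::fabs(row[i] * weight_scale));
    RowQuant out;
    out.multiplier = static_cast<uint8_t>(row_max / 127.f + 0.5f);   // MAX_VAL_1B_WEIGHT = 127
    if (out.multiplier == 0) out.multiplier = 1;                     // guard for near-zero rows
    out.weights.resize(n);
    for (std::size_t i = 0; i < n; ++i) {
        float v = row[i] * (weight_scale / out.multiplier) + (row[i] > 0 ? 0.5f : -0.5f);
        out.weights[i] = static_cast<int8_t>(std::max(-128.f, std::min(127.f, v)));
    }
    return out;
}
// --------------------------------------------------------------------------------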
0.5f : -0.5f; + float value = ptr_float_biases[j] * *ptr_output_scale_factor + rounding_value; + if (value > 2147483647.0) { + ptr_int_biases[j].bias = 2147483647L; + num_saturate++; + } else if (value < -2147483648.0) { + ptr_int_biases[j].bias = -2147483648LL; + num_saturate++; + } else { + ptr_int_biases[j].bias = (int32_t)value; + } + } + + if (num_saturate > 0) { + QUANTWARNING("Warning: %d / %d saturations in QuantizeBias8()\n", num_saturate, num_rows); + } +} + +bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns, + uint32_t num_rows_padded, uint32_t num_columns_padded) { + bool model_ok = true; + + for (uint32_t row = 0; row < num_rows; row++) { + float scaled_row_max = 0; + float rounding_value, value; + for (uint32_t col = 0; col < num_columns; col++) { + value = ptr_float_weights[row*num_columns + col] * weight_scale_factor; + if (fabs(value) > scaled_row_max) { + scaled_row_max = fabs(value); + } + } + value = scaled_row_max / static_cast(MAX_VAL_1B_WEIGHT); + if (ptr_int_biases[row].multiplier != (uint8_t)(value + 0.5)) { + model_ok = false; + } + for (uint32_t col = 0; col < num_columns; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + int8_t int_value; + rounding_value = (ptr_float_weights[row*num_columns + col] > 0) ? 0.5f : -0.5f; + value = ptr_float_weights[row*num_columns + col] * (weight_scale_factor / ptr_int_biases[row].multiplier) + rounding_value; + if (value > 127.0) { + int_value = 127; + } else if (value < -128.0) { + int_value = -128; + } else { + int_value = (int8_t)value; + } + if (int_value != *ptr_weight_8) { + model_ok = false; + } + } + for (uint32_t col = num_columns; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + if (*ptr_weight_8 != 0) { + model_ok = false; + } + } + } + for (uint32_t row = num_rows; row < num_rows_padded; row++) { + for (uint32_t col = 0; col < num_columns_padded; col++) { + int8_t *ptr_weight_8 = ptr_int_weights + (row*num_columns_padded + col); + if (*ptr_weight_8 != 0) { + model_ok = false; + } + } + if (ptr_int_biases[row].multiplier != 0) { + model_ok = false; + } + } + + for (uint32_t j = 0; j < num_rows; j++) { + float rounding_value = (ptr_float_biases[j] > 0) ? 0.5f : -0.5f; + float value = ptr_float_biases[j] * output_scale_factor + rounding_value; + int32_t int_value; + if (value > 2147483647.0) { + int_value = 2147483647L; + } else if (value < -2147483648.0) { + int_value = -2147483648LL; + } else { + int_value = (int32_t)value; + } + if (int_value != ptr_int_biases[j].bias) { + model_ok = false; + } + } + + return(model_ok); +} + diff --git a/inference-engine/src/gna_plugin/quantization/quantization.h b/inference-engine/src/gna_plugin/quantization/quantization.h new file mode 100644 index 00000000000000..bd1ff7b07146fe --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/quantization.h @@ -0,0 +1,100 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include +#include +#include +#include +#include + +#define MAX_OUT_MULTIPLIER 230 +#define MAX_VAL_1B_WEIGHT 127 +#define MAX_VAL_2B_WEIGHT 16384 +#define MAX_VAL_2B_FEAT 16384 +#ifdef DEBUG +#define QUANTWARNING(...) (fprintf(stderr, __VA_ARGS__)) +#else +#define QUANTWARNING(...) 
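// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Small worked example of how the constants declared above combine (the input
// magnitudes are made up): features scaled so max|x| -> MAX_VAL_2B_FEAT and weights
// scaled so max|w| -> MAX_VAL_2B_WEIGHT give an int32 accumulator/bias scale equal
// to the product of the two, which is exactly what the bias routines compute.
#include <cstdio>

inline void scale_example() {
    const float max_feat = 2.0f, max_weight = 0.5f;                 // assumed example values
    const float feat_scale   = 16384.f / max_feat;                  // MAX_VAL_2B_FEAT
    const float weight_scale = 16384.f / max_weight;                // MAX_VAL_2B_WEIGHT
    const float out_scale    = feat_scale * weight_scale;           // scale of int32 sums/biases
    std::printf("feat=%g weight=%g output=%g\n", feat_scale, weight_scale, out_scale);
}
// --------------------------------------------------------------------------------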
+#endif + +void QuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float *ptr_weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +void FixedQuantizeAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +float ScaleFactorForQuantization(void *ptr_float_memory, float target_max, size_t num_elements); +float ScaleFactorForQuantization(std::vector> &input_vectors, float target_max); +float ScaleFactorForQuantization(std::vector> &input_vectors, + int index, + int num_group_size, + float target_max); +void QuantizeVector16(float *ptr_float_memory, int16_t *ptr_int_memory, uint32_t num_elements, float scale_factor); +void QuantizeVector16(std::vector> &input_vectors, + int16_t *ptr_int_memory, + uint32_t index, + uint32_t num_group_size, + float scale_factor); +void ReQuantizeVector16(int16_t *ptr_int_memory, uint32_t num_elements, float prev_scale_factor, float scale_factor); +bool IntegrityCheckAffine16(float *ptr_float_weights, + float *ptr_float_biases, + int16_t *ptr_int_weights, + int32_t *ptr_int_biases, + float weight_scale_factor, + float output_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +bool IntegrityCheckAffineWeights16(float *ptr_float_weights, + int16_t *ptr_int_weights, + float weight_scale_factor, + uint32_t num_rows, + uint32_t num_columns, + uint32_t num_rows_padded, + uint32_t num_columns_padded); +void QuantizeBias16(float *ptr_float_biases, + int32_t *ptr_int_biases, + float input_scale_factor, + float weight_scale_factor, + float *ptr_output_scale_factor, + uint32_t num_rows); +void DeQuantizeVector16(int16_t *ptr_int_memory, std::vector &float_vector, float scale_factor); +void DeQuantizeVector32(int32_t *ptr_int_memory, std::vector &float_vector, float scale_factor); +void DeQuantizeVector32(int32_t *ptr_int_memory, + std::vector &float_vector, + uint32_t index, + uint32_t num_group_size, + float scale_factor); + +#include "gna-api.h" + +void QuantizeAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float input_scale_factor, float *ptr_weight_scale_factor, float *ptr_output_scale_factor, + uint32_t num_rows, uint32_t num_columns, uint32_t num_rows_padded, uint32_t num_columns_padded); +void QuantizeBias8(float *ptr_float_biases, intel_compound_bias_t *ptr_int_biases, float input_scale_factor, + float weight_scale_factor, float *ptr_output_scale_factor, uint32_t num_rows); +bool IntegrityCheckAffine8(float *ptr_float_weights, float *ptr_float_biases, int8_t *ptr_int_weights, intel_compound_bias_t *ptr_int_biases, + float weight_scale_factor, float output_scale_factor, uint32_t num_rows, uint32_t num_columns, + uint32_t num_rows_padded, uint32_t num_columns_padded); + + diff --git a/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp new file mode 100644 index 00000000000000..347102bbb3ac39 --- /dev/null +++ 
b/inference-engine/src/gna_plugin/quantization/quantized_layer_params.hpp @@ -0,0 +1,24 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +namespace GNAPluginNS { + +struct Quantization { + float scale = 1.0f; + float offset = 0.0f; + int shift = 0.0f; +}; + +struct QuantizedLayerParams { + Quantization _src_quant; + Quantization _dst_quant; + Quantization _weights_quant; + Quantization _bias_quant; + float _o_shift = 0.0f; + float _b_shift = 0.0f; +}; + +} // namespace GNAPluginNS \ No newline at end of file diff --git a/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp new file mode 100644 index 00000000000000..a3ba22c1b00713 --- /dev/null +++ b/inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp @@ -0,0 +1,339 @@ +// Copyright (C) 2018 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once +#include +#include +#include +#include +#include +#include "gna_layer_info.hpp" +#include "ie_layers.h" +#include "gna_plugin_log.hpp" + +namespace GNAPluginNS { +namespace details { +using namespace InferenceEngine; +struct ScaleFactorUpdateResult { + CNNLayer *restartLayer = nullptr; + ScaleFactorUpdateResult() = default; + explicit ScaleFactorUpdateResult(CNNLayer * restartlayer) : restartLayer(restartlayer) { + } + operator bool() { + return restartLayer == nullptr; + } +}; + +/** + * @brief calculates output scale factor per layer + * @tparam T + */ +template +class ScaleFactorPerLayer { + public: + /** + * @brief calculates weights scale factor for fit dynamic range into target bitsize, + * also calculates output scale factor for the given layer + * @param cnnLayer + * @param weightsSize + * @param inputScaleFactor + * @param result + * @return + */ + bool operator()(T cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + return false; + } +}; + +template<> +class ScaleFactorPerLayer { + private : + const float activation_scale_factor = 2048.f; + const float identity_scale_factor = 2049.0f; + const float k = 5; + const float k_identity = 6; + public : + bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + if ( !cnnLayer ) { + THROW_IE_EXCEPTION << "Incorrect Convolutional Layer pointer \n"; + } + LayerInfo layerInfo(*cnnLayer); + // TODO: current approach set input scale factor for true input layer(s) equals to provided factor, + auto quant = getInjectedData(*cnnLayer); + if (InferenceEngine::details::CaselessEq()(cnnLayer->type, "Memory")) { + // for memory output layer need to verify it's input scale factor + if (CNNNetHasPrevLayer(cnnLayer)) { + auto prevLayer = CNNNetPrevLayer(cnnLayer); + auto inputQuant = getInjectedData(prevLayer); + if (inputQuant->_dst_quant.scale != activation_scale_factor) { + gnawarn() << "[WARNING] quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") " + << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesnt match : " + << activation_scale_factor << std::endl; + inputQuant->_dst_quant.scale = activation_scale_factor; + // restarting from that activation; + result = ScaleFactorUpdateResult(prevLayer.get()); + return true; + } + } + quant->_src_quant.scale = quant->_dst_quant.scale = activation_scale_factor; + return true; + } + + if (!CNNNetHasPrevLayer(cnnLayer)) { + quant->_dst_quant.scale = inputScaleFactor; + return 
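// --- Illustrative sketch (editorial, not part of the patch) ---------------------
// Condensed form of the activation handling in ScaleFactorPerLayer<CNNLayer*>
// above: activations get a fixed output scale (2048, or 2049 for identity), and
// ReLU-family activations halve it when the product with the input scale would
// exceed the int32 range. activation_out_scale is an invented helper name.
#include <cstdint>
#include <limits>

inline float activation_out_scale(bool is_identity, bool is_relu, float in_scale) {
    float out = is_identity ? 2049.f : 2048.f;
    if (is_relu &&
        static_cast<double>(out) * in_scale > std::numeric_limits<int32_t>::max() - 1.0)
        out *= 0.5f;
    return out;
}
// --------------------------------------------------------------------------------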
ScaleFactorUpdateResult(); + } + + // by default layer is pass thru its scale factor + auto inputQuant = getInjectedData(CNNNetPrevLayer(cnnLayer)); + quant->_dst_quant.scale = inputQuant->_dst_quant.scale; + quant->_src_quant.scale = inputQuant->_dst_quant.scale; + + if (layerInfo.isActivation()) { + // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights + // set the initial value + quant->_dst_quant.scale = layerInfo.isIdentity() ? identity_scale_factor:activation_scale_factor; + // if activation is one from relu family, we need to apply heuruistic to avoid activation output overflow + if (layerInfo.isRelu() && + static_cast(quant->_dst_quant.scale * quant->_src_quant.scale) + > std::numeric_limits::max()-1) { + quant->_dst_quant.scale = (quant->_dst_quant.scale * 0.5); + } + } + return true; + } +}; + +template<> +class ScaleFactorPerLayer { + public: + bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) { + if ( !eltwiseLayer ) { + THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n"; + } + auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0); + auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1); + + auto quantParams0 = InferenceEngine::getInjectedData(in0); + auto quantParams1 = InferenceEngine::getInjectedData(in1); + auto quantData = InferenceEngine::getInjectedData(*eltwiseLayer); + + switch (eltwiseLayer->_operation) { + case InferenceEngine::EltwiseLayer::Prod: { + quantData->_weights_quant.scale = quantParams1->_dst_quant.scale; + quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale; + break; + } + case InferenceEngine::EltwiseLayer::Sum: { + // detect which input will be used as biases + if (LayerInfo(in0).has32BOutput()) { + std::swap(in0, in1); + std::swap(quantParams0, quantParams1); + } + + // this path might result in significant data loss + quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale; + quantData->_dst_quant.scale = quantParams1->_dst_quant.scale; + + // eltwise will always work in int16 + auto maxValue = std::numeric_limits::max() - 1; + if (quantData->_weights_quant.scale > maxValue + 1) { + // rescaling it's activation input + // iterating thru previous layers of eltwise + for (uint8_t i = 0; i < 2; ++i) { + InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i); + // trick to get opposite index (for 0 -> 1 for 1 -> 0) by inversing i. + auto quantParams = + InferenceEngine::getInjectedData(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i)); + + for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) { + auto info = LayerInfo(in); + // we skipping only split layers so far, also need to work on memory layers + // this case for input from port 0 + if (info.isSplit() || info.isSlice()) { + continue; + } else if (info.has16BOutput() && info.isActivation()) { + auto newOutputScale = quantParams->_dst_quant.scale / maxValue; + if (newOutputScale > std::numeric_limits::max() / 2) { + break; + } + auto quantDataForActivation = InferenceEngine::getInjectedData(*in); + gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name + << ". 
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
+ public:
+    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !eltwiseLayer ) {
+            THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
+        }
+        auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
+        auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);
+
+        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
+        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
+        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);
+
+        switch (eltwiseLayer->_operation) {
+            case InferenceEngine::EltwiseLayer::Prod: {
+                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
+                quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
+                break;
+            }
+            case InferenceEngine::EltwiseLayer::Sum: {
+                // detect which input will be used as biases
+                if (LayerInfo(in0).has32BOutput()) {
+                    std::swap(in0, in1);
+                    std::swap(quantParams0, quantParams1);
+                }
+
+                // this path might result in significant data loss
+                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
+                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
+
+                // eltwise will always work in int16
+                auto maxValue = std::numeric_limits<int16_t>::max() - 1;
+                if (quantData->_weights_quant.scale > maxValue + 1) {
+                    // rescaling its activation input
+                    // iterating through the previous layers of the eltwise
+                    for (uint8_t i = 0; i < 2; ++i) {
+                        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
+                        // trick to get the opposite index (0 -> 1, 1 -> 0) by inverting i
+                        auto quantParams =
+                                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));
+
+                        for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
+                            auto info = LayerInfo(in);
+                            // only split layers are skipped so far; memory layers still need to be handled
+                            // this case covers input from port 0
+                            if (info.isSplit() || info.isSlice()) {
+                                continue;
+                            } else if (info.has16BOutput() && info.isActivation()) {
+                                auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
+                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
+                                    break;
+                                }
+                                auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+                                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
+                                          << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
+                                          << ", was " << quantDataForActivation->_dst_quant.scale << "\n" << std::flush;
+                                quantDataForActivation->_dst_quant.scale = newOutputScale;
+                                result = ScaleFactorUpdateResult(in.get());
+                                return true;
+                            } else if (info.has16BOutput()) {
+                                break;
+                            }
+
+                            // if we are here, it means we are on port 1
+                            if (info.isFullyConnected() || info.isConvolutional()) {
+                                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
+                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
+                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
+                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
+                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
+                                result = ScaleFactorUpdateResult(in.get());
+                                return true;
+                            }
+                        }
+                    }
+                    // we were unable to rescale the input - results might be bad
+                    gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
+                }
+                break;
+            }
+            default: THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantisation: " << eltwiseLayer->_operation;
+        }
+        return true;
+    }
+};
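The Sum branch aligns the two inputs by folding the ratio of their output scale factors into the eltwise "weights". A quick numeric illustration (values assumed, not taken from the patch):

    // Input 0 arrives with scale 1024, input 1 (the bias-like, 32-bit input) with scale 2048.
    static void EltwiseSumScalesExample() {
        float dst0 = 1024.f, dst1 = 2048.f;
        float weights_scale = dst1 / dst0;   // 2.0: input 0 is effectively multiplied by 2 before the add
        float sum_scale = dst1;              // the sum is produced at input 1's scale
        (void)weights_scale; (void)sum_scale;
    }
    // Only when weights_scale would exceed roughly the int16 maximum does the rescaling
    // walk above go back and lower an upstream activation's output scale instead.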
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::WeightableLayer *> {
+ private:
+    float const _scale_reduction_50 = 0.50;
+    float const _scale_reduction_45 = 0.45;
+    float const _scale_reduction_40 = 0.40;
+    float const _scale_reduction_35 = 0.35;
+
+    uint16_t const _scale_change_req_threshold = 30;
+    uint16_t const _scale_change_threshold_100 = 100;
+    uint16_t const _scale_change_threshold_150 = 150;
+    uint16_t const _scale_change_threshold_200 = 200;
+
+ public:
+    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        if ( !wl ) {
+            THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
+        } else if (!wl->_weights) {
+            THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
+        }
+
+        auto prevLayer = CNNNetPrevLayer(wl);
+        auto quantDataForInputLayer =
+                InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
+
+        auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
+        // TODO: pass 8 bits somehow
+        if (quant->_weights_quant.scale == 1.0f) {
+            size_t scaleRange = 0;
+            if (weightsSize == 2) {
+                scaleRange = MAX_VAL_2B_WEIGHT;
+            } else if (weightsSize == 1) {
+                scaleRange = MAX_VAL_1B_WEIGHT;
+            } else {
+                THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
+            }
+            quant->_weights_quant.scale =
+                    ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
+
+            // TODO: find out why ???
+            if (weightsSize == 1) {
+                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
+            }
+        }
+
+        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;
+
+        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;
+
+        if (weightsSize == 1 &&
+            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
+                                    static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
+            gnawarn() << "Output scale for " << wl->name
+                      << " is too large and is being reduced, otherwise saturation is likely \n";
+            // reduce the weights scale according to an experimental heuristic
+            if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_100) {
+                quant->_weights_quant.scale *= _scale_reduction_50;
+                tmp_dst_quant_scale *= _scale_reduction_50;
+            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_150) {
+                quant->_weights_quant.scale *= _scale_reduction_45;
+                tmp_dst_quant_scale *= _scale_reduction_45;
+            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_200) {
+                quant->_weights_quant.scale *= _scale_reduction_40;
+                tmp_dst_quant_scale *= _scale_reduction_40;
+            } else {
+                quant->_weights_quant.scale *= _scale_reduction_35;
+                tmp_dst_quant_scale *= _scale_reduction_35;
+            }
+        }
+
+        quant->_dst_quant.scale = tmp_dst_quant_scale;
+
+        return true;
+    }
+};
+
+template<>
+class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
+ public:
+    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
+        return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, inputScaleFactor, result);
+    }
+};
+
+/**
+ * GNA convolutions cannot be quantized in int8; remove when the library starts supporting that
+ */
+template<>
+class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
+};
+
+
+}  // namespace details
+
+/**
+ * @brief the scale factor calculator computes only the output scale factors for a layer;
+ * if scale factor propagation is not possible, it indicates a restart condition
+ */
+class ScaleFactorCalculator {
+    using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
+    Cnt net;
+    mutable Cnt::const_iterator idx;
+    float inputScaleFactor;
+    mutable bool needRestart = false;
+    int weightsBytesSize;
+
+ public:
+    ScaleFactorCalculator(Cnt &net, int weightsBytesSize, float inputScaleFactor)
+            : net(net), inputScaleFactor(inputScaleFactor), weightsBytesSize(weightsBytesSize) {
+        idx = std::begin(this->net);
+    }
+    bool needToRestart() const {
+        return needRestart;
+    }
+    bool allLayersProcessed() const {
+        return idx == std::end(net);
+    }
+    std::vector<InferenceEngine::CNNLayerPtr> getStartLayers() const {
+        return std::vector<InferenceEngine::CNNLayerPtr>(idx, std::end(net));
+    }
+    template<class T>
+    bool operator()(T ptr) const {
+        needRestart = false;
+        details::ScaleFactorUpdateResult result;
+        if (!details::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputScaleFactor, result)) {
+            return false;
+        }
+        if (result) {
+            idx++;
+            return true;
+        }
+
+        idx = std::find_if(net.begin(), net.end(), [&](InferenceEngine::CNNLayerPtr cnnLayer) {
+            if (!result) {
+                return result.restartLayer == cnnLayer.get();
+            }
+            return ptr == cnnLayer.get();
+        });
+        idx++;
+        needRestart = true;
+        return true;
+    }
+};
+
+}  // namespace GNAPluginNS
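ScaleFactorCalculator only advances an iterator and reports restarts; it does not loop by itself. A minimal sketch of how a caller might drive it (assumed usage, simplified to the generic CNNLayer* path; the actual driver in this patch is the model quantizer, which dispatches on the concrete layer type):

    #include <vector>
    #include "scale_factor_calc.hpp"

    static void CalcScaleFactors(std::vector<InferenceEngine::CNNLayerPtr> &sortedNet,
                                 int weightsBytes, float inputScale) {
        GNAPluginNS::ScaleFactorCalculator calc(sortedNet, weightsBytes, inputScale);
        while (!calc.allLayersProcessed()) {
            for (auto &&layer : calc.getStartLayers()) {
                calc(layer.get());           // computes the layer's output scale, may request a restart
                if (calc.needToRestart()) {
                    break;                   // idx was repositioned just after the layer whose scale was rewritten
                }
            }
        }
    }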
diff --git a/inference-engine/src/gna_plugin/util.cpp b/inference-engine/src/gna_plugin/util.cpp
new file mode 100644
index 00000000000000..c10e3175f47456
--- /dev/null
+++ b/inference-engine/src/gna_plugin/util.cpp
@@ -0,0 +1,46 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <cstring>
+#ifndef _WIN32
+#include <mm_malloc.h>
+#endif
+#include <cstdint>
+#include <malloc.h>
+#include "util.h"
+#include "gna_plugin_log.hpp"
+
+void *AllocateMemory(uint32_t num_memory_bytes, const char *ptr_name) {
+    void *ptr_memory = _mm_malloc(num_memory_bytes, 64);
+    if (ptr_memory == NULL) {
+        THROW_GNA_EXCEPTION << "Memory allocation failed for " << ptr_name;
+    }
+    memset(ptr_memory, 0, num_memory_bytes);
+
+    return (ptr_memory);
+}
+
+void FreeMemory(void *ptr_memory) {
+    if (ptr_memory != NULL) {
+        _mm_free(ptr_memory);
+    }
+    ptr_memory = NULL;
+}
+
+int32_t MemoryOffset(void *ptr_target, void *ptr_base) {
+    uint64_t target = (uint64_t) ptr_target;
+    uint64_t base = (uint64_t) ptr_base;
+    if (target == 0) {  // handle NULL pointers separately
+        return (-1);
+    } else if (target < base) {
+        THROW_GNA_EXCEPTION << "Error: target address value " << target << " is less than base address " << base << " in MemoryOffset()";
+    } else {
+        uint64_t diff = target - base;
+        if (diff > 0x7fffffff) {
+            THROW_GNA_EXCEPTION << "Error: target address value " << target << " too far from base address " << base << " in MemoryOffset()!";
+        }
+        return ((int32_t) diff);
+    }
+}
+
diff --git a/inference-engine/src/gna_plugin/util.h b/inference-engine/src/gna_plugin/util.h
new file mode 100644
index 00000000000000..0838bd2a690e0f
--- /dev/null
+++ b/inference-engine/src/gna_plugin/util.h
@@ -0,0 +1,9 @@
+// Copyright (C) 2018 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+void *AllocateMemory(uint32_t num_memory_bytes, const char *ptr_name);
+void FreeMemory(void *ptr_memory);
+int32_t MemoryOffset(void *ptr_target, void *ptr_base);
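A short usage sketch for the three helpers (illustrative; the region size and offset are made up): AllocateMemory returns a zeroed, 64-byte-aligned region, and MemoryOffset reports how far a sub-buffer sits from the region base, e.g. when filling in GNA descriptors.

    #include <cstdint>
    #include "util.h"

    static void UtilExample() {
        // 64-byte aligned, zero-initialized region
        void *base = AllocateMemory(1024, "example_region");
        // a sub-buffer placed 256 bytes into the region
        void *inputs = static_cast<uint8_t *>(base) + 256;
        int32_t off = MemoryOffset(inputs, base);   // returns 256
        (void)off;
        FreeMemory(base);
    }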