diff --git a/README.md b/README.md
index dfaf3e1eebcfae..869616f3ac8fe9 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # [OpenVINO™ Toolkit](https://01.org/openvinotoolkit) - Deep Learning Deployment Toolkit repository
-[![Stable release](https://img.shields.io/badge/version-2020.2-green.svg)](https://github.com/openvinotoolkit/openvino/releases/tag/2020.2)
+[![Stable release](https://img.shields.io/badge/version-2020.3-green.svg)](https://github.com/openvinotoolkit/openvino/releases/tag/2020.3.0)
 [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE)
 
 This toolkit allows developers to deploy pre-trained deep learning models 
diff --git a/build-instruction.md b/build-instruction.md
index 3d5cfe136f2f21..12103ce9875004 100644
--- a/build-instruction.md
+++ b/build-instruction.md
@@ -28,7 +28,6 @@
 - [Add Inference Engine to Your Project](#add-inference-engine-to-your-project)
 - [(Optional) Additional Installation Steps for the Intel® Movidius™ Neural Compute Stick and Neural Compute Stick 2](#optional-additional-installation-steps-for-the-intel-movidius-neural-compute-stick-and-neural-compute-stick-2)
   - [For Linux, Raspbian Stretch* OS](#for-linux-raspbian-stretch-os)
-  - [For Windows](#for-windows-1)
 - [Next Steps](#next-steps)
 - [Additional Resources](#additional-resources)
 
@@ -60,12 +59,12 @@ The software was validated on:
 - [CMake]\* 3.11 or higher
 - GCC\* 4.8 or higher to build the Inference Engine
 - Python 2.7 or higher for Inference Engine Python API wrapper
-- (Optional) [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.41.14441].
+- (Optional) [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 20.13.16352].
 
 ### Build Steps
 1. Clone submodules:
     ```sh
-    cd dldt
+    cd openvino
     git submodule update --init --recursive
     ```
 2. Install build dependencies using the `install_dependencies.sh` script in the
@@ -78,7 +77,7 @@ The software was validated on:
    ```
 3. By default, the build enables the Inference Engine GPU plugin to infer models
    on your Intel® Processor Graphics. This requires you to
-   [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.41.14441]
+   [Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 20.13.16352]
    before running the build. If you don't want to use the GPU plugin, use the
    `-DENABLE_CLDNN=OFF` CMake build option and skip the installation of the
    Intel® Graphics Compute Runtime for OpenCL™ Driver.
@@ -172,10 +171,10 @@ Native compilation of the Inference Engine is the most straightforward solution.
   sudo apt-get install -y git cmake libusb-1.0-0-dev
   ```
 
-2. Go to the cloned `dldt` repository:
+2. Go to the cloned `openvino` repository:
 
   ```bash
-  cd dldt
+  cd openvino
   ```
 
 3. Initialize submodules:
@@ -262,15 +261,15 @@ with the following content:
 5. Run Docker\* container with mounted source code folder from host:
 
   ```bash
-  docker run -it -v /absolute/path/to/dldt:/dldt ie_cross_armhf /bin/bash
+  docker run -it -v /absolute/path/to/openvino:/openvino ie_cross_armhf /bin/bash
   ```
 
 6. While in the container:
 
-    1. Go to the cloned `dldt` repository:
+    1. Go to the cloned `openvino` repository:
 
       ```bash
-      cd dldt
+      cd openvino
       ```
 
     2. Create a build folder:
@@ -291,8 +290,8 @@ with the following content:
       ```
 
 7. Press **Ctrl+D** to exit from Docker. You can find the resulting binaries
-   in the `dldt/bin/armv7l/` directory and the OpenCV*
-   installation in the `dldt/inference-engine/temp`.
+   in the `openvino/bin/armv7l/` directory and the OpenCV\*
+   installation in the `openvino/inference-engine/temp` directory.
 
 >**NOTE**: Native applications that link to cross-compiled Inference Engine
 library require an extra compilation flag `-march=armv7-a`.
@@ -381,8 +380,8 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
 
 6. Before running the samples, add paths to the TBB and OpenCV binaries used for
    the build to the `%PATH%` environment variable. By default, TBB binaries are
-   downloaded by the CMake-based script to the `<dldt_repo>/inference-engine/temp/tbb/bin`
-   folder, OpenCV binaries to the `<dldt_repo>/inference-engine/temp/opencv_4.3.0/opencv/bin`
+   downloaded by the CMake-based script to the `<openvino_repo>/inference-engine/temp/tbb/bin`
+   folder, OpenCV binaries to the `<openvino_repo>/inference-engine/temp/opencv_4.3.0/opencv/bin`
    folder.
 
 ### Additional Build Options
@@ -437,7 +436,7 @@ cmake -G "Visual Studio 15 2017 Win64" -T "Intel C++ Compiler 18.0" ^
 call "C:\Program Files (x86)\IntelSWTools\compilers_and_libraries_2018\windows\bin\ipsxe-comp-vars.bat" intel64 vs2017
 set CXX=icl
 set CC=icl
-:: clean TBBROOT value set by ipsxe-comp-vars.bat, required TBB package will be downloaded by dldt cmake script
+:: clean TBBROOT value set by ipsxe-comp-vars.bat; the required TBB package will be downloaded by the OpenVINO CMake script
 set TBBROOT=
 cmake -G Ninja -Wno-dev -DCMAKE_BUILD_TYPE=Release ..
 cmake --build . --config Release
@@ -461,7 +460,7 @@ The software was validated on:
 
 1. Clone submodules:
     ```sh
-    cd dldt
+    cd openvino
     git submodule update --init --recursive
     ```
 2. Install build dependencies using the `install_dependencies.sh` script in the
@@ -545,7 +544,7 @@ This section describes how to build Inference Engine for Android x86 (64-bit) op
 
 2. Clone submodules
   ```sh
-  cd dldt
+  cd openvino
   git submodule update --init --recursive
   ```
 
@@ -610,7 +609,7 @@ before running the Inference Engine build:
 For CMake projects, set the `InferenceEngine_DIR` environment variable:
 
 ```sh
-export InferenceEngine_DIR=/path/to/dldt/build/
+export InferenceEngine_DIR=/path/to/openvino/build/
 ```
 
 Then you can find Inference Engine by `find_package`:
@@ -660,20 +659,6 @@ sudo ldconfig
 rm 97-myriad-usbboot.rules
 ```
 
-### For Windows
-
-For Intel® Movidius™ Neural Compute Stick and Intel® Neural Compute Stick 2,
-install the Movidius™ VSC driver:
-
-1. Go to the `<DLDT_ROOT_DIR>/inference-engine/thirdparty/movidius/MovidiusDriver`
-   directory, where the `DLDT_ROOT_DIR` is the directory to which the DLDT
-   repository was cloned.
-2. Right click on the `Movidius_VSC_Device.inf` file and choose **Install** from
-   the pop-up menu.
-
-You have installed the driver for your Intel® Movidius™ Neural Compute Stick
-or Intel® Neural Compute Stick 2.
-
 ## Next Steps
 
 Congratulations, you have built the Inference Engine. To get started with the
@@ -706,7 +691,7 @@ This target collects all dependencies, prepares the nGraph package and copies it
 
 [Intel® Distribution of OpenVINO™]:https://software.intel.com/en-us/openvino-toolkit
 [CMake]:https://cmake.org/download/
-[Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 19.41.14441]:https://github.com/intel/compute-runtime/releases/tag/19.41.14441
+[Install Intel® Graphics Compute Runtime for OpenCL™ Driver package 20.13.16352]:https://github.com/intel/compute-runtime/releases/tag/20.13.16352
 [MKL-DNN repository]:https://github.com/intel/mkl-dnn/releases/download/v0.19/mklml_lnx_2019.0.5.20190502.tgz
 [MKL-DNN repository for Windows]:(https://github.com/intel/mkl-dnn/releases/download/v0.19/mklml_win_2019.0.5.20190502.zip)
 [OpenBLAS]:https://sourceforge.net/projects/openblas/files/v0.2.14/OpenBLAS-v0.2.14-Win64-int64.zip/download
diff --git a/get-started-linux.md b/get-started-linux.md
index bf87003b068b48..661fc4ec8ddded 100644
--- a/get-started-linux.md
+++ b/get-started-linux.md
@@ -1,7 +1,7 @@
-# Get Started with OpenVINO™ Deep Learning Deployment Toolkit (DLDT) on Linux*
+# Get Started with OpenVINO™ Toolkit on Linux*
 
 This guide provides you with the information that will help you to start using 
-the DLDT on Linux\*. With this guide, you will learn how to:
+the OpenVINO™ Toolkit on Linux\*. With this guide, you will learn how to:
 
 1. [Configure the Model Optimizer](#configure-the-model-optimizer)
 2. [Prepare a model for sample inference](#prepare-a-model-for-sample-inference)
@@ -10,13 +10,13 @@ the DLDT on Linux\*. With this guide, you will learn how to:
 3. [Run the Image Classification Sample Application with the model](#run-the-image-classification-sample-application)
 
 ## Prerequisites
-1. This guide assumes that you have already cloned the `dldt` repo and 
+1. This guide assumes that you have already cloned the `openvino` repo and 
    successfully built the Inference Engine and Samples using the 
    [build instructions](inference-engine/README.md). 
 2. The original structure of the repository directories remains unchanged.
 
-> **NOTE**: Below, the directory to which the `dldt` repository is cloned is 
-referred to as `<DLDT_DIR>`.  
+> **NOTE**: Below, the directory to which the `openvino` repository is cloned is 
+referred to as `<OPENVINO_DIR>`.  
 
 ## Configure the Model Optimizer
 
@@ -53,7 +53,7 @@ If you see error messages, check for any missing dependencies.
 
 1.  Go to the Model Optimizer prerequisites directory:
 ```sh
-cd <DLDT_DIR>/model_optimizer/install_prerequisites
+cd <OPENVINO_DIR>/model_optimizer/install_prerequisites
 ```
 2.  Run the script to configure the Model Optimizer for Caffe,
     TensorFlow, MXNet, Kaldi\*, and ONNX:
@@ -68,7 +68,7 @@ Configure individual frameworks separately **ONLY** if you did not select
 
 1.  Go to the Model Optimizer prerequisites directory:
 ```sh
-cd <DLDT_DIR>/model_optimizer/install_prerequisites
+cd <OPENVINO_DIR>/model_optimizer/install_prerequisites
 ```
 2.  Run the script for your model framework. You can run more than one script:
 
@@ -162,20 +162,20 @@ as `<models_dir>` below) with the Model Downloader:
 
    **For CPU (FP32):**
    ```sh  
-   python3 <DLDT_DIR>/model_optimizer/mo.py --input_model <models_dir>/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP32 --output_dir <ir_dir>
+   python3 <OPENVINO_DIR>/model_optimizer/mo.py --input_model <models_dir>/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP32 --output_dir <ir_dir>
    ```
 
    **For GPU and MYRIAD (FP16):**
    ```sh  
-   python3 <DLDT_DIR>/model_optimizer/mo.py --input_model <models_dir>/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP16 --output_dir <ir_dir>
+   python3 <OPENVINO_DIR>/model_optimizer/mo.py --input_model <models_dir>/classification/squeezenet/1.1/caffe/squeezenet1.1.caffemodel --data_type FP16 --output_dir <ir_dir>
    ``` 
    After the Model Optimizer script is completed, the produced IR files (`squeezenet1.1.xml`, `squeezenet1.1.bin`) are in the specified `<ir_dir>` directory.
 
-3. Copy the `squeezenet1.1.labels` file from the `<DLDT_DIR>/inference-engine/samples/sample_data/` 
+3. Copy the `squeezenet1.1.labels` file from the `<OPENVINO_DIR>/scripts/demo/` 
    folder to the model IR directory. This file contains the classes that ImageNet 
    uses so that the inference results show text instead of classification numbers:
    ```sh   
-   cp <DLDT_DIR>/inference-engine/samples/sample_data/squeezenet1.1.labels <ir_dir>
+   cp <OPENVINO_DIR>/scripts/demo/squeezenet1.1.labels <ir_dir>
    ```
 
 Now you are ready to run the Image Classification Sample Application.
@@ -184,28 +184,28 @@ Now you are ready to run the Image Classification Sample Application.
 
 The Inference Engine sample applications are automatically compiled when you 
 built the Inference Engine using the [build instructions](inference-engine/README.md). 
-The binary files are located in the `<DLDT_DIR>/inference-engine/bin/intel64/Release` 
+The binary files are located in the `<OPENVINO_DIR>/inference-engine/bin/intel64/Release` 
 directory.
 
 To run the Image Classification sample application with an input image on the prepared IR: 
 
 1. Go to the samples build directory:
    ```sh
-   cd <DLDT_DIR>/inference-engine/bin/intel64/Release
+   cd <OPENVINO_DIR>/inference-engine/bin/intel64/Release
    
 2. Run the sample executable with specifying the `car.png` file from the 
-   `<DLDT_DIR>/inference-engine/samples/sample_data/` directory as an input 
+   `<OPENVINO_DIR>/scripts/demo/` directory as an input 
    image, the IR of your model and a plugin for a hardware device to perform 
    inference on:
 
    **For CPU:**
    ```sh
-   ./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d CPU
+   ./classification_sample -i <OPENVINO_DIR>/scripts/demo/car.png -m <ir_dir>/squeezenet1.1.xml -d CPU
    ```
 
    **For GPU:**
    ```sh
-   ./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d GPU
+   ./classification_sample -i <OPENVINO_DIR>/scripts/demo/car.png -m <ir_dir>/squeezenet1.1.xml -d GPU
    ```
    
    **For MYRIAD:** 
@@ -214,14 +214,14 @@ To run the Image Classification sample application with an input image on the pr
    Stick or Intel® Neural Compute Stick 2) with the MYRIAD plugin requires 
    performing [additional hardware configuration steps](inference-engine/README.md#optional-additional-installation-steps-for-the-intel-movidius-neural-compute-stick-and-neural-compute-stick-2).
    ```sh   
-   ./classification_sample -i <DLDT_DIR>/inference-engine/samples/sample_data/car.png -m <ir_dir>/squeezenet1.1.xml -d MYRIAD
+   ./classification_sample -i <OPENVINO_DIR>/scripts/demo/car.png -m <ir_dir>/squeezenet1.1.xml -d MYRIAD
    ```
 
 When the Sample Application completes, you will have the label and confidence for the top-10 categories printed on the screen. Below is a sample output with inference results on CPU:    
 ```sh
 Top 10 results:
 
-Image /home/user/dldt/inference-engine/samples/sample_data/car.png
+Image /home/user/openvino/scripts/demo/car.png
 
 classid probability label
 ------- ----------- -----
diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp
index 3090a371122604..86cdddb81ef433 100644
--- a/inference-engine/src/cldnn_engine/cldnn_program.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -2735,6 +2735,8 @@ void Program::CreatePoolingPrimitive(cldnn::topology& topology, InferenceEngine:
             input_offset,
             CldnnTensorFromIEDims(poolLayer->outData[0]->getTensorDesc().getDims()),
             dt);
+        cldnn::tensor pad_end = { 0, 0, -TensorValue(poolLayer->_pads_end[X_AXIS]), -TensorValue(poolLayer->_pads_end[Y_AXIS]), 0 };
+        poolPrim.pad_end = pad_end;
         topology.add(poolPrim);
         primitiveIDs[poolLayerName] = poolLayerName;
     }
diff --git a/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp b/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp
index 2c236a3072767f..aa52d29b310dd4 100644
--- a/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp
+++ b/inference-engine/src/mkldnn_plugin/mkldnn_memory_solver.hpp
@@ -10,6 +10,8 @@
 
 #include "ie_api.h"
 
+#include <stdint.h>
+
 #include <vector>
 #include <map>
 
diff --git a/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp b/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp
index 55ffb832f676b1..18d0fd73c30940 100644
--- a/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp
+++ b/inference-engine/src/mkldnn_plugin/nodes/roifeatureextractor_onnx.cpp
@@ -140,6 +140,7 @@ void ROIAlignForward_cpu_kernel(
     const int pooled_width,
     const int sampling_ratio,
     const T* bottom_rois,
+    const bool aligned,
     T* top_data) {
   int roi_cols = 4;
 
@@ -156,11 +157,12 @@ void ROIAlignForward_cpu_kernel(
       offset_bottom_rois++;
     }
 
+    T offset = aligned ? (T)0.5 : (T)0.0;
     // Do not using rounding; this implementation detail is critical
-    T roi_start_w = offset_bottom_rois[0] * spatial_scale;
-    T roi_start_h = offset_bottom_rois[1] * spatial_scale;
-    T roi_end_w = offset_bottom_rois[2] * spatial_scale;
-    T roi_end_h = offset_bottom_rois[3] * spatial_scale;
+    T roi_start_w = offset_bottom_rois[0] * spatial_scale - offset;
+    T roi_start_h = offset_bottom_rois[1] * spatial_scale - offset;
+    T roi_end_w = offset_bottom_rois[2] * spatial_scale - offset;
+    T roi_end_h = offset_bottom_rois[3] * spatial_scale - offset;
 
     // Force malformed ROIs to be 1x1
     T roi_width = (std::max)(roi_end_w - roi_start_w, (T)1.);
@@ -321,6 +323,7 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase {
             output_dim_ = layer->GetParamAsInt("output_size");
             pyramid_scales_ = layer->GetParamAsInts("pyramid_scales");
             sampling_ratio_ = layer->GetParamAsInt("sampling_ratio");
+            aligned_ = layer->GetParamAsBool("aligned");
             pooled_height_ = output_dim_;
             pooled_width_ = output_dim_;
 
@@ -374,6 +377,7 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase {
                     pooled_width_,
                     sampling_ratio_,
                     &reordered_rois[4 * level_rois_offset],
+                    aligned_,
                     &output_rois_features_temp[feaxels_per_roi * level_rois_offset]);
             }
         }
@@ -394,6 +398,7 @@ class ExperimentalDetectronROIFeatureExtractorImpl: public ExtLayerBase {
     int pooled_width_ = 0;
     std::vector<int> pyramid_scales_;
     int sampling_ratio_ = 0;
+    bool aligned_ = false;
 };
 
 REG_FACTORY_FOR(ExperimentalDetectronROIFeatureExtractorImpl, ExperimentalDetectronROIFeatureExtractor);
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp
index bd38670c9dd221..69748a01a524ce 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/behavior/core_threading_tests.cpp
@@ -31,11 +31,12 @@ TEST_P(CoreThreadingTestsWithIterations, smoke_LoadNetwork_RemoteContext) {
         networks.emplace_back(ie.ReadNetwork(model.model_xml_str, model.weights_blob));
     }
 
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat()));
+    // TODO: uncomment after fixing *-31414
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat()));
 
     auto ocl_instance = std::make_shared<OpenCL>();
     ie.SetConfig(config, deviceName);
diff --git a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp
index 8ddd708318001b..5cab9e39aadf7f 100644
--- a/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp
+++ b/inference-engine/tests/functional/plugin/gpu/shared_tests_instances/single_layer_tests/pooling.cpp
@@ -12,7 +12,6 @@ using namespace ngraph::helpers;
 using namespace LayerTestsDefinitions;
 
 namespace {
-
 const std::vector<InferenceEngine::Precision> netPrecisions = {
         InferenceEngine::Precision::FP32,
         InferenceEngine::Precision::FP16
@@ -28,6 +27,7 @@ const std::vector<std::vector<size_t >> padEnds = {{0, 0},
                                                           {0, 2}};
 const std::vector<ngraph::op::RoundingType> roundingTypes = {ngraph::op::RoundingType::CEIL,
                                                              ngraph::op::RoundingType::FLOOR};
+
 ////* ========== Max Polling ========== */
 /* +========== Explicit Pad Floor Rounding ========== */
 const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine(
@@ -35,8 +35,7 @@ const auto maxPool_ExplicitPad_FloorRounding_Params = ::testing::Combine(
         ::testing::ValuesIn(kernels),
         ::testing::ValuesIn(strides),
         ::testing::ValuesIn(padBegins),
-        // TODO: Accuracy mismatch with non zero Pad Ends (tested with {0.2})
-        ::testing::Values(std::vector<size_t>({0, 0})),
+        ::testing::ValuesIn(padEnds),
         ::testing::Values(ngraph::op::RoundingType::FLOOR),
         ::testing::Values(ngraph::op::PadType::EXPLICIT),
         ::testing::Values(false)  // placeholder value - exclude pad not applicable for max pooling
@@ -57,8 +56,7 @@ const auto maxPool_ExplicitPad_CeilRounding_Params = ::testing::Combine(
         // TODO: Non 1 strides fails in ngraph reference implementation with error "The end corner is out of bounds at axis 3" thrown in the test body.
         ::testing::Values(std::vector<size_t>({1, 1})),
         ::testing::ValuesIn(padBegins),
-        // TODO: Accuracy mismatch with non zero Pad Ends (tested with {0.2})
-        ::testing::Values(std::vector<size_t>({0, 0})),
+        ::testing::ValuesIn(padEnds),
         ::testing::Values(ngraph::op::RoundingType::CEIL),
         ::testing::Values(ngraph::op::PadType::EXPLICIT),
         ::testing::Values(false)  // placeholder value - exclude pad not applicable for max pooling
@@ -80,9 +78,8 @@ const auto avgPoolExplicitPadCeilRoundingParams = ::testing::Combine(
         ::testing::ValuesIn(kernels),
         // TODO: Non 1 strides fails in ngraph reference implementation with error "The end corner is out of bounds at axis 3" thrown in the test body.
         ::testing::Values(std::vector<size_t>({1, 1})),
-        // TODO: Non zero pads excluded because of accuracy mismatch
-        ::testing::Values(std::vector<size_t>({0, 0})),
-        ::testing::Values(std::vector<size_t>({0, 0})),
+        ::testing::ValuesIn(padBegins),
+        ::testing::ValuesIn(padEnds),
         ::testing::Values(ngraph::op::RoundingType::CEIL),
         ::testing::Values(ngraph::op::PadType::EXPLICIT),
         ::testing::Values(true, false)
@@ -101,9 +98,8 @@ const auto avgPoolExplicitPadFloorRoundingParams = ::testing::Combine(
         ::testing::Values(PoolingTypes::AVG),
         ::testing::ValuesIn(kernels),
         ::testing::ValuesIn(strides),
-        // TODO: Non zero pads excluded because of accuracy mismatch
-        ::testing::Values(std::vector<size_t>({0, 0})),
-        ::testing::Values(std::vector<size_t>({0, 0})),
+        ::testing::ValuesIn(padBegins),
+        ::testing::ValuesIn(padEnds),
         ::testing::Values(ngraph::op::RoundingType::FLOOR),
         ::testing::Values(ngraph::op::PadType::EXPLICIT),
         ::testing::Values(true, false)
@@ -125,9 +121,9 @@ const auto allPools_ValidPad_Params = ::testing::Combine(
         ::testing::ValuesIn(kernels),
         ::testing::ValuesIn(strides),
         ::testing::Values(std::vector<size_t>({0, 0})),
-        ::testing::Values(std::vector<size_t>({0, 0})),
-        ::testing::Values(
-                ngraph::op::RoundingType::FLOOR),  // placeholder value - Rounding Type not applicable for Valid pad type
+        ::testing::ValuesIn(padEnds),
+        ::testing::Values(ngraph::op::RoundingType::FLOOR),  // placeholder value - Rounding Type not applicable for Valid pad type
+        // TODO: PadType::VALID seems not to ignore padBegins
         ::testing::Values(ngraph::op::PadType::VALID),
         ::testing::Values(false)  // placeholder value - exclude pad not applicable for max pooling
 );
@@ -139,6 +135,4 @@ INSTANTIATE_TEST_CASE_P(MAX_and_AVGPool_ValidPad, PoolingLayerTest,
                                 ::testing::Values(std::vector<size_t >({1, 3, 50, 50})),
                                 ::testing::Values(CommonTestUtils::DEVICE_GPU)),
                         PoolingLayerTest::getTestCaseName);
-
-
-}  // namespace
\ No newline at end of file
+}  // namespace
diff --git a/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp b/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp
index c53f9fc0939c63..0379767355b54c 100644
--- a/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp
+++ b/inference-engine/tests/functional/plugin/shared/include/behavior/core_threading_tests.hpp
@@ -183,11 +183,12 @@ TEST_P(CoreThreadingTestsWithIterations, smoke_LoadNetwork) {
         networks.emplace_back(ie.ReadNetwork(model.model_xml_str, model.weights_blob));
     }
 
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat()));
-    networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat()));
+    // TODO: uncomment after fixing *-31414
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::make2InputSubtract()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeMultiSingleConv()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSingleConv()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitConvConcat()));
+    // networks.emplace_back(InferenceEngine::CNNNetwork(ngraph::builder::subgraph::makeSplitMultiConvConcat()));
 
     ie.SetConfig(config, deviceName);
     runParallel([&] () {
diff --git a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp
index 88c6ce69e42ae9..fddf40add41267 100644
--- a/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp
+++ b/inference-engine/tests/functional/plugin/shared/src/low_precision_transformations/output_layers_handling_in_transformations.cpp
@@ -75,6 +75,8 @@ InferenceEngine::Blob::Ptr OutputLayersHandlingInTransformations::GenerateInput(
 */
 
 void OutputLayersHandlingInTransformations::SetUp() {
+    threshold = 0.05;
+
     InferenceEngine::SizeVector inputShape1;
     InferenceEngine::Precision netPrecision;
     InferenceEngine::details::LayerTransformation::Params params;
diff --git a/inference-engine/thirdparty/clDNN/api/pooling.hpp b/inference-engine/thirdparty/clDNN/api/pooling.hpp
index 4a92e601a1f41a..4dcccfbaafe349 100644
--- a/inference-engine/thirdparty/clDNN/api/pooling.hpp
+++ b/inference-engine/thirdparty/clDNN/api/pooling.hpp
@@ -188,6 +188,8 @@ struct pooling : public primitive_base<pooling> {
     bool with_output_size;
     /// @brief User-defined output data size of the primitive (w/o padding).
     tensor output_size;
+    /// @brief Defines a shift relative to the end of the padding shape.
+    tensor pad_end;
 
 protected:
     std::vector<std::reference_wrapper<const primitive_id>> get_dependencies() const override {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
index b0284dbaf56d55..bcb6a1d45b5137 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/convolution/convolution_kernel_b_fs_yx_fsv16_1x1.cpp
@@ -78,6 +78,7 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
     auto autoTune = GetAutoTuneOptions(params, autoTuneIndex);
     kd.cldnnStyle.blockWidth = autoTune.blockWidth;
 
+    const auto& input = params.inputs[0];
     const auto& out = params.output;
     auto x = out.X().v;
     auto y = out.Y().v;
@@ -92,11 +93,16 @@ ConvolutionKernelBase::DispatchData ConvolutionKernel_b_fs_yx_fsv16_1x1::SetDefa
     kd.lws1 = sub_group_size;
     kd.lws2 = 1;
 
+    auto bBlockSizeX = x % autoTune.blockWidth == 0;
+    auto bBlockSizeXY = out.X().pad.Total() + out.Y().pad.Total() == 0;
+    auto bInputPad = input.X().pad.Total() + input.Y().pad.Total() != 0;
+    
     if (b == 1) {
-        if (x <= 8)
+        if ((bBlockSizeX || bBlockSizeXY) && !bInputPad) {
             kd.efficiency = FORCE_PRIORITY_1;
-        else
-            kd.efficiency = FORCE_PRIORITY_2;
+        } else {
+            kd.efficiency = FORCE_PRIORITY_3;
+        }
     } else {
         kd.efficiency = FORCE_PRIORITY_7;
     }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp
index d7e8081ff062ba..f9486d77a2da78 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_base.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016-2019 Intel Corporation
+﻿// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -33,24 +33,27 @@ bool PoolingKernelBase::Validate(const Params& p, const optional_params& o) cons
 }
 
 Datatype PoolingKernelBase::GetAccumulatorType(const pooling_params& params) const {
-    if (params.quantization != QuantizationType::NONE)
-        return Datatype::INT32;
+    const auto& input_dt = params.inputs[0].GetDType();
+    const auto& pool_type = params.poolType;
 
-    Datatype types[] = { Datatype::F32, Datatype::F16, Datatype::INT64, Datatype::INT32, Datatype::UINT32};
-
-    for (Datatype type : types)
-        for (auto& in : params.inputs)
-            if (in.GetDType() == type)
-                return type;
-
-    return Datatype::F32;
+    if (pool_type == PoolType::MAX) {
+        return input_dt;
+    } else {
+        switch (input_dt) {
+            case Datatype::F32: return Datatype::F32;
+            case Datatype::F16: return Datatype::F32;
+            case Datatype::INT8: return Datatype::INT32;
+            case Datatype::UINT8: return Datatype::INT32;
+            default: return Datatype::F32;
+        }
+    }
 }
 
 Datatype PoolingKernelBase::GetActivationType(const pooling_params& params) const {
-    if (params.quantization != QuantizationType::NONE)
+    if (params.output.GetDType() == Datatype::F16)
+        return Datatype::F16;
+    else
         return Datatype::F32;
-
-    return GetUnitType(params);
 }
 
 
@@ -78,12 +81,17 @@ JitConstants PoolingKernelBase::GetJitConstants(const pooling_params& pp, Poolin
 
 // Checks if we need boundary checking in kernel.
 bool PoolingKernelBase::NeedsBoundaryCheck(const pooling_params& pp) const {
+    const auto& input = pp.inputs[0];
+    const auto& output = pp.output;
+
     if (pp.poolPad.x != 0 || pp.poolPad.y != 0 || pp.poolPad.z != 0) {
         return true;
+    } else if ((((input.X().v - pp.poolSize.x) / pp.poolStride.x) + 1) < output.X().v ||
+               (((input.Y().v - pp.poolSize.y) / pp.poolStride.y) + 1) < output.Y().v ||
+               (((input.Z().v - pp.poolSize.z) / pp.poolStride.z) + 1) < output.Z().v) {
+        return true;
     }
 
-    const auto& input = pp.inputs[0];
-
     if (input.X().v < pp.poolSize.x || input.Y().v < pp.poolSize.y || input.Z().v < pp.poolSize.z) {
         return true;
     }
@@ -99,7 +107,7 @@ bool PoolingKernelBase::NeedsBoundaryCheck(const pooling_params& pp) const {
     return mod_x || mod_y || mod_z;
 }
 
-bool PoolingKernelBase::EnableRound(const kernel_selector::pooling_params &params) const {
+bool PoolingKernelBase::EnableRound(const kernel_selector::pooling_params& params) const {
     bool has_fused_quantize_to_int8 = false;
     for (auto& op : params.fused_ops) {
         if (op.GetType() == FusedOpType::QUANTIZE &&
@@ -108,7 +116,8 @@ bool PoolingKernelBase::EnableRound(const kernel_selector::pooling_params &param
         }
     }
 
-    if (!has_fused_quantize_to_int8 && (params.output.GetDType() == Datatype::INT8 || params.output.GetDType() == Datatype::UINT8) &&
+    if (!has_fused_quantize_to_int8 &&
+        (params.output.GetDType() == Datatype::INT8 || params.output.GetDType() == Datatype::UINT8) &&
         params.poolType == PoolType::AVG) {
         return true;
     }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
index ef90e978005c4d..5e20ef6349a8cf 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -20,12 +20,16 @@ ParamsKey PoolingKernelGPUAverageOpt::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
     k.EnableInputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::bfyx);
     k.EnablePoolType(PoolType::AVG);
     k.EnablePoolRemainder(PoolRemainder::FLOOR);
     k.EnablePoolRemainder(PoolRemainder::CEIL);
     k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
+    k.EnableDifferentTypes();
     return k;
 }
 
@@ -80,19 +84,19 @@ PoolingKernelBase::DispatchData PoolingKernelGPUAverageOpt::SetDefault(const poo
 
 JitConstants PoolingKernelGPUAverageOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto tileDims = GetTileDimentions();
-    auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd);
-
-    if (tileDims.y != 0 && tileDims.x != 0) {
-        mem_consts.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0));
-        mem_consts.AddConstant(MakeJitConstant("TILE_HEIGHT", tileDims.y));
-        mem_consts.AddConstant(MakeJitConstant("TILE_WIDTH", tileDims.x));
-        mem_consts.AddConstant(MakeJitConstant("ONE_OVER_POOL_SIZE", 1.f / (params.poolSize.x * params.poolSize.y)));
-    }
-
-    return mem_consts;
+    const auto tile = GetTileDimentions();
+    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+    // Tile constants are emitted only when both tile dimensions are non-zero.
+    if (tile.y == 0 || tile.x == 0)
+        return jit;
+
+    jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", kd.lws0));
+    jit.AddConstant(MakeJitConstant("TILE_HEIGHT", tile.y));
+    jit.AddConstant(MakeJitConstant("TILE_WIDTH", tile.x));
+    jit.AddConstant(MakeJitConstant("ONE_OVER_POOL_SIZE", 1.f / (params.poolSize.x * params.poolSize.y)));
+    return jit;
 }
 
 KernelsData PoolingKernelGPUAverageOpt::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_7);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
index 39b8ec97efd0f8..828434705fa1ce 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_average_opt.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,4 +30,4 @@ class PoolingKernelGPUAverageOpt : public PoolingKernelBase {
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     DispatchData SetDefault(const pooling_params& params) const override;
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp
index 31a2ac39c40cdd..aeb43373bd7fca 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.cpp
@@ -22,6 +22,8 @@ ParamsKey PoolingKernel_b_fs_yx_fsv16::GetSupportedKey() const {
     k.EnableOutputDataType(Datatype::F16);
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
     k.EnableInputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableOutputLayout(DataLayout::b_fs_yx_fsv16);
     k.EnableTensorOffset();
@@ -88,9 +90,36 @@ JitConstants PoolingKernel_b_fs_yx_fsv16::GetJitConstants(const pooling_params&
     jit.AddConstant(MakeJitConstant("INPUT_LINE_SIZE", input_line_size));
     jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", alignment));
     jit.AddConstant(MakeJitConstant("X_BLOCKS", CeilDiv(output.X().v, x_block_size)));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
     if (params.output.Feature().v % 16 != 0) {
         jit.AddConstant(MakeJitConstant("OUTPUT_LEFTOVERS", 1));
     }
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf_vec = {"_VEC",
+                                         {"b", "(f_block*16)", "y", "x"},
+                                         "pool_result",
+                                         input_dt,
+                                         x_block_size,
+                                         LoadType::LT_ALIGNED_READ,
+                                         BoundaryCheck::ENABLED,
+                                         IndexType::TENSOR_COORD,
+                                         Tensor::DataChannelName::X};
+        FusedOpsConfiguration conf_scalar = {"_SCALAR",
+                                            {"b", "(f_block*16)", "y", "(x+i)"},
+                                            "pool_result[i]",
+                                            input_dt,
+                                            1,
+                                            LoadType::LT_ALIGNED_READ,
+                                            BoundaryCheck::ENABLED,
+                                            IndexType::TENSOR_COORD,
+                                            Tensor::DataChannelName::X};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf_vec, conf_scalar}));
+    }
+
     return jit;
 }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h
index 90b7fb2c2ee412..4877d4f9d102f4 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv16.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2018-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,6 +30,11 @@ class PoolingKernel_b_fs_yx_fsv16 : public PoolingKernelBase {
     bool Validate(const Params&, const optional_params&) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     DispatchData SetDefault(const pooling_params& params) const override;
+    // Fused operations accepted by this kernel.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        using Op = FusedOpType;
+        return {Op::QUANTIZE, Op::SCALE, Op::ACTIVATION};
+    }
 
     size_t GetBlockSize(const pooling_params& params) const;
 };
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
index 621dd566fc2b23..606023653ebd30 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -66,19 +66,21 @@ JitConstants PoolingKerneGPU_b_fs_yx_fsv4::GetJitConstants(const pooling_params&
     const size_t in_y_pitch = 4 * params.inputs[0].X().LogicalDimPadded();
     jit.AddConstant(MakeJitConstant("IN_X_PITCH", in_x_pitch));
     jit.AddConstant(MakeJitConstant("IN_Y_PITCH", in_y_pitch));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
 
     if (!params.fused_ops.empty()) {
-        auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
-        FusedOpsConfiguration conf = { "",
-                                       {"b", "f", "y", "x"},
-                                       "pool_result",
-                                       input_dt,
-                                       4,
-                                       LoadType::LT_UNALIGNED,
-                                       BoundaryCheck::ENABLED,
-                                       IndexType::TENSOR_COORD,
-                                       Tensor::DataChannelName::FEATURE };
-        jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf = {"",
+                                     {"b", "f", "y", "x"},
+                                     "pool_result",
+                                     input_dt,
+                                     4,
+                                     LoadType::LT_UNALIGNED,
+                                     BoundaryCheck::ENABLED,
+                                     IndexType::TENSOR_COORD,
+                                     Tensor::DataChannelName::FEATURE};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
     }
 
     return jit;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h
index 6caf7c1c30c817..fd12d6526fa84c 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_b_fs_yx_fsv4.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp
index 4acba9efe4aec9..4088e22b30ebae 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@ ParamsKey PoolingKernelGPUBfyxBlockOpt::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputLayout(DataLayout::bfyx);
     k.EnableOutputLayout(DataLayout::bfyx);
     k.EnableTensorOffset();
@@ -48,12 +50,28 @@ PoolingKernelBase::DispatchData PoolingKernelGPUBfyxBlockOpt::SetDefault(const p
 }
 
 JitConstants PoolingKernelGPUBfyxBlockOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd);
-
-    mem_consts.AddConstant(
-        MakeJitConstant("BLOCK_SIZE_Y", params.poolSize.y + params.poolSize.y * params.poolStride.y - 1));
-
-    return mem_consts;
+    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+
+    const auto block_size_y = params.poolSize.y + params.poolSize.y * params.poolStride.y - 1;
+    jit.AddConstant(MakeJitConstant("BLOCK_SIZE_Y", block_size_y));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    // Nothing else to add when no operations are fused into this kernel.
+    if (params.fused_ops.empty())
+        return jit;
+
+    // Fused operations take "pool_result" typed as the activation type.
+    const auto fused_input_dt = GetActivationType(params);
+    FusedOpsConfiguration conf = {"",
+                                 {"b", "f", "y + i", "x"},
+                                 "pool_result",
+                                 fused_input_dt,
+                                 1,
+                                 LoadType::LT_UNALIGNED, BoundaryCheck::ENABLED,
+                                 IndexType::TENSOR_COORD, Tensor::DataChannelName::Y};
+    jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    return jit;
 }
 
 bool PoolingKernelGPUBfyxBlockOpt::Validate(const Params& p, const optional_params& o) const {
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h
index 77b94e7e2aa3c5..4b77a845df793a 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bfyx_block_opt.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,5 +30,10 @@ class PoolingKernelGPUBfyxBlockOpt : public PoolingKernelBase {
     bool Validate(const Params&, const optional_params&) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     DispatchData SetDefault(const pooling_params& params) const override;
+    // Lists the fused operation types this kernel supports.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return std::vector<FusedOpType>{
+            FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION};
+    }
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp
index a4714ab8d5ef75..93ae17541e286f 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.cpp
@@ -1,4 +1,3 @@
-//
 // Copyright (c) 2019-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
@@ -27,9 +26,11 @@ static const size_t batch_block_size = 16;
 ParamsKey PoolingKernel_bsv16_fsv16::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F32);
-    k.EnableOutputDataType(Datatype::F32);
     k.EnableInputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
     k.EnableOutputLayout(DataLayout::bs_fs_yx_bsv16_fsv16);
     k.EnableInputLayout(DataLayout::bs_fs_zyx_bsv16_fsv16);
@@ -44,6 +45,7 @@ ParamsKey PoolingKernel_bsv16_fsv16::GetSupportedKey() const {
     k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
     k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC);
     k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
+    k.EnableDifferentTypes();
     return k;
 }
 
@@ -105,6 +107,30 @@ JitConstants PoolingKernel_bsv16_fsv16::GetJitConstants(const pooling_params& pa
     jit.AddConstant(MakeJitConstant("MB_BLOCK", batch_block_size));
     jit.AddConstant(MakeJitConstant("IC_BLOCK", feature_block_size));
     jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+
+        std::vector<std::string> idx_order;
+        if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
+            idx_order = {"(b + BLOCK_NUM * 8)", "oc", "y", "x"};
+        } else if (DataTensor::ChannelsCount(params.output.GetLayout()) == 5) {
+            idx_order = {"(b + BLOCK_NUM * 8)", "oc", "z", "y", "x"};
+        }
+
+        FusedOpsConfiguration conf = {"",
+                                     idx_order,
+                                     "pool_result",
+                                     input_dt,
+                                     8,
+                                     LoadType::LT_ALIGNED_READ,
+                                     BoundaryCheck::ENABLED,
+                                     IndexType::TENSOR_COORD,
+                                     Tensor::DataChannelName::BATCH};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
 
     return jit;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h
index 57cdbd1004c4a9..fc2ebc258bd5da 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_bsv16_fsv16.h
@@ -1,5 +1,4 @@
-//
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -35,5 +34,10 @@ class PoolingKernel_bsv16_fsv16 : public PoolingKernelBase {
     bool Validate(const Params& p, const optional_params& o) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     DispatchData SetDefault(const pooling_params& params) const override;
+    // Fused operations accepted by this kernel.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        using Op = FusedOpType;
+        return {Op::QUANTIZE, Op::SCALE, Op::ACTIVATION};
+    }
 };
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp
index 6818394affe9b7..70d1d655c3d051 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -62,18 +62,20 @@ JitConstants PoolingKerneGPU_byxf_af32::GetJitConstants(const pooling_params& pa
     JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);
 
     jit.AddConstant(MakeJitConstant("AS_INPUT_TYPE(val)", "as_" + toCLType(params.inputs[0].GetDType()) + "4(val)"));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
 
     if (!params.fused_ops.empty()) {
-        auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
-        FusedOpsConfiguration conf = { "",
-                                       {"b", "f", "y", "x"},
-                                       "pool_result",
-                                       input_dt,
-                                       4,
-                                       LoadType::LT_UNALIGNED,
-                                       BoundaryCheck::ENABLED,
-                                       IndexType::TENSOR_COORD,
-                                       Tensor::DataChannelName::FEATURE };
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf = {"",
+                                     {"b", "f", "y", "x"},
+                                     "fused_pool_result",
+                                     input_dt,
+                                     4,
+                                     LoadType::LT_UNALIGNED,
+                                     BoundaryCheck::ENABLED,
+                                     IndexType::TENSOR_COORD,
+                                     Tensor::DataChannelName::FEATURE};
         jit.Merge(MakeFusedOpsJitConstants(params, { conf }));
     }
 
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h
index c2bbb9f0feffd6..1ffc94bc545ebf 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_af32.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp
index bca6e0367e757a..b5d9e4759db0b4 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,10 +22,13 @@ ParamsKey PoolingKernelGPUByxfOpt::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputLayout(DataLayout::byxf);
     k.EnableOutputLayout(DataLayout::byxf);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
+    k.EnableDifferentTypes();
     k.EnableBatching();
     k.EnablePoolType(PoolType::MAX);
     k.EnablePoolType(PoolType::AVG);
@@ -46,9 +49,24 @@ PoolingKernelBase::DispatchData PoolingKernelGPUByxfOpt::SetDefault(const poolin
 }
 
 JitConstants PoolingKernelGPUByxfOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd);
-
-    return mem_consts;
+    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    // Nothing else to add when no operations are fused into this kernel.
+    if (params.fused_ops.empty())
+        return jit;
+
+    const auto fused_input_dt = GetActivationType(params);
+    FusedOpsConfiguration conf = {"",
+                                 {"b", "f + i", "y", "x"},
+                                 "pool_result",
+                                 fused_input_dt,
+                                 1,
+                                 LoadType::LT_UNALIGNED, BoundaryCheck::ENABLED,
+                                 IndexType::TENSOR_COORD, Tensor::DataChannelName::FEATURE};
+    jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    return jit;
 }
 
 bool PoolingKernelGPUByxfOpt::Validate(const Params& p, const optional_params& o) const {
@@ -71,4 +89,4 @@ bool PoolingKernelGPUByxfOpt::Validate(const Params& p, const optional_params& o
 KernelsData PoolingKernelGPUByxfOpt::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_7);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h
index 0678f7f059037f..5c6547706b89b1 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_opt.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,5 +30,10 @@ class PoolingKernelGPUByxfOpt : public PoolingKernelBase {
     bool Validate(const Params&, const optional_params&) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     DispatchData SetDefault(const pooling_params& params) const override;
+    // Lists the fused operation types this kernel supports.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return std::vector<FusedOpType>{
+            FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION};
+    }
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp
index 02f0f90f91089f..655f1648d0b425 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@ ParamsKey PoolingKernelGPUByxfPaddingOpt::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputLayout(DataLayout::byxf);
     k.EnableOutputLayout(DataLayout::byxf);
     k.EnableTensorOffset();
@@ -32,6 +34,7 @@ ParamsKey PoolingKernelGPUByxfPaddingOpt::GetSupportedKey() const {
     k.EnablePoolRemainder(PoolRemainder::FLOOR);
     k.EnablePoolRemainder(PoolRemainder::CEIL);
     k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
+    k.EnableDifferentTypes();
     return k;
 }
 
@@ -46,9 +49,16 @@ PoolingKernelBase::DispatchData PoolingKernelGPUByxfPaddingOpt::SetDefault(const
 }
 
 JitConstants PoolingKernelGPUByxfPaddingOpt::GetJitConstants(const pooling_params& params, DispatchData kd) const {
-    auto mem_consts = PoolingKernelBase::GetJitConstants(params, kd);
-
-    return mem_consts;
+    auto jit = PoolingKernelBase::GetJitConstants(params, kd);
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    // Nothing else to add when no operations are fused into this kernel.
+    if (params.fused_ops.empty())
+        return jit;
+    FusedOpsConfiguration conf = {"", {"b", "f + i", "y", "x"}, "pool_result", GetActivationType(params), 1};
+    jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    return jit;
 }
 
 bool PoolingKernelGPUByxfPaddingOpt::Validate(const Params& p, const optional_params& o) const {
@@ -66,4 +76,4 @@ bool PoolingKernelGPUByxfPaddingOpt::Validate(const Params& p, const optional_pa
 KernelsData PoolingKernelGPUByxfPaddingOpt::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_8);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h
index 64d0a3af6acdea..f7566aac68a03d 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_byxf_padding_opt.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,5 +30,10 @@ class PoolingKernelGPUByxfPaddingOpt : public PoolingKernelBase {
     bool Validate(const Params&, const optional_params&) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     DispatchData SetDefault(const pooling_params& params) const override;
+    // Fused-operation types this kernel reports as supported.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return std::vector<FusedOpType>{
+            FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION };
+    }
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp
index 71e96934dc66df..b963162bc18f49 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.cpp
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,7 +19,11 @@ namespace kernel_selector {
 ParamsKey PoolingKerneGPU_fs_b_yx_fsv32::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputLayout(DataLayout::fs_b_yx_fsv32);
     k.EnableOutputLayout(DataLayout::fs_b_yx_fsv32);
     k.EnableTensorOffset();
@@ -34,6 +38,7 @@ ParamsKey PoolingKerneGPU_fs_b_yx_fsv32::GetSupportedKey() const {
     k.EnablePoolKernelDividerMode(KernelDividerMode::DYNAMIC_WITH_PADDING);
     k.EnableSubGroup();
     k.EnableSubGroupShort();
+    k.EnableDifferentTypes();
     return k;
 }
 
@@ -75,6 +80,22 @@ JitConstants PoolingKerneGPU_fs_b_yx_fsv32::GetJitConstants(const pooling_params
     if (pp.poolSize.x >= 7 && pp.poolSize.y >= 7 && pp.poolType == PoolType::AVG) {
         jit.AddConstant(MakeJitConstant("USE_FLOAT_ACC", true));
     }
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf = {"",
+                                     {"b", "fs", "out_y", "out_x"},
+                                     "pool_result",
+                                     input_dt,
+                                     2,
+                                     LoadType::LT_ALIGNED_READ,
+                                     BoundaryCheck::ENABLED,
+                                     IndexType::TENSOR_COORD,
+                                     Tensor::DataChannelName::FEATURE};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
 
     return jit;
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h
index 5db49e66e20587..5bb61fa3309994 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_b_yx_fsv32.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -30,5 +30,10 @@ class PoolingKerneGPU_fs_b_yx_fsv32 : public PoolingKernelBase {
 protected:
     bool Validate(const Params& p, const optional_params& o) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+    // Reports which fused-operation types this kernel supports.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        std::vector<FusedOpType> supported = { FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION };
+        return supported;
+    }
 };
 }  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
index 0fb5fbdc79c774..9f5a25204402d6 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2018 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -20,6 +20,9 @@ ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
     k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
     k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
     k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
     k.EnableTensorOffset();
@@ -68,12 +71,41 @@ JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetJitConstants(const pooling_
     jit.AddConstant(MakeJitConstant("IN_B_BLOCK_PITCH", in_b_block_pitch));
     jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
     jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf = {"",
+                                     {"b + bi", "f", "y", "x"},
+                                     "char_result",
+                                     input_dt,
+                                     4,
+                                     LoadType::LT_UNALIGNED,
+                                     BoundaryCheck::ENABLED,
+                                     IndexType::TENSOR_COORD,
+                                     Tensor::DataChannelName::FEATURE};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
 
     return jit;
 }
 
+// Accepts params that pass the base-class validation, except quantized average pooling.
+bool PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::Validate(const Params& params, const optional_params& options) const {
+    if (!PoolingKernelBase::Validate(params, options)) {
+        return false;
+    }
+
+    // Bind by const reference: plain `auto` deduces pooling_params by value and copies the whole object.
+    const auto& p = dynamic_cast<const pooling_params&>(params);
+
+    // AVG pooling combined with any quantization mode other than NONE is rejected by this kernel.
+    const bool quantized_avg = p.quantization != QuantizationType::NONE && p.poolType == PoolType::AVG;
+    return !quantized_avg;
+}
 KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32::GetKernelsData(const Params& params,
                                                                 const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_2);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
index 6ac996cc60e9f8..307b426a5635a1 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2018 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -29,5 +29,11 @@ class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32 : public PoolingKernelBase {
 
 protected:
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+    bool Validate(const Params&, const optional_params&) const override;
+    // Fused-operation types this kernel reports as supported.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return std::vector<FusedOpType>{
+            FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION };
+    }
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
index 16c566544bc915..34f97ab9cc963f 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2018 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -20,13 +20,15 @@ ParamsKey PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetSupportedKey() const {
     ParamsKey k;
     k.EnableInputDataType(Datatype::INT8);
     k.EnableOutputDataType(Datatype::INT8);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
     k.EnableInputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
     k.EnableOutputLayout(DataLayout::fs_bs_yx_bsv4_fsv32);
     k.EnableTensorOffset();
     k.EnableTensorPitches();
     k.EnableBatching();
     k.EnablePoolType(PoolType::MAX);
-    //        k.EnablePoolType(PoolType::AVG);
     k.EnablePoolRemainder(PoolRemainder::FLOOR);
     k.EnablePoolRemainder(PoolRemainder::CEIL);
     k.EnablePoolKernelDividerMode(KernelDividerMode::FIXED);
@@ -77,6 +79,22 @@ JitConstants PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetJitConstants(const p
     jit.AddConstant(MakeJitConstant("IN_F_BLOCK_PITCH", in_f_block_pitch));
     jit.AddConstant(MakeJitConstant("IN_OFFSET", in_offset));
     jit.AddConstant(MakeJitConstant("BATCH_SG_COUNT", get_batch_sub_groups_count(params)));
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+
+    if (!params.fused_ops.empty()) {
+        auto input_dt = GetActivationType(params);
+        FusedOpsConfiguration conf = {"",
+                                     {"b", "f", "y", "x"},
+                                     "pool_result",
+                                     input_dt,
+                                     4,
+                                     LoadType::LT_UNALIGNED,
+                                     BoundaryCheck::ENABLED,
+                                     IndexType::TENSOR_COORD,
+                                     Tensor::DataChannelName::FEATURE};
+        jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
+    }
 
     return jit;
 }
@@ -85,4 +103,4 @@ KernelsData PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32::GetKernelsData(const Par
                                                                        const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_1);
 }
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
index bb3fbf0dde4e13..3e2de8f1a3b57a 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_fs_bs_yx_bsv4_fsv32_simd32.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2018 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -29,5 +29,10 @@ class PoolingKerneGPU_fs_bs_yx_bsv4_fsv32_simd32 : public PoolingKernelBase {
 
 protected:
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
+    // Reports which fused-operation types this kernel supports.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        std::vector<FusedOpType> supported = { FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION };
+        return supported;
+    }
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp
index 71524b83512453..beedfe9e995b26 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -63,9 +63,12 @@ KernelsData PoolingKernelGPUInt8Ref::GetKernelsData(const Params& params, const
 
 JitConstants PoolingKernelGPUInt8Ref::GetJitConstants(const pooling_params& params, DispatchData kd) const {
     JitConstants jit = PoolingKernelBase::GetJitConstants(params, kd);
+    jit.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    jit.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
 
     if (!params.fused_ops.empty()) {
-        auto input_dt = EnableRound(params) ? Datatype::INT32 : GetActivationType(params);
+        auto input_dt = GetActivationType(params);
+
         std::vector<std::string> idx_order;
         if (DataTensor::ChannelsCount(params.output.GetLayout()) == 4) {
             idx_order = {"b", "f", "y", "x"};
@@ -73,7 +76,7 @@ JitConstants PoolingKernelGPUInt8Ref::GetJitConstants(const pooling_params& para
             idx_order = {"b", "f", "z", "y", "x"};
         }
 
-        FusedOpsConfiguration conf = {"", idx_order, "pool_res", input_dt, 1 };
+        FusedOpsConfiguration conf = {"", idx_order, "pool_result", input_dt, 1 };
         jit.Merge(MakeFusedOpsJitConstants(params, {conf}));
     }
 
@@ -88,7 +91,8 @@ bool PoolingKernelGPUInt8Ref::Validate(const Params& params, const optional_para
 
     if (p.inputs[0].GetDType() == Datatype::INT8 || p.inputs[0].GetDType() == Datatype::UINT8) {
         // Max pooling doesn't change quantization ranges, so output data type should be the same as input
-        if ((p.poolType == PoolType::MAX || p.poolType == PoolType::MAX_WITH_ARGMAX) && p.output.GetDType() != p.inputs[0].GetDType())
+        if ((p.poolType == PoolType::MAX || p.poolType == PoolType::MAX_WITH_ARGMAX)
+            && (p.output.GetDType() != p.inputs[0].GetDType()) && p.quantization == QuantizationType::NONE)
             return false;
 //         Average pooling should produce FP by default. (u)int8 is possible when quantize op is fused.
 //        if (p.poolType == PoolType::AVG &&
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h
index efef3e15433b68..6def2a4b290a4c 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_int8_ref.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -29,8 +29,7 @@ class PoolingKernelGPUInt8Ref : public PoolingKernelBase {
     bool Validate(const Params&, const optional_params&) const override;
     JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
     std::vector<FusedOpType> GetSupportedFusedOps() const override {
-        return { FusedOpType::ELTWISE,
-                 FusedOpType::QUANTIZE,
+        return { FusedOpType::QUANTIZE,
                  FusedOpType::SCALE,
                  FusedOpType::ACTIVATION };
     }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp
index 71e64f242c7d15..1f4bb273ec5fb6 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.cpp
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016-2019 Intel Corporation
+﻿// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@ ParamsKey PoolingKernelGPURef::GetSupportedKey() const {
     k.EnableInputDataType(Datatype::F32);
     k.EnableOutputDataType(Datatype::F16);
     k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
     k.EnableInputLayout(DataLayout::bfyx);
     k.EnableInputLayout(DataLayout::yxfb);
     k.EnableInputLayout(DataLayout::byxf);
@@ -53,6 +55,26 @@ ParamsKey PoolingKernelGPURef::GetSupportedKey() const {
     return k;
 }
 
+// Extends the base pooling JIT constants with ACTIVATION/ACCUMULATOR type constants and, when
+// fused ops are present, with the fused-op constants configured for the "pool_result" variable.
+JitConstants PoolingKernelGPURef::GetJitConstants(const pooling_params& params, DispatchData kd) const {
+    JitConstants constants = PoolingKernelBase::GetJitConstants(params, kd);
+    constants.Merge(MakeTypeJitConstants(GetActivationType(params), "ACTIVATION"));
+    constants.Merge(MakeTypeJitConstants(GetAccumulatorType(params), "ACCUMULATOR"));
+    if (params.fused_ops.empty())
+        return constants;
+    const auto activation_dt = GetActivationType(params);
+    const auto channels = DataTensor::ChannelsCount(params.output.GetLayout());
+    std::vector<std::string> coords;  // stays empty when the layout has neither 4 nor 5 channels
+    if (channels == 4)
+        coords = {"b", "f", "y", "x"};
+    else if (channels == 5)
+        coords = {"b", "f", "z", "y", "x"};
+    FusedOpsConfiguration fused_conf = {"", coords, "pool_result", activation_dt, 1};
+    constants.Merge(MakeFusedOpsJitConstants(params, {fused_conf}));
+    return constants;
+}
+
 KernelsData PoolingKernelGPURef::GetKernelsData(const Params& params, const optional_params& options) const {
     return GetCommonKernelsData(params, options, FORCE_PRIORITY_9);
 }
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h
index ff693f6be5cbfd..e42bcc8c77a1e9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_gpu_ref.h
@@ -1,4 +1,4 @@
-﻿// Copyright (c) 2016 Intel Corporation
+﻿// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -25,5 +25,13 @@ class PoolingKernelGPURef : public PoolingKernelBase {
 
     KernelsData GetKernelsData(const Params& params, const optional_params& options) const override;
     ParamsKey GetSupportedKey() const override;
+    // Fused-operation types this kernel reports as supported.
+    std::vector<FusedOpType> GetSupportedFusedOps() const override {
+        return std::vector<FusedOpType>{
+            FusedOpType::QUANTIZE, FusedOpType::SCALE, FusedOpType::ACTIVATION };
+    }
+
+protected:
+    JitConstants GetJitConstants(const pooling_params& params, DispatchData kd) const override;
 };
-}  // namespace kernel_selector
\ No newline at end of file
+}  // namespace kernel_selector
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
index 6b9fd4b27d667c..3177325901d5a0 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/pooling/pooling_kernel_selector.cpp
@@ -32,7 +32,7 @@ namespace kernel_selector {
 
 pooling_kernel_selector::pooling_kernel_selector() {
     Attach<PoolingKernelGPURef>();
-    // Attach<PoolingKernelGPUAverageOpt>(); TODO: fix the kernel as it reads out of bounds now
+    // Attach<PoolingKernelGPUAverageOpt>();  TODO: re-enable once the kernel no longer reads out of bounds
     Attach<PoolingKernelGPUByxfOpt>();
     Attach<PoolingKernelGPUBfyxBlockOpt>();
     Attach<PoolingKernelGPUByxfPaddingOpt>();
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
index 0adfb299c4f779..6af3b271d8b38f 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -161,9 +161,14 @@ KERNEL(convolution_bfyx_f16)(
     vec_t dst = INPUT0_VAL_ZERO;
 #endif  // BIAS_TERM
 
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
-    for (uint g = group; g < group + groups_per_sub_group; g++) {
+#if MULTIPLE_GROUPS_INPUT_PRELOAD
+    const uint in_split_offset = f_block * input_fs_pitch;
+    const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
+    const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
+#else
 #if GROUPED
+    for (uint g = group; g < group + groups_per_sub_group; g++) {
         const uint in_split_offset = g * input_fs_pitch * (FILTER_IFM_NUM / FEATURE_SLICE_SIZE);
         const uint filter_split_offset = g * FILTER_GROUPS_PITCH;
         const uint filter_offset = (f_block % (FILTER_OFM_NUM / FEATURE_SLICE_SIZE)) * filter_os_pitch;
@@ -173,11 +178,6 @@ KERNEL(convolution_bfyx_f16)(
         const uint filter_offset = f_block * filter_os_pitch;
 #endif  // GROUPED
         const uint grouped_filter_offset = filter_offset + filter_split_offset;
-#else
-        const uint in_split_offset = f_block * input_fs_pitch;
-        const uint g = lid / (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint ofm_in_group = lid % (FEATURE_SLICE_SIZE / groups_per_sub_group);
-        const uint grouped_filter_offset = (group + g) * FILTER_GROUPS_PITCH;
 #endif  // MULTIPLE_GROUPS_INPUT_PRELOAD
 
         const uint grouped_input_offset = input_offset + in_split_offset;
@@ -248,7 +248,11 @@ KERNEL(convolution_bfyx_f16)(
                     vec_t src;
                     __attribute__((opencl_unroll_hint(OUTPUT_X_BLOCK_SIZE)))
                     for (int i = 0; i < OUTPUT_X_BLOCK_SIZE; i++) {
+#if FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
+                        src[i] = line_cache[i];
+#else
                         src[i] = line_cache[kw*DILATION_SIZE_X + STRIDE_SIZE_X*i];
+#endif  // FILTER_SIZE_X == 1 && DILATION_SIZE_X == 1 && STRIDE_SIZE_X == 1
                     }
 #if MULTIPLE_GROUPS_INPUT_PRELOAD
                     typedef MAKE_VECTOR_TYPE(FILTER_TYPE, FILTER_IFM_NUM) ifm_vec_t;
@@ -345,9 +349,9 @@ KERNEL(convolution_bfyx_f16)(
                 }
             }
         }
-#ifndef MULTIPLE_GROUPS_INPUT_PRELOAD
+#if GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     }
-#endif  // MULTIPLE_GROUPS_INPUT_PRELOAD
+#endif  // GROUPED && !MULTIPLE_GROUPS_INPUT_PRELOAD
     dst = ACTIVATION(dst, ACTIVATION_PARAMS);
 
     typedef MAKE_VECTOR_TYPE(OUTPUT_TYPE, OUTPUT_X_BLOCK_SIZE) out_vec_t;
@@ -370,7 +374,7 @@ KERNEL(convolution_bfyx_f16)(
     else
 #endif  // OUTPUT_LEFTOVERS
     {
-        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + OUTPUT_X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE == 0) {
 #if HAS_FUSED_OPS
             FUSED_OPS_VEC;
             res = FUSED_OPS_RESULT_VEC;
@@ -390,8 +394,7 @@ KERNEL(convolution_bfyx_f16)(
 #   error convolution_gpu_bfyx_f16.cl: Unsupported output x block size.
 #endif
         } else {
-            const int x_tail = OUTPUT_SIZE_X - x;
-            for (int i = 0; i < x_tail; i++) {
+            for (int i = 0; i < OUTPUT_SIZE_X % OUTPUT_X_BLOCK_SIZE; i++) {
 #if HAS_FUSED_OPS
                 FUSED_OPS_SCALAR;
                 res[i] = FUSED_OPS_RESULT_SCALAR;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
index 25a2b36197a912..155ed590e73113 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/convolution_gpu_bfyx_f16_1x1.cl
@@ -208,21 +208,10 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
 #endif
     {
 #if !PADDED_OUTPUT
-        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y) {
-#if HAS_FUSED_OPS
-            FUSED_OPS_VEC;
-            dst = FUSED_OPS_RESULT_VEC;
-#endif
-#if X_BLOCK_SIZE == 8
-            UNIT_BLOCK_WRITE8(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 4
-            UNIT_BLOCK_WRITE4(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#elif X_BLOCK_SIZE == 2
-            UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
-#endif
-        } else {
+        if (xy * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X * OUTPUT_SIZE_Y || (OUTPUT_SIZE_X * OUTPUT_SIZE_Y) % X_BLOCK_SIZE == 0) {
 #else
-        if (x * X_BLOCK_SIZE + X_BLOCK_SIZE <= OUTPUT_SIZE_X) {
+        if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X || OUTPUT_SIZE_X % X_BLOCK_SIZE == 0) {
+#endif
 #if HAS_FUSED_OPS
             FUSED_OPS_VEC;
             dst = FUSED_OPS_RESULT_VEC;
@@ -235,7 +224,6 @@ KERNEL(convolution_b_fs_yx_fsv16_1x1)(
             UNIT_BLOCK_WRITE2(output, output_offset + y * output_y_pitch + x * output_x_pitch, dst);
 #endif
         } else {
-#endif
             for (int i = 0; i < X_BLOCK_SIZE; i++) {
                 if (xy * X_BLOCK_SIZE + i >= OUTPUT_SIZE_X * OUTPUT_SIZE_Y)
                     return;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl
index 57205585849bb0..a10c90a68c587b 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_average_opt.cl
@@ -17,7 +17,10 @@
 
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
-KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* output)
+KERNEL(pooling_gpu_average_opt)(
+    const __global INPUT0_TYPE* input,
+     __global OUTPUT_TYPE* output
+)
 {
     int local_id = get_local_id(0);
     int tile_x = get_global_id(0);
@@ -39,7 +42,7 @@ KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* out
     // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
     // In the diagram above X represents the current work item.
 
-    const __global float* base_addr = input + offset + (start_y * INPUT0_SIZE_X + start_x) - 1;
+    const __global INPUT0_TYPE* base_addr = input + offset + (start_y * INPUT0_SIZE_X + start_x) - 1;
 
     float input_buffer[3];
     input_buffer[0] = as_float(intel_sub_group_block_read((const __global uint*)(base_addr - INPUT0_SIZE_X)));
@@ -92,10 +95,12 @@ KERNEL(pooling_gpu_average_opt)(const __global float* input, __global float* out
             res = (sum + sum_1 + sum_2) * ONE_OVER_POOL_SIZE;
         }
 #endif
+        OUTPUT_TYPE final_result;
 
         if ((local_id < TILE_WIDTH) && (offset_x < INPUT0_SIZE_X))
         {
-            output[offset + y * INPUT0_SIZE_X + offset_x] = ACTIVATION(res, ACTIVATION_PARAMS);
+            final_result = TO_OUTPUT_TYPE(ACTIVATION(res, ACTIVATION_PARAMS));
+            output[offset + y * INPUT0_SIZE_X + offset_x] = final_result;
         }
 
         first = (first + 1) % 3;
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl
index 23bca7c504ae74..08c4bf32ae0bf5 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_b_fs_yx_fsv4.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -19,24 +19,26 @@
 #define ALIGN_TO(val, multiple) (((val) + (multiple) - 1) / (multiple) * (multiple))
 
 #define AS_TYPE(type, val) CAT(as_, type)(val)
-#define IN_VEC4 MAKE_VECTOR_TYPE(INPUT0_TYPE, 4)
-#define OUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
-#define CONVERT_OUT CAT(convert_, OUTPUT_TYPE)
-#define CONVERT_OUT_VEC4 CAT(convert_, OUT_VEC4)
+#define INPUT_VEC4 MAKE_VECTOR_TYPE(INPUT0_TYPE, 4)
 
-#if MAX_POOLING
-    #define INIT_VAL CHAR_MIN
-#elif AVG_POOLING
-    #define INIT_VAL 0
+#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
+#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
+
+#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
+#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
+
+#if   defined MAX_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
+#elif defined AVG_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-
-inline int FUNC(apply_pooling)(int tmp, int in)
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
 {
 #if MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
@@ -59,7 +61,7 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
 
-    int result[4] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
+    ACCUMULATOR_TYPE result[4] = { INIT_VAL, INIT_VAL, INIT_VAL, INIT_VAL };
 
 #ifdef CHECK_BOUNDRY
     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
@@ -88,11 +90,11 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
                     const uint input_idx = batch_and_feature_offset + input_offset_y*IN_Y_PITCH + input_offset_x*IN_X_PITCH;
 
                     int int_data   = *((const __global int*)(input + input_idx));
-                    IN_VEC4 ch4_data = AS_TYPE(IN_VEC4, int_data);
-                    result[0] = FUNC_CALL(apply_pooling)(result[0], (int)ch4_data[0]);
-                    result[1] = FUNC_CALL(apply_pooling)(result[1], (int)ch4_data[1]);
-                    result[2] = FUNC_CALL(apply_pooling)(result[2], (int)ch4_data[2]);
-                    result[3] = FUNC_CALL(apply_pooling)(result[3], (int)ch4_data[3]);
+                    INPUT_VEC4 ch4_data = AS_TYPE(INPUT_VEC4, int_data);
+                    result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(ch4_data[0]));
+                    result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(ch4_data[1]));
+                    result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(ch4_data[2]));
+                    result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(ch4_data[3]));
 
 #ifdef DYNAMIC_KERNEL_DIVIDER
                     num_elements++;
@@ -114,11 +116,11 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
         for(uint i = 0; i < POOL_SIZE_X; i++)
         {
             int int_data   = *((const __global int*)(input + input_idx));
-            IN_VEC4 ch4_data = AS_TYPE(IN_VEC4, int_data);
-            result[0] = FUNC_CALL(apply_pooling)(result[0], (int)ch4_data[0]);
-            result[1] = FUNC_CALL(apply_pooling)(result[1], (int)ch4_data[1]);
-            result[2] = FUNC_CALL(apply_pooling)(result[2], (int)ch4_data[2]);
-            result[3] = FUNC_CALL(apply_pooling)(result[3], (int)ch4_data[3]);
+            INPUT_VEC4 ch4_data = AS_TYPE(INPUT_VEC4, int_data);  // view the 32-bit word read above as a 4-lane INPUT0_TYPE vector
+            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(ch4_data[0]));
+            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(ch4_data[1]));
+            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(ch4_data[2]));
+            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(ch4_data[3]));
 
             input_idx += IN_X_PITCH;
         }
@@ -132,47 +134,48 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
 
 #if defined AVG_POOLING
 #if ENABLE_ROUND
-    int4 pool_result;
+    int4 not_fused_result;
     for(uint i = 0; i < 4; i++) {
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
+        not_fused_result[i] = convert_int(round(((float)result[i] / max(num_elements, (uint)1))));
     #else
-        result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
+        not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
     #endif
     }
 #else
-    float4 pool_result;
+    float4 not_fused_result;
     for(uint i = 0; i < 4; i++) {
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        pool_result[i] = (float)result[i] / max(num_elements, (uint)1);
+        not_fused_result[i] = (float)result[i] / max(num_elements, (uint)1);
     #else
-        pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
+        not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
     #endif
     }
 #endif  // ENABLE_ROUND
 #else  // AVG_POOLING
-    int4 pool_result;
+    int4 not_fused_result;
     for (uint i = 0; i < 4; ++i) {
-        pool_result[i] = result[i];
+        not_fused_result[i] = result[i];
     }
 #endif  // AVG_POOLING
 
+    ACTIVATION_VEC4 pool_result = TO_ACTIVATION_VEC4(not_fused_result);
+
 #if HAS_FUSED_OPS
     FUSED_OPS;
-    OUT_VEC4 final_result = FUSED_OPS_RESULT;
+    OUTPUT_VEC4 final_result = FUSED_OPS_RESULT;
 #else
-    OUT_VEC4 final_result = CONVERT_OUT_VEC4(pool_result);
-#endif
-
+    OUTPUT_VEC4 final_result = TO_OUTPUT_VEC4(pool_result);
     for(uint op = 0; op < 4; op++)
     {
         final_result[op] = ACTIVATION(final_result[op], ACTIVATION_PARAMS);
     }
+#endif
 
 #if OUTPUT_LAYOUT_B_FS_YX_FSV4 || OUTPUT_LAYOUT_BYXF_AF32
     const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
 #if OUTPUT_FEATURE_NUM % 4 == 0
-    *((__global OUT_VEC4*)(output + output_pos)) = final_result;
+    *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
 #else
     for (uint i = 0; i < 4; ++i) {
         if (f + i < OUTPUT_FEATURE_NUM) {
@@ -191,8 +194,12 @@ KERNEL(pooling_gpu_b_fs_yx_fsv4)(
 
 #undef ALIGN_TO
 #undef AS_TYPE
-#undef IN_VEC4
-#undef OUT_VEC4
-#undef CONVERT_OUT
-#undef CONVERT_OUT_VEC4
+
 #undef INIT_VAL
+#undef INPUT_VEC4
+
+#undef ACTIVATION_VEC4
+#undef TO_ACTIVATION_VEC4
+
+#undef OUTPUT_VEC4
+#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl
index a72c5bbacd89cc..960d4933ddf03e 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bfyx_block_opt.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2017 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,57 +16,58 @@
 #include "include/include_all.cl"
 
 #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_MIN
-#elif AVG_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_ZERO
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
+#elif defined AVG_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-
-inline UNIT_TYPE FUNC(apply_pooling)(UNIT_TYPE tmp, UNIT_TYPE in)
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
 {
 #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
 }
 
-KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
+KERNEL(pooling_gpu)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
 #if MAX_WITH_ARGMAX_POOLING
-, __global float* arg_max
+    , __global float* arg_max
+#endif
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
 #endif
 )
 {
-
     const uint x    = (uint)get_global_id(0);
     const uint y    = (uint)get_global_id(1) * POOL_SIZE_Y;
     const uint bf   = (uint)get_global_id(2);
     const uint f    = bf % INPUT0_FEATURE_NUM;
     const uint b    = bf / INPUT0_FEATURE_NUM;
-    
+
     if ((x >= OUTPUT_SIZE_X) || (y >= OUTPUT_SIZE_Y))
         return;
 
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    
-    UNIT_TYPE result = UNIT_INIT_VAL;
-    
+
     uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x);
-    UNIT_TYPE max_x[BLOCK_SIZE_Y];
-    UNIT_TYPE out[POOL_SIZE_Y];
+    ACCUMULATOR_TYPE max_x[BLOCK_SIZE_Y];
+    ACCUMULATOR_TYPE result[POOL_SIZE_Y];
 
 #if MAX_WITH_ARGMAX_POOLING
     uint arg_max_x[BLOCK_SIZE_Y] = { 0 };
-    uint arg_max_out[POOL_SIZE_Y] = { 0 };
+    uint arg_max_result[POOL_SIZE_Y] = { 0 };
     uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b));
 #endif
 
     for(uint i = 0; i < BLOCK_SIZE_Y; i++)
     {
-        max_x[i] = UNIT_INIT_VAL;
+        max_x[i] = INIT_VAL;
     }
 
     // we do max in "x" dimension
@@ -79,7 +80,7 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
             if(input[input_idx] > max_x[j])
                 arg_max_x[j] = input_idx_bfyx_no_padding;
 #endif
-            max_x[j] = FUNC_CALL(apply_pooling)(max_x[j], input[input_idx]);
+            max_x[j] = FUNC_CALL(apply_pooling)(max_x[j], TO_ACCUMULATOR_TYPE(input[input_idx]));
             input_idx += INPUT0_X_PITCH;
 
 #if MAX_WITH_ARGMAX_POOLING
@@ -96,10 +97,10 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
 
     for(uint i = 0; i < POOL_SIZE_Y; i++)
     {
-        out[i] = max_x[i * STRIDE_SIZE_Y];
+        result[i] = max_x[i * STRIDE_SIZE_Y];
 
 #if MAX_WITH_ARGMAX_POOLING
-        arg_max_out[i] = arg_max_x[i * STRIDE_SIZE_Y];
+        arg_max_result[i] = arg_max_x[i * STRIDE_SIZE_Y];
 #endif
     }
 
@@ -110,11 +111,11 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
         {
 
 #if MAX_WITH_ARGMAX_POOLING
-            if(max_x[j + i * STRIDE_SIZE_Y] > out[i])
-                arg_max_out[i] = arg_max_x[j + i * STRIDE_SIZE_Y];
+            if(max_x[j + i * STRIDE_SIZE_Y] > result[i])
+                arg_max_result[i] = arg_max_x[j + i * STRIDE_SIZE_Y];
 #endif
 
-            out[i] = FUNC_CALL(apply_pooling)(out[i], max_x[j + i * STRIDE_SIZE_Y]);
+            result[i] = FUNC_CALL(apply_pooling)(result[i], max_x[j + i * STRIDE_SIZE_Y]);
         }
     }
 
@@ -124,22 +125,31 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
     uint arg_max_pos = GET_DATA_INDEX(INPUT1, b, f, y, x);
 #endif
 
+    OUTPUT_TYPE final_result;
+    ACTIVATION_TYPE pool_result;
+
     for(uint i = 0; i < POOL_SIZE_Y; i++)
     {
         if((y + i) < OUTPUT_SIZE_Y)
         {
 #if defined AVG_POOLING
-            out[i] /= (UNIT_TYPE)(POOL_SIZE_Y * POOL_SIZE_X);
+            result[i] /= TO_ACCUMULATOR_TYPE(POOL_SIZE_Y * POOL_SIZE_X);
 #endif
-            output[output_pos] = ACTIVATION(out[i], ACTIVATION_PARAMS);
+            pool_result = TO_ACTIVATION_TYPE(result[i]);
+        #if HAS_FUSED_OPS
+            FUSED_OPS;
+            final_result = FUSED_OPS_RESULT;
+        #else
+            final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+        #endif
+            output[output_pos] = final_result;
             output_pos += OUTPUT_Y_PITCH;
-
 #if MAX_WITH_ARGMAX_POOLING
-            arg_max[arg_max_pos] = arg_max_out[i];
+            arg_max[arg_max_pos] = arg_max_result[i];
             arg_max_pos += INPUT1_Y_PITCH;
 #endif
         }
     }
 }
 
-#undef UNIT_INIT_VAL
\ No newline at end of file
+#undef INIT_VAL
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl
index a9918ac970ef10..c20dbc1775a20b 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_blocked.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,25 +14,39 @@
 
 
 #include "include/include_all.cl"
-#include "include/unit_type.cl"
+#include "include/data_types.cl"
 
 #define FEATURE_SLICE_SIZE 16
 #if X_BLOCK_SIZE > 1
-#define vec_t MAKE_VECTOR_TYPE(UNIT_TYPE, X_BLOCK_SIZE)
+    #define INPUT_VAR_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, X_BLOCK_SIZE)
+    #define OUTPUT_VAR_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, X_BLOCK_SIZE)
+    #define ACCUMULATOR_VAR_TYPE MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, X_BLOCK_SIZE)
+    #define ACTIVATION_VAR_TYPE MAKE_VECTOR_TYPE(ACTIVATION_TYPE, X_BLOCK_SIZE)
 #else
-#define vec_t UNIT_TYPE
+    #define INPUT_VAR_TYPE INPUT0_TYPE
+    #define OUTPUT_VAR_TYPE OUTPUT_TYPE
+    #define ACCUMULATOR_VAR_TYPE ACCUMULATOR_TYPE
+    #define ACTIVATION_VAR_TYPE ACTIVATION_TYPE
 #endif
 
+#define TO_OUTPUT_VAR_TYPE(x) CAT(convert_, OUTPUT_VAR_TYPE)(x)
+#define TO_ACCUMULATOR_VAR_TYPE CAT(convert_, ACCUMULATOR_VAR_TYPE)
+#define TO_ACTIVATION_VAR_TYPE CAT(convert_, ACTIVATION_VAR_TYPE)
+
 #if   defined MAX_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif defined AVG_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_ZERO
-#else
-#error
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #endif
 
 __attribute__((intel_reqd_sub_group_size(16)))
-KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
+KERNEL(pooling_gpu_blocked)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const int lid = get_sub_group_local_id();
     const int f_block = get_group_id(1);
@@ -74,10 +88,10 @@ KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE*
                                (x + OUTPUT_PAD_BEFORE_SIZE_X) * output_x_pitch;
 
 
-    vec_t dst = (vec_t)UNIT_INIT_VAL;
+    ACCUMULATOR_VAR_TYPE dst = (ACCUMULATOR_VAR_TYPE)INIT_VAL;
 
 #if AVG_POOLING && (defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER))
-    UNIT_TYPE count;
+    ACCUMULATOR_TYPE count;
     if (lid < X_BLOCK_SIZE)
     {
 #if defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
@@ -91,10 +105,10 @@ KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE*
         int x_max = min(input_x + lid*STRIDE_SIZE_X + POOL_SIZE_X, INPUT0_SIZE_X);
         int y_max = min(input_y + POOL_SIZE_Y, INPUT0_SIZE_Y);
 #endif
-        count = (UNIT_TYPE)(1.f / (float)((y_max - y_min) * (x_max - x_min)));
+        count = TO_ACCUMULATOR_TYPE(1.f / (float)((y_max - y_min) * (x_max - x_min)));
     }
 
-    vec_t scale;
+    ACCUMULATOR_VAR_TYPE scale;
 #if X_BLOCK_SIZE > 1
     for (int i = 0; i < X_BLOCK_SIZE; i++)
         scale[i] = intel_sub_group_shuffle(count, i);
@@ -108,80 +122,138 @@ KERNEL(pooling_gpu_blocked)(const __global UNIT_TYPE* input, __global UNIT_TYPE*
         if (input_y + kh < 0 || input_y + kh >= INPUT0_SIZE_Y)
             continue;
 
-        UNIT_TYPE line_cache[INPUT_LINE_SIZE];
+        INPUT0_TYPE line_cache[INPUT_LINE_SIZE];
         for (int i = 0; i < INPUT_LINE_SIZE; i++) {
             if ((input_x + i) >= 0 && (input_x + i) < INPUT0_SIZE_X)
-                line_cache[i] = UNIT_BLOCK_READ(input, input_offset + kh*input_y_pitch + i*input_x_pitch);
+                line_cache[i] = DT_INPUT_BLOCK_READ(input, input_offset + kh*input_y_pitch + i*input_x_pitch);
             else
-                line_cache[i] = UNIT_INIT_VAL;
+                #if   defined MAX_POOLING
+                    line_cache[i] = INPUT0_VAL_MIN;
+                #elif defined AVG_POOLING
+                    line_cache[i] = INPUT0_VAL_ZERO;
+                #endif
         }
 
         __attribute__((opencl_unroll_hint(POOL_SIZE_X)))
         for (int kw = 0; kw < POOL_SIZE_X; kw++)
         {
-            vec_t src;
+            ACCUMULATOR_VAR_TYPE src;
 #if X_BLOCK_SIZE > 1
             for (int i = 0; i < X_BLOCK_SIZE; i++) {
-                src[i] = line_cache[kw + STRIDE_SIZE_X*i];
+                src[i] = TO_ACCUMULATOR_TYPE(line_cache[kw + STRIDE_SIZE_X*i]);
             }
 #else
-            src = line_cache[kw];
+            src = TO_ACCUMULATOR_VAR_TYPE(line_cache[kw]);
 #endif
 
 #if defined MAX_POOLING
-            dst = max(dst, src);
+            dst = ACCUMULATOR_MAX_FUNC(dst, src);
 #elif defined AVG_POOLING
             dst += src;
 #endif
         }
     }
 
+    ACTIVATION_VAR_TYPE pool_result;
+
 #if defined MAX_POOLING
-    dst = ACTIVATION(dst, ACTIVATION_PARAMS);
+        pool_result = TO_ACTIVATION_VAR_TYPE(dst);  // accumulator -> activation type
+    #if !HAS_FUSED_OPS  // standalone activation only when no fused ops are attached
+        pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
+    #endif
 #elif defined AVG_POOLING && (defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER))
-    dst = ACTIVATION((dst*scale), ACTIVATION_PARAMS);
+        pool_result = TO_ACTIVATION_VAR_TYPE(dst*scale);  // scale holds the per-x reciprocal window size computed above
+    #if !HAS_FUSED_OPS
+        pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
+    #endif
 #elif defined AVG_POOLING
-    dst = ACTIVATION((dst/(POOL_SIZE_X*POOL_SIZE_Y)), ACTIVATION_PARAMS);
+        pool_result = TO_ACTIVATION_VAR_TYPE(dst/(POOL_SIZE_X*POOL_SIZE_Y));  // fixed divider: full pooling window
+    #if !HAS_FUSED_OPS
+        pool_result = ACTIVATION(pool_result, ACTIVATION_PARAMS);
+    #endif
 #endif
 
+    OUTPUT_VAR_TYPE final_result;
+
 #if OUTPUT_LEFTOVERS
     if ((f_block+1)*FEATURE_SLICE_SIZE >= OUTPUT_FEATURE_NUM) {
         for (int i = 0; i < X_BLOCK_SIZE; i++) {
-            if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X)
+            if ((f_block*FEATURE_SLICE_SIZE + lid < OUTPUT_FEATURE_NUM) && (x + i) < OUTPUT_SIZE_X) {
 #if X_BLOCK_SIZE > 1
-                output[output_offset + i * output_x_pitch + lid] = dst[i];
+            #if HAS_FUSED_OPS  // per-element fused ops for lane i of the x block
+                FUSED_OPS_SCALAR;
+                final_result[i] = FUSED_OPS_RESULT_SCALAR;
+            #else
+                final_result[i] = TO_OUTPUT_TYPE(pool_result[i]);
+            #endif
+                output[output_offset + i * output_x_pitch + lid] = final_result[i];
 #else
-                output[output_offset + i * output_x_pitch + lid] = dst;
+            #if HAS_FUSED_OPS
+                FUSED_OPS_VEC;
+                final_result = FUSED_OPS_RESULT_VEC;
+            #else
+                final_result = TO_OUTPUT_VAR_TYPE(pool_result);
+            #endif
+                output[output_offset + i * output_x_pitch + lid] = final_result;
+
 #endif
+            }
         }
     }
     else
 #endif  // OUTPUT_LEFTOVERS
     if (x + X_BLOCK_SIZE <= OUTPUT_SIZE_X)
     {
-#if X_BLOCK_SIZE == 8
-        UNIT_BLOCK_WRITE8(output, output_offset, dst);
-#elif X_BLOCK_SIZE == 4
-        UNIT_BLOCK_WRITE4(output, output_offset, dst);
-#elif X_BLOCK_SIZE == 2
-        UNIT_BLOCK_WRITE2(output, output_offset, dst);
-#elif X_BLOCK_SIZE == 1
-        UNIT_BLOCK_WRITE(output, output_offset, dst);
-#endif
+        #if HAS_FUSED_OPS
+                FUSED_OPS_VEC;
+                final_result = FUSED_OPS_RESULT_VEC;
+        #else
+                final_result = TO_OUTPUT_VAR_TYPE(pool_result);
+        #endif
+
+        #if X_BLOCK_SIZE == 8
+                DT_OUTPUT_BLOCK_WRITE8(output, output_offset, final_result);
+        #elif X_BLOCK_SIZE == 4
+                DT_OUTPUT_BLOCK_WRITE4(output, output_offset, final_result);
+        #elif X_BLOCK_SIZE == 2
+                DT_OUTPUT_BLOCK_WRITE2(output, output_offset, final_result);
+        #elif X_BLOCK_SIZE == 1
+                DT_OUTPUT_BLOCK_WRITE(output, output_offset, final_result);
+        #endif
     }
     else
     {
         const int x_tail = OUTPUT_SIZE_X - x;
-        for (int i = 0; i < x_tail; i++)
+        for (int i = 0; i < x_tail; i++){
 #if X_BLOCK_SIZE > 1
-            UNIT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, dst[i]);
+        #if HAS_FUSED_OPS
+            FUSED_OPS_SCALAR;
+            final_result[i] = FUSED_OPS_RESULT_SCALAR;
+        #else
+            final_result[i] = TO_OUTPUT_TYPE(pool_result[i]);
+        #endif
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, final_result[i]);
 #else
-            UNIT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, dst);
+        #if HAS_FUSED_OPS
+            FUSED_OPS_VEC;
+            final_result = FUSED_OPS_RESULT_VEC;
+        #else
+            final_result = TO_OUTPUT_VAR_TYPE(pool_result);
+        #endif
+            DT_OUTPUT_BLOCK_WRITE(output, output_offset + i*output_x_pitch, final_result);
 #endif
+        }
     }
-
-
 }
 
-#undef UNIT_INIT_VAL
+#undef INIT_VAL
 #undef FEATURE_SLICE_SIZE
+
+#undef INPUT_VAR_TYPE
+#undef OUTPUT_VAR_TYPE
+#undef TO_OUTPUT_VAR_TYPE
+
+#undef ACCUMULATOR_VAR_TYPE
+#undef TO_ACCUMULATOR_VAR_TYPE
+#undef ACTIVATION_VAR_TYPE
+#undef TO_ACTIVATION_VAR_TYPE
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl
index e5bac34d0bec2d..c0fb62e8e050e0 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_bsv16_fsv16.cl
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019 Intel Corporation
+* Copyright (c) 2020 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -13,7 +13,6 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
-#include "include/unit_type.cl"
 #include "include/include_all.cl"
 
 #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
@@ -28,18 +27,29 @@
 #define HAS_PAD_Y (PADDING_SIZE_Y != 0)
 #define HAS_PAD_X (PADDING_SIZE_X != 0)
 
+#define INPUT_VEC8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+
+#define ACCUMULATOR_VEC8 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 8)
+#define TO_ACCUMULATOR_VEC8 CAT(convert_, ACCUMULATOR_VEC8)
+
+#define ACTIVATION_VEC8 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 8)
+#define TO_ACTIVATION_VEC8 CAT(convert_, ACTIVATION_VEC8)
+
+#define OUTPUT_VEC8 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)
+#define TO_OUTPUT_VEC8 CAT(convert_, OUTPUT_VEC8)
+
+#define unroll_for __attribute__((opencl_unroll_hint)) for
+
 #if MAX_POOLING
-#define INIT_VAL INPUT0_VAL_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif AVG_POOLING
-#define INIT_VAL 0
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #endif
 
-#define unroll_for __attribute__((opencl_unroll_hint)) for
-
-inline UNIT_TYPE8 FUNC(apply_pooling)(UNIT_TYPE8 tmp, UNIT_TYPE8 in)
+inline ACCUMULATOR_VEC8 FUNC(apply_pooling)(ACCUMULATOR_VEC8 tmp, ACCUMULATOR_VEC8 in)
 {
 #if MAX_POOLING
-    return INPUT0_MAX_FUNC(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
@@ -49,7 +59,13 @@ __attribute__((reqd_work_group_size(SUB_GROUP_SIZE, 1, 1)))
 #if SUB_GROUP_SIZE != 1
 __attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE)))
 #endif
-KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
+KERNEL(pooling_gpu_bsv16_fsv16)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const int oc = get_group_id(0) * OC_BLOCK;
     const int sp = get_group_id(1);
@@ -71,7 +87,7 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T
     int in_x = x * STRIDE_SIZE_X - PADDING_SIZE_X;
     int pool_elementes = 0;
 
-    __global UNIT_TYPE *dst_write0 = output
+    __global OUTPUT_TYPE *dst_write0 = output
             + b * OUTPUT_FEATURE_NUM * (OUTPUT_SIZE_Z * OUTPUT_SIZE_Y * OUTPUT_SIZE_X)
             + oc * (OUTPUT_SIZE_Z * OUTPUT_SIZE_Y * OUTPUT_SIZE_X) * OC_BLOCK
             + z * OUTPUT_SIZE_Y * OUTPUT_SIZE_X * OC_BLOCK * MB_BLOCK
@@ -84,8 +100,8 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T
             + in_y * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK
             + in_z * INPUT0_SIZE_Y_WITH_PADDING * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK;
 
-    UNIT_TYPE8 blockC00 = (UNIT_TYPE8)(INIT_VAL);
-    UNIT_TYPE8 blockC01 = (UNIT_TYPE8)(INIT_VAL);
+    ACCUMULATOR_VEC8 blockC00 = (ACCUMULATOR_VEC8)(INIT_VAL);
+    ACCUMULATOR_VEC8 blockC01 = (ACCUMULATOR_VEC8)(INIT_VAL);
 
 #if ((HAS_PAD_Z && POOL_SIZE_Z == 1) || (HAS_PAD_Y && POOL_SIZE_Y == 1) || (HAS_PAD_X && POOL_SIZE_X == 1))
     if (!(in_z < 0 || in_z >= INPUT0_SIZE_Z_WITH_PADDING || in_y < 0 || in_y >= INPUT0_SIZE_Y_WITH_PADDING || in_x < 0 || in_x >= INPUT0_SIZE_X_WITH_PADDING)) {
@@ -105,26 +121,25 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T
 #endif
                     continue;
                 }
-
                 const uint idx = p_z * INPUT0_SIZE_Y_WITH_PADDING * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK
                                  + p_y * INPUT0_SIZE_X_WITH_PADDING * IC_BLOCK * MB_BLOCK
                                  + p_x * IC_BLOCK * MB_BLOCK;
-                const __global UNIT_TYPE *src1 = input + idx;
+                const __global INPUT0_TYPE *src1 = input + idx;
 #else
-                const __global UNIT_TYPE *src1 = input;
+                const __global INPUT0_TYPE *src1 = input;
 #endif
+                INPUT_VEC8 blockA;
 
-                UNIT_TYPE8 blockA;
+                blockA = DT_INPUT_BLOCK_READ8(src1, 0);
 
-                blockA = UNIT_BLOCK_READ8(src1, 0);
+                blockC00 = FUNC_CALL(apply_pooling)(blockC00, TO_ACCUMULATOR_VEC8(blockA));
 
-                blockC00 = FUNC_CALL(apply_pooling)(blockC00, blockA);
+                blockA = DT_INPUT_BLOCK_READ8(src1, 8 * IC_BLOCK);
 
-                blockA = UNIT_BLOCK_READ8(src1, 8 * IC_BLOCK);
-
-                blockC01 = FUNC_CALL(apply_pooling)(blockC01, blockA);
+                blockC01 = FUNC_CALL(apply_pooling)(blockC01, TO_ACCUMULATOR_VEC8(blockA));
 
                 pool_elementes++;
+
 #if POOL_SIZE_Y != 1 || POOL_SIZE_X != 1 || POOL_SIZE_Z != 1
             }
 #endif
@@ -135,20 +150,43 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T
 #if defined AVG_POOLING
 
 #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    blockC00 /= max(pool_elementes, (int)1);
-    blockC01 /= max(pool_elementes, (int)1);
+    blockC00 /= (ACCUMULATOR_TYPE)max(pool_elementes, (int)1);
+    blockC01 /= (ACCUMULATOR_TYPE)max(pool_elementes, (int)1);
 #else
-    blockC00 /= (POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
-    blockC01 /= (POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
+    blockC00 /= (ACCUMULATOR_TYPE)POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X;
+    blockC01 /= (ACCUMULATOR_TYPE)POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X;
 #endif
 
 #endif
-
-    blockC00 = ACTIVATION(blockC00, ACTIVATION_PARAMS);
-    blockC01 = ACTIVATION(blockC01, ACTIVATION_PARAMS);
-
-    UNIT_BLOCK_WRITE8(dst_write0, 0, blockC00);
-    UNIT_BLOCK_WRITE8(dst_write0, 8 * OC_BLOCK, blockC01);
+    ACTIVATION_VEC8 pool_result;
+    OUTPUT_VEC8 final_result;
+
+    #if HAS_FUSED_OPS
+    {
+        #define BLOCK_NUM 0
+        pool_result = TO_ACTIVATION_VEC8(blockC00);
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+        DT_OUTPUT_BLOCK_WRITE8(dst_write0, 0, final_result);
+        #undef BLOCK_NUM
+    }
+    {
+        #define BLOCK_NUM 1
+        pool_result = TO_ACTIVATION_VEC8(blockC01);
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+        DT_OUTPUT_BLOCK_WRITE8(dst_write0, 8 * OC_BLOCK, final_result);
+        #undef BLOCK_NUM
+    }
+    #else
+        pool_result = TO_ACTIVATION_VEC8(blockC00);
+        final_result = TO_OUTPUT_VEC8(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+        DT_OUTPUT_BLOCK_WRITE8(dst_write0, 0, final_result);
+
+        pool_result = TO_ACTIVATION_VEC8(blockC01);
+        final_result = TO_OUTPUT_VEC8(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+        DT_OUTPUT_BLOCK_WRITE8(dst_write0, 8 * OC_BLOCK, final_result);
+    #endif
 }
 
 #undef INPUT0_SIZE_X_WITH_PADDING
@@ -164,3 +202,13 @@ KERNEL(pooling_gpu_bsv16_fsv16)(const __global UNIT_TYPE* input, __global UNIT_T
 #undef HAS_PAD_X
 
 #undef unroll_for
+#undef INPUT_VEC8
+
+#undef ACCUMULATOR_VEC8
+#undef TO_ACCUMULATOR_VEC8
+
+#undef ACTIVATION_VEC8
+#undef TO_ACTIVATION_VEC8
+
+#undef OUTPUT_VEC8
+#undef TO_OUTPUT_VEC8
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl
index 9317c5113971c6..b3829ec6a96f7b 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_af32.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2017 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,22 +15,26 @@
 
 #include "include/include_all.cl"
 
-#define OUTPUT_TYPE4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
-#define TO_OUTPUT_TYPE4(x) CAT(convert_, OUTPUT_TYPE4)(x)
+#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
+#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
+
+#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
+
+#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
+#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
 
 #if MAX_POOLING
-    #define INIT_VAL INPUT0_VAL_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif AVG_POOLING
-    #define INIT_VAL 0
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-
-inline int FUNC(apply_pooling)(int tmp, int in)
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
 {
 #if MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
@@ -61,7 +65,7 @@ KERNEL(pooling_gpu_byxf_af32)(
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
 
-    int4 result = INIT_VAL;
+    ACCUMULATOR_VEC4 result = INIT_VAL;
 
 #ifdef CHECK_BOUNDRY
     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
@@ -90,10 +94,10 @@ KERNEL(pooling_gpu_byxf_af32)(
                     const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
 
                     input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
-                    result[0] = FUNC_CALL(apply_pooling)(result[0], (int)input_data[0]);
-                    result[1] = FUNC_CALL(apply_pooling)(result[1], (int)input_data[1]);
-                    result[2] = FUNC_CALL(apply_pooling)(result[2], (int)input_data[2]);
-                    result[3] = FUNC_CALL(apply_pooling)(result[3], (int)input_data[3]);
+                    result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
+                    result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
+                    result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
+                    result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
 
 #ifdef DYNAMIC_KERNEL_DIVIDER
                     num_elementes++;
@@ -115,10 +119,10 @@ KERNEL(pooling_gpu_byxf_af32)(
         for(uint i = 0; i < POOL_SIZE_X; i++)
         {
             input_t input_data = AS_INPUT_TYPE(intel_sub_group_block_read((const __global uint*)(input + input_idx)));
-            result[0] = FUNC_CALL(apply_pooling)(result[0], (int)input_data[0]);
-            result[1] = FUNC_CALL(apply_pooling)(result[1], (int)input_data[1]);
-            result[2] = FUNC_CALL(apply_pooling)(result[2], (int)input_data[2]);
-            result[3] = FUNC_CALL(apply_pooling)(result[3], (int)input_data[3]);
+            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
+            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
+            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
+            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
 
             input_idx += INPUT0_X_PITCH;
         }
@@ -132,44 +136,54 @@ KERNEL(pooling_gpu_byxf_af32)(
 
 #if defined AVG_POOLING
 #if ENABLE_ROUND
-    int4 pool_result;
+    int4 not_fused_result;
     for (uint i = 0; i < 4; ++i) {
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        pool_result[i] = convert_int(round(((float)result[i] / max(num_elementes, (uint)1)));
+        not_fused_result[i] = convert_int(round(((float)result[i] / max(num_elementes, (uint)1)));
     #else
-        pool_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
+        not_fused_result[i] = convert_int(round((float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
     #endif
     }
 #else  // ENABLE_ROUND
-    float4 pool_result;
+    float4 not_fused_result;
     for (uint i = 0; i < 4; ++i) {
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        pool_result[i] = (float)result[i] / max(num_elementes, (uint)1);
+        not_fused_result[i] = (float)result[i] / max(num_elementes, (uint)1);
     #else
-        pool_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
+        not_fused_result[i] = (float)result[i] / (int)(POOL_SIZE_Y * POOL_SIZE_X);
     #endif
     }
 #endif  // ENABLE_ROUND
 #else  // AVG_POOLING
-    int4 pool_result = result;
+    float4 not_fused_result = convert_float4(result);
 #endif  // AVG_POOLING
 
-    OUTPUT_TYPE4 final_result;
+    OUTPUT_VEC4 final_result;
 #if HAS_FUSED_OPS
+    ACTIVATION_VEC4 fused_pool_result = TO_ACTIVATION_VEC4(not_fused_result);
     FUSED_OPS;
     final_result = FUSED_OPS_RESULT;
+    for(uint op = 0; op < 4; op++)
+    {
+        const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
+        output[output_pos] = final_result[op];
+    }
 #else
-    final_result = TO_OUTPUT_TYPE4(pool_result);
+    final_result = TO_OUTPUT_VEC4(not_fused_result);
+    for(uint op = 0; op < 4; op++)
+    {
+        const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
+        final_result[op] = TO_OUTPUT_TYPE(ACTIVATION(not_fused_result[op], ACTIVATION_PARAMS));
+        output[output_pos] = final_result[op];
+    }
 #endif
-
-for(uint op = 0; op < 4; op++)
-{
-    const uint output_pos = GET_DATA_INDEX(OUTPUT, b, f+op, y, x);
-    output[output_pos] = ACTIVATION(TO_OUTPUT_TYPE(final_result[op]), ACTIVATION_PARAMS);
-}
-
 }
 
 #undef INIT_VAL
-#undef OUTPUT_TYPE4
-#undef TO_OUTPUT_TYPE4
+#undef ACCUMULATOR_VEC4
+
+#undef ACTIVATION_VEC4
+#undef TO_ACTIVATION_VEC4
+
+#undef OUTPUT_VEC4
+#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl
index 293f56f6960b53..54bcab6d1b5b96 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_opt.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,38 +15,48 @@
 
 #include "include/include_all.cl"
 
-#define VECTOR_TYPE MAKE_VECTOR_TYPE(UNIT_TYPE,8)
+#define INPUT_VEC8 MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+
+#define ACCUMULATOR_VEC8 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 8)
+#define TO_ACCUMULATOR_VEC8 CAT(convert_, ACCUMULATOR_VEC8)
+
 #define FEATURE_PER_ITEM 8
 #define FEATURE_BLOCK_NUM (OUTPUT_FEATURE_NUM / 8)
 
-#if   defined MAX_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_MIN
-#elif defined AVG_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_ZERO
+#if MAX_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
+#elif AVG_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-inline VECTOR_TYPE FUNC(apply_pooling)(VECTOR_TYPE tmp, VECTOR_TYPE in)
+inline ACCUMULATOR_VEC8 FUNC(apply_pooling)(ACCUMULATOR_VEC8 tmp, ACCUMULATOR_VEC8 in)
 {
 #if defined MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif defined AVG_POOLING
     return tmp + in;
 #endif
 }
 
-KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
+KERNEL(pooling_gpu_byxf_opt)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
-    VECTOR_TYPE out;
     const uint x    = (uint)get_global_id(0);
     const uint y    = (uint)get_global_id(1);
     const uint bf   = (uint)get_global_id(2);
     const uint f    = bf / INPUT0_BATCH_NUM * FEATURE_PER_ITEM;
     const uint b    = bf % INPUT0_BATCH_NUM;
-    
-    VECTOR_TYPE feature_block;
-    
+
+    INPUT_VEC8 feature_block;
+    ACCUMULATOR_VEC8 result;
+
     if ((x >= OUTPUT_SIZE_X) || (y >= OUTPUT_SIZE_Y))
         return;
 
@@ -54,8 +64,8 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
     const int offset_y = (int)y*STRIDE_SIZE_Y;
 
     int input_idx = b*FEATURE_BLOCK_NUM*INPUT0_SIZE_X*INPUT0_SIZE_Y + FEATURE_BLOCK_NUM*INPUT0_SIZE_X*offset_y + FEATURE_BLOCK_NUM*offset_x + bf / INPUT0_BATCH_NUM;
-    
-    out = UNIT_INIT_VAL;
+
+    result = INIT_VAL;
 
     __attribute__((opencl_unroll_hint))
     for(uint j = 0; j < POOL_SIZE_Y; j++)
@@ -64,20 +74,36 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
         for(uint i = 0; i < POOL_SIZE_X; i++)
         {
             feature_block = vload8(input_idx+FEATURE_BLOCK_NUM*i, input);
-            out = FUNC_CALL(apply_pooling)(out, feature_block);
+            result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_VEC8(feature_block));
         }
         input_idx += FEATURE_BLOCK_NUM*INPUT0_SIZE_X;
     }
 
+    OUTPUT_TYPE final_result;
+
     uint output_pos = GET_DATA_INDEX(OUTPUT, b, f, y, x);
     __attribute__((opencl_unroll_hint))
     for(uint i = 0; i < FEATURE_PER_ITEM; i++)
     {
         if(f+i < INPUT0_FEATURE_NUM){
 #if defined MAX_POOLING
-            output[output_pos+i] = ACTIVATION(out[i], ACTIVATION_PARAMS);
+        ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]);
+    #if HAS_FUSED_OPS
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+    #else
+        final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+    #endif
+        output[output_pos+i] = final_result;
 #elif defined AVG_POOLING
-            output[output_pos+i] = ACTIVATION(out[i]/(UNIT_TYPE)(POOL_SIZE_X*POOL_SIZE_Y), ACTIVATION_PARAMS);
+        ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]/(ACCUMULATOR_TYPE)(POOL_SIZE_X*POOL_SIZE_Y)); // divide in accumulator precision: OUTPUT_TYPE may be a narrow type in which the pool area wraps
+    #if HAS_FUSED_OPS
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+    #else
+        final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+    #endif
+        output[output_pos+i] = final_result;
 #endif
         }
     }
@@ -85,5 +111,9 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
 
 #undef FEATURE_BLOCK_NUM
 #undef FEATURE_PER_ITEM
-#undef UNIT_INIT_VAL
-#undef VECTOR_TYPE
\ No newline at end of file
+
+#undef INIT_VAL
+#undef INPUT_VEC8
+
+#undef ACCUMULATOR_VEC8
+#undef TO_ACCUMULATOR_VEC8
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl
index ca0969047d842d..72a5a1c4c770af 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_byxf_padding_opt.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,38 +15,48 @@
 
 #include "include/include_all.cl"
 
-#define VECTOR_TYPE MAKE_VECTOR_TYPE(UNIT_TYPE,8)
+#define INPUT0_VEC8 MAKE_VECTOR_TYPE(INPUT0_TYPE,8)
+
+#define ACCUMULATOR_VEC8 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 8)
+#define TO_ACCUMULATOR_VEC8 CAT(convert_, ACCUMULATOR_VEC8)
+
 #define FEATURE_PER_ITEM 8
-#define FEATURE_BLOCK_NUM (OUTPUT_FEATURE_NUM / 8)
+#define FEATURE_BLOCK_NUM (INPUT0_FEATURE_NUM / 8)
 
-#if   defined MAX_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_MIN
-#elif defined AVG_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_ZERO
+#if MAX_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
+#elif AVG_POOLING
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-inline VECTOR_TYPE FUNC(apply_pooling)(VECTOR_TYPE tmp, VECTOR_TYPE in)
+inline ACCUMULATOR_VEC8 FUNC(apply_pooling)(ACCUMULATOR_VEC8 tmp, ACCUMULATOR_VEC8 in)
 {
 #if   defined MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif defined AVG_POOLING
     return tmp + in;
 #endif
 }
 
-KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output)
+KERNEL(pooling_gpu_byxf_opt)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
-    VECTOR_TYPE out;
     const uint x    = (uint)get_global_id(0);
     const uint y    = (uint)get_global_id(1);
     const uint bf   = (uint)get_global_id(2);
     const uint f    = bf / INPUT0_BATCH_NUM * FEATURE_PER_ITEM;
     const uint b    = bf % INPUT0_BATCH_NUM;
-    
-    VECTOR_TYPE feature_block;
-    
+
+    INPUT0_VEC8 feature_block;
+    ACCUMULATOR_VEC8 result;
+
     if ((x >= OUTPUT_SIZE_X) || (y >= OUTPUT_SIZE_Y))
         return;
 
@@ -62,7 +72,7 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
 #endif
     int input_idx = b*FEATURE_BLOCK_NUM*INPUT0_SIZE_X*INPUT0_SIZE_Y + FEATURE_BLOCK_NUM*INPUT0_SIZE_X*offset_y + FEATURE_BLOCK_NUM*offset_x + bf / INPUT0_BATCH_NUM;
 
-    out = UNIT_INIT_VAL;
+    result = INIT_VAL;
 
     __attribute__((opencl_unroll_hint))
     for(uint j = 0; j < POOL_SIZE_Y; j++)
@@ -79,13 +89,15 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
                 if (!zero)
                 {
                     feature_block = vload8(input_idx+FEATURE_BLOCK_NUM*i, input);
-                    out = FUNC_CALL(apply_pooling)(out, feature_block);
+                    result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_VEC8(feature_block));
                 }
             }
         }
         input_idx += FEATURE_BLOCK_NUM*INPUT0_SIZE_X;
     }
 
+   OUTPUT_TYPE final_result;
+
     uint output_pos = GET_DATA_INDEX(OUTPUT, b, f, y, x);
     __attribute__((opencl_unroll_hint))
     for(uint i = 0; i < FEATURE_PER_ITEM; i++)
@@ -93,9 +105,23 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
         if(f+i < INPUT0_FEATURE_NUM)
         {
 #if defined MAX_POOLING
-            output[output_pos+i] = ACTIVATION(out[i], ACTIVATION_PARAMS);
+            ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]);
+        #if HAS_FUSED_OPS
+            FUSED_OPS;
+            final_result = FUSED_OPS_RESULT;
+        #else
+            final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+        #endif
+            output[output_pos+i] = final_result;
 #elif defined AVG_POOLING
-            output[output_pos+i] = ACTIVATION(out[i]/(UNIT_TYPE)(POOL_SIZE_X*POOL_SIZE_Y), ACTIVATION_PARAMS);
+            ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result[i]/(ACCUMULATOR_TYPE)(POOL_SIZE_X*POOL_SIZE_Y)); // divide in accumulator precision: OUTPUT_TYPE may be a narrow type in which the pool area wraps
+        #if HAS_FUSED_OPS
+            FUSED_OPS;
+            final_result = FUSED_OPS_RESULT;
+        #else
+            final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+        #endif
+           output[output_pos+i] = final_result;
 #endif
         }
     }
@@ -103,5 +129,9 @@ KERNEL(pooling_gpu_byxf_opt)(const __global UNIT_TYPE* input, __global UNIT_TYPE
 
 #undef FEATURE_BLOCK_NUM
 #undef FEATURE_PER_ITEM
-#undef UNIT_INIT_VAL
-#undef VECTOR_TYPE
\ No newline at end of file
+
+#undef INIT_VAL
+#undef INPUT0_VEC8
+
+#undef ACCUMULATOR_VEC8
+#undef TO_ACCUMULATOR_VEC8
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl
index aa3cc42040be79..7c98ee7953085c 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_b_yx_fsv32.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,25 +14,26 @@
 
 
 #include "include/include_all.cl"
-#include "include/unit_type.cl"
+#include "include/data_types.cl"
 
 #if MAX_POOLING
-    #define INIT_VAL UNIT_VAL_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif AVG_POOLING
-    #define INIT_VAL 0
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error No correct pooling mode defined
+    #error No correct pooling mode defined
 #endif
 
-#if defined(USE_FLOAT_ACC)
-    #define ACC_TYPE2 float2
-    #define READ_BLOCK2_INPUT(input, input_total_offset) convert_float2(UNIT_BLOCK_READ2(input,total_input_offset))
-    #define TO_UNIT_BLOCK2(values) convert_half2(values)
-#else
-    #define ACC_TYPE2 UNIT_TYPE2
-    #define READ_BLOCK2_INPUT(input, input_total_offset) UNIT_BLOCK_READ2(input,total_input_offset)
-    #define TO_UNIT_BLOCK2(values) values
-#endif
+#define INPUT_VEC2 MAKE_VECTOR_TYPE(INPUT0_TYPE, 2)
+
+#define ACCUMULATOR_VEC2 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 2)
+#define TO_ACCUMULATOR_VEC2 CAT(convert_, ACCUMULATOR_VEC2)
+
+#define ACTIVATION_VEC2 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 2)
+#define TO_ACTIVATION_VEC2 CAT(convert_, ACTIVATION_VEC2)
+
+#define OUTPUT_VEC2 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 2)
+#define TO_OUTPUT_VEC2 CAT(convert_, OUTPUT_VEC2)
 
 #define INPUT0_SIZE_X_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_X + INPUT0_SIZE_X + INPUT0_PAD_AFTER_SIZE_X)
 #define INPUT0_SIZE_Y_WITH_PADDING (INPUT0_PAD_BEFORE_SIZE_Y + INPUT0_SIZE_Y + INPUT0_PAD_AFTER_SIZE_Y)
@@ -46,10 +47,10 @@
 
 #define unroll_for __attribute__((opencl_unroll_hint)) for
 
-inline ACC_TYPE2 FUNC(apply_pooling)(ACC_TYPE2 tmp, ACC_TYPE2 in)
+inline ACCUMULATOR_VEC2 FUNC(apply_pooling)(ACCUMULATOR_VEC2 tmp, ACCUMULATOR_VEC2 in)
 {
 #if MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
@@ -57,8 +58,12 @@ inline ACC_TYPE2 FUNC(apply_pooling)(ACC_TYPE2 tmp, ACC_TYPE2 in)
 
 __attribute__((intel_reqd_sub_group_size(REQD_SUB_GROUP_SIZE)))
 KERNEL(pooling_gpu_fs_b_yx_fsv32)(
-    const __global UNIT_TYPE* input,
-    __global UNIT_TYPE* output)
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint out_x    = (uint)get_global_id(0);
     const uint out_y    = (uint)get_global_id(1);
@@ -69,12 +74,12 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
     const uint b  = bfs % INPUT0_BATCH_NUM;
     const uint fs = bfs / INPUT0_BATCH_NUM;
 
-    ACC_TYPE2 results = (ACC_TYPE2)(INIT_VAL,INIT_VAL);
+    ACCUMULATOR_VEC2 results  = (ACCUMULATOR_VEC2)(INIT_VAL,INIT_VAL);
 
     const uint x_pitch = REQD_FEATURE_SLICE_SIZE;                        // difference in location between (x+1) and (x)
     const uint y_pitch = x_pitch * INPUT0_SIZE_X_WITH_PADDING;           // difference in location between (y+1) and (y)
     const uint b_pitch = y_pitch * INPUT0_SIZE_Y_WITH_PADDING;           // difference in location between (b+1) and (b)
-    const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM;                     // difference in location between (fs+1) and (fs)
+    const uint fs_pitch = b_pitch * INPUT0_BATCH_NUM;                    // difference in location between (fs+1) and (fs)
 
     const int offset_x = (int)out_x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)out_y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
@@ -103,10 +108,8 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
                 {
                     const size_t input_offset_x = (offset_x + in_dx) * x_pitch;
                     const size_t total_input_offset = padding_offset + fs_offset + b_offset + input_offset_y + input_offset_x;
-
-                    ACC_TYPE2 tmp_input = READ_BLOCK2_INPUT(input, input_total_offset);
-                    
-                    results = FUNC_CALL(apply_pooling)(results, tmp_input);
+                    INPUT_VEC2 tmp_input = DT_INPUT_BLOCK_READ2(input, total_input_offset);
+                    results  = FUNC_CALL(apply_pooling)(results , TO_ACCUMULATOR_VEC2(tmp_input));
 
                     #ifdef DYNAMIC_KERNEL_DIVIDER
                         num_elements++;
@@ -115,6 +118,7 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
             }
         }
     }
+
 #ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
     const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
     const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
@@ -128,10 +132,8 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
         {
             const size_t input_offset_x = (offset_x + in_dx) * x_pitch;
             const size_t total_input_offset = padding_offset + fs_offset + b_offset + input_offset_y + input_offset_x;
-
-            ACC_TYPE2 tmp_input = READ_BLOCK2_INPUT(input, input_total_offset);
-
-            results = FUNC_CALL(apply_pooling)(results, tmp_input);
+            INPUT_VEC2 tmp_input = DT_INPUT_BLOCK_READ2(input, total_input_offset);
+            results = FUNC_CALL(apply_pooling)(results , TO_ACCUMULATOR_VEC2(tmp_input));
         }
     }
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
@@ -147,8 +149,6 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
     #endif
 #endif
 
-    results = ACTIVATION(results, ACTIVATION_PARAMS);
-
     const size_t out_x_pitch = REQD_FEATURE_SLICE_SIZE;
     const size_t out_y_pitch = out_x_pitch * OUTPUT_SIZE_X_WITH_PADDING;
     const size_t out_b_pitch = out_y_pitch * OUTPUT_SIZE_Y_WITH_PADDING;
@@ -166,9 +166,19 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
     const bool full_f = OUTPUT_FEATURE_NUM % REQD_FEATURE_SLICE_SIZE == 0 ||
                         fs * REQD_FEATURE_SLICE_SIZE + REQD_FEATURE_SLICE_SIZE <= OUTPUT_FEATURE_NUM;
 
+    OUTPUT_VEC2 final_result;
+    ACTIVATION_VEC2 pool_result = TO_ACTIVATION_VEC2(results);
+
+    #if HAS_FUSED_OPS
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+    #else
+        final_result = TO_OUTPUT_VEC2(ACTIVATION(pool_result , ACTIVATION_PARAMS));
+    #endif
+
     if (full_f)
     {
-        UNIT_BLOCK_WRITE2(output, output_offset, TO_UNIT_BLOCK2(results));
+        DT_OUTPUT_BLOCK_WRITE2(output, output_offset, final_result);
     }
     else
     {
@@ -176,14 +186,21 @@ KERNEL(pooling_gpu_fs_b_yx_fsv32)(
         {
             if (fs * REQD_FEATURE_SLICE_SIZE + ofi * REQD_SUB_GROUP_SIZE + sglid < OUTPUT_FEATURE_NUM)
             {
-                output[output_offset + ofi * REQD_SUB_GROUP_SIZE + sglid] = (UNIT_TYPE)results[ofi];
+                output[output_offset + ofi * REQD_SUB_GROUP_SIZE + sglid] = (OUTPUT_TYPE)final_result[ofi];
             }
         }
     }
 }
 
-#undef TO_UNIT_BLOCK2
-#undef READ_BLOCK2_INPUT
-#undef ACC_TYPE2
 #undef FEATURE_SLICE_SIZE
 #undef INIT_VAL
+#undef INPUT_VEC2
+
+#undef ACCUMULATOR_VEC2
+#undef TO_ACCUMULATOR_VEC2
+
+#undef ACTIVATION_VEC2
+#undef TO_ACTIVATION_VEC2
+
+#undef OUTPUT_VEC2
+#undef TO_OUTPUT_VEC2
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl
index d81490d7fa3130..4439732718cce3 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,19 +15,26 @@
 
 #include "include/include_all.cl"
 
+#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
+#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
+
+#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
+
+#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE,4)
+#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
+
 #if MAX_POOLING
-    #define INIT_VAL CHAR_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif AVG_POOLING
-    #define INIT_VAL 0
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-
-inline int FUNC(apply_pooling)(int tmp, int in)
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
 {
 #if MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
@@ -35,8 +42,12 @@ inline int FUNC(apply_pooling)(int tmp, int in)
 
 __attribute__((intel_reqd_sub_group_size(8)))
 KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
-    const __global UNIT_TYPE* input,
-    __global UNIT_TYPE* output)
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint x    = (uint)get_global_id(0);
     const uint y    = (uint)get_global_id(1);
@@ -44,8 +55,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
 	// we process 4 features per workitem that's why we need to divide it
     const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
     const uint f    = ((uint)get_global_id(2) * 4) % aligned32_features;
-    const uint b = 4 * (((uint)get_global_id(2) * 4) / aligned32_features);
-    
+    const uint b    = 4 * (((uint)get_global_id(2) * 4) / aligned32_features);
     if (x >= OUTPUT_SIZE_X)
     {
         return;
@@ -53,8 +63,7 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
 
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    
-    int4 result[4] = { INIT_VAL };
+    ACCUMULATOR_VEC4 result[4] = { (ACCUMULATOR_VEC4)(INIT_VAL), (ACCUMULATOR_VEC4)(INIT_VAL), (ACCUMULATOR_VEC4)(INIT_VAL), (ACCUMULATOR_VEC4)(INIT_VAL) }; // one accumulator per batch; a single-element initializer would zero-fill result[1..3]
 
 #ifdef CHECK_BOUNDRY
     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
@@ -86,13 +95,12 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
                     for(uint b = 0; b < 4; b++)
                     {
                         char4 input_data = as_char4(int_data[b]);
-                        result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]);
-                        result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]);
-                        result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]);
-                        result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]);
-
+                        result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0]));
+                        result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1]));
+                        result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2]));
+                        result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3]));
                     }
-                    
+
 #ifdef DYNAMIC_KERNEL_DIVIDER
                     num_elementes++;
 #endif
@@ -116,54 +124,104 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32)(
             for(uint b = 0; b < 4; b++)
             {
                 char4 input_data = as_char4(int_data[b]);
-                result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], (int)input_data[0]);
-                result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], (int)input_data[1]);
-                result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], (int)input_data[2]);
-                result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], (int)input_data[3]);
+                result[b][0] = FUNC_CALL(apply_pooling)(result[b][0], TO_ACCUMULATOR_TYPE(input_data[0]));
+                result[b][1] = FUNC_CALL(apply_pooling)(result[b][1], TO_ACCUMULATOR_TYPE(input_data[1]));
+                result[b][2] = FUNC_CALL(apply_pooling)(result[b][2], TO_ACCUMULATOR_TYPE(input_data[2]));
+                result[b][3] = FUNC_CALL(apply_pooling)(result[b][3], TO_ACCUMULATOR_TYPE(input_data[3]));
             }
 
             input_idx += IN_X_PITCH;
         }
         input_idx += (IN_Y_PITCH - POOL_SIZE_X*IN_X_PITCH);
     }
-    
+
 #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
     const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y;
 #endif
 #endif
 
 #if defined AVG_POOLING
-    #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-        for(uint b = 0; b < 4; b++)
-        {
-            for(uint i = 0; i < 4; i++)
+    #if ENABLE_ROUND
+        #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+            for(uint b = 0; b < 4; b++)
             {
-                result[b][i] = convert_int(round(((float)result[b][i] / max(num_elementes, (uint)1)));
+                for(uint i = 0; i < 4; i++)
+                {
+                    result[b][i] = TO_ACCUMULATOR_TYPE(round(((float)result[b][i] / max(num_elementes, (uint)1))));
+                }
             }
-        }
+        #else
+            for(uint b = 0; b < 4; b++)
+            {
+                for(uint i = 0; i < 4; i++)
+                {
+                    result[b][i] = TO_ACCUMULATOR_TYPE(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
+                }
+            }
+        #endif
     #else
-        for(uint b = 0; b < 4; b++)
-        {
-            for(uint i = 0; i < 4; i++)
+        #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
+            for(uint b = 0; b < 4; b++)
+            {
+                for(uint i = 0; i < 4; i++)
+                {
+                    result[b][i] = TO_ACCUMULATOR_TYPE(((float)result[b][i] / max(num_elementes, (uint)1)));
+                }
+            }
+        #else
+            for(uint b = 0; b < 4; b++)
             {
-                result[b][i] = convert_int(round((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X)));
+                for(uint i = 0; i < 4; i++)
+                {
+                    result[b][i] = TO_ACCUMULATOR_TYPE((float)result[b][i] / (int)(POOL_SIZE_Y * POOL_SIZE_X));
+                }
             }
-        }
-    #endif
-#endif
+        #endif
+    #endif  // ENABLE_ROUND
+#endif  // AVG_POOLING
 
-    int4 char_result;
-    for(uint b = 0; b < 4; b++)
+#if OUTPUT_TYPE_SIZE == 1
+    int4 final_result;
+
+    for(uint bi = 0; bi < 4; bi++)
     {
-        char4 char_res = as_char4(char_result[b]);
-        for(uint op = 0; op < 4; op++)
-        {
-            char_res[op] = ACTIVATION(convert_char(result[b][op]), ACTIVATION_PARAMS);
-        }
-        char_result[b] = as_int(char_res);
+        #if HAS_FUSED_OPS
+            ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(convert_char4(result[bi]));
+            FUSED_OPS;
+            final_result[bi] = as_int(FUSED_OPS_RESULT);
+        #else
+            char4 char_result = ACTIVATION(convert_char4(result[bi]), ACTIVATION_PARAMS);
+            final_result[bi] = as_int(char_result);
+        #endif
     }
     const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
-    intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(char_result));																						
+    intel_sub_group_block_write4((__global uint*)(output + output_pos), as_uint4(final_result));
+
+#elif OUTPUT_TYPE_SIZE == 2 || OUTPUT_TYPE_SIZE == 4
+    OUTPUT_VEC4 final_result;
+
+    for(uint bi = 0; bi < 4; bi++)
+    {
+    #if HAS_FUSED_OPS
+        ACTIVATION_VEC4 char_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result[bi]));
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+    #else
+        ACTIVATION_VEC4 char_result = ACTIVATION(TO_ACTIVATION_VEC4(result[bi]), ACTIVATION_PARAMS); // this branch handles 2-/4-byte outputs, so the intermediate must not be narrowed to char4
+        final_result = TO_OUTPUT_VEC4(char_result);
+    #endif
+        const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b + bi, f, y, x);
+        vstore4(final_result, 0, output + output_pos);
+    }
+#endif
 }
 
 #undef INIT_VAL
+#undef ACCUMULATOR_VEC4
+#undef ACCUMULATOR_VEC4
+
+#undef ACTIVATION_VEC4
+#undef TO_ACTIVATION_VEC4
+
+#undef OUTPUT_VEC4
+#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
index 19d7e50122d3f6..f439e9e6e300ea 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2018 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,28 +15,35 @@
 
 #include "include/include_all.cl"
 
+#define ACTIVATION_VEC4 MAKE_VECTOR_TYPE(ACTIVATION_TYPE, 4)
+#define TO_ACTIVATION_VEC4 CAT(convert_, ACTIVATION_VEC4)
+
+#define ACCUMULATOR_VEC4 MAKE_VECTOR_TYPE(ACCUMULATOR_TYPE, 4)
+
+#define OUTPUT_VEC4 MAKE_VECTOR_TYPE(OUTPUT_TYPE, 4)
+#define TO_OUTPUT_VEC4 CAT(convert_, OUTPUT_VEC4)
+
 #if MAX_POOLING
-    #define INIT_VAL CHAR_MIN
-#elif AVG_POOLING
-    #define INIT_VAL 0
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #else
-#error
+    #error
 #endif
 
-
-inline int FUNC(apply_pooling)(int tmp, int in)
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)  // merges one input value `in` into the running pooling value `tmp`
 {
 #if MAX_POOLING
-    return max(tmp, in);
-#elif AVG_POOLING
-    return tmp + in;
+    return ACCUMULATOR_MAX_FUNC(tmp, in);  // only MAX_POOLING is handled; the INIT_VAL guard in this file hits #error for any other mode
 #endif
 }
 
 __attribute__((intel_reqd_sub_group_size(32)))
 KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)(
-    const __global UNIT_TYPE* input,
-    __global UNIT_TYPE* output)
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
+)
 {
     const uint x    = (uint)get_group_id(0);
     const uint y    = (uint)get_group_id(1);
@@ -45,7 +52,6 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)(
     const uint aligned32_features = ((INPUT0_FEATURE_NUM + 31) / 32) * 32;
     const uint f = ((bf * 32) % aligned32_features) + (get_sub_group_local_id() % 8) * 4;
     const uint b = 4 * ((bf * 32) / aligned32_features) + (get_sub_group_local_id() / 8);
-    
     if (x >= OUTPUT_SIZE_X)
     {
         return;
@@ -53,8 +59,8 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)(
 
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
-    
-    int4 result = INIT_VAL;
+
+    ACCUMULATOR_VEC4 result = INIT_VAL;
 
     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
         offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y)
@@ -81,23 +87,38 @@ KERNEL(pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32)(
             int int_data = as_int(input_uint[0]);
 
             char4 input_data = zero ? (char4)(INIT_VAL,INIT_VAL,INIT_VAL,INIT_VAL) : as_char4(int_data);
-            result[0] = FUNC_CALL(apply_pooling)((int)result[0], (int)input_data[0]);
-            result[1] = FUNC_CALL(apply_pooling)((int)result[1], (int)input_data[1]);
-            result[2] = FUNC_CALL(apply_pooling)((int)result[2], (int)input_data[2]);
-            result[3] = FUNC_CALL(apply_pooling)((int)result[3], (int)input_data[3]);
+            result[0] = FUNC_CALL(apply_pooling)(result[0], TO_ACCUMULATOR_TYPE(input_data[0]));
+            result[1] = FUNC_CALL(apply_pooling)(result[1], TO_ACCUMULATOR_TYPE(input_data[1]));
+            result[2] = FUNC_CALL(apply_pooling)(result[2], TO_ACCUMULATOR_TYPE(input_data[2]));
+            result[3] = FUNC_CALL(apply_pooling)(result[3], TO_ACCUMULATOR_TYPE(input_data[3]));
         }
     }
 
-    char4 char_res;
-    for(uint op = 0; op < 4; op++)
-    {
-        char_res[op] = ACTIVATION(convert_char(result[op]), ACTIVATION_PARAMS);
-    }
+    OUTPUT_VEC4 final_result;
 
-    const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
+    #if HAS_FUSED_OPS
+        ACTIVATION_VEC4 pool_result;
+        pool_result = TO_ACTIVATION_VEC4(TO_OUTPUT_VEC4(result));
+        FUSED_OPS;
+        final_result = FUSED_OPS_RESULT;
+    #else
+        char4 pool_result;
+        for(uint op = 0; op < 4; op++)
+        {
+            pool_result[op] = ACTIVATION(TO_OUTPUT_TYPE(result[op]), ACTIVATION_PARAMS);
+        }
+        final_result = TO_OUTPUT_VEC4(pool_result);
+    #endif
 
-    __global uint* output_uint = (__global uint*)(output + output_pos);
-    output_uint[0] = as_uint(char_res);
+    const uint output_pos = GET_DATA_FS_BS_YX_BSV4_FSV32_INDEX(OUTPUT, b, f, y, x);
+    *((__global OUTPUT_VEC4*)(output + output_pos)) = final_result;
 }
 
 #undef INIT_VAL
+#undef ACCUMULATOR_VEC4
+
+#undef ACTIVATION_VEC4
+#undef TO_ACTIVATION_VEC4
+
+#undef OUTPUT_VEC4
+#undef TO_OUTPUT_VEC4
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
index cdb4cd127ba835..244d32fa19cee1 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_int8_ref.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2017 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,17 +16,17 @@
 #include "include/include_all.cl"
 
 #if MAX_POOLING
-    #define INIT_VAL CHAR_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif AVG_POOLING
-    #define INIT_VAL 0
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-inline int FUNC(apply_pooling)(int tmp, int in)
+inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
 {
 #if MAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);
 #elif AVG_POOLING
     return tmp + in;
 #endif
@@ -84,6 +84,7 @@ KERNEL(pooling_gpu_int8_ref)(
     const uint bf   = (uint)get_global_id(0);
     const uint f    = bf / INPUT0_BATCH_NUM;
     const uint b    = bf % INPUT0_BATCH_NUM;
+    const uint z    = 0;
 #elif OUTPUT_LAYOUT_B_FS_YX_FSV16
     const uint x = get_global_id(1);
     const uint y = get_global_id(2);
@@ -92,14 +93,14 @@ KERNEL(pooling_gpu_int8_ref)(
     const uint b = bf % INPUT0_BATCH_NUM;
     const uint z = 0;
 #else
-#error "pooling_int8_ref: unsupported layout"
+    #error "pooling_int8_ref: unsupported layout"
 #endif
 
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
     const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z;
 
-    int result = INIT_VAL;
+    ACCUMULATOR_TYPE result = INIT_VAL;
 
 #ifdef CHECK_BOUNDRY
     if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
@@ -138,8 +139,7 @@ KERNEL(pooling_gpu_int8_ref)(
 #else
                             const uint input_idx = INPUT0_GET_INDEX(b, f, input_offset_y, input_offset_x);
 #endif
-
-                            result = FUNC_CALL(apply_pooling)(result, (int)input[input_idx]);
+                            result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
 
 #ifdef DYNAMIC_KERNEL_DIVIDER
                             num_elementes++;
@@ -180,7 +180,7 @@ KERNEL(pooling_gpu_int8_ref)(
 #else
                 uint input_idx = INPUT0_GET_INDEX(b, f, offset_y + j, offset_x + i);
 #endif
-                result = FUNC_CALL(apply_pooling)(result, (int)input[input_idx]);
+                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
             }
         }
     }
@@ -194,26 +194,29 @@ KERNEL(pooling_gpu_int8_ref)(
 #if defined AVG_POOLING
 #if ENABLE_ROUND
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    int pool_res = convert_int(round((float)result / max(num_elementes, (uint)1)));
+    int not_fused_result = convert_int(round((float)result / max(num_elementes, (uint)1)));
     #else
-    int pool_res = convert_int(round((float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X)));
+    int not_fused_result = convert_int(round((float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X)));
     #endif
 #else  // ENABLE_ROUND
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
-    float pool_res = (float)result / max(num_elementes, (uint)1);
+    float not_fused_result = (float)result / max(num_elementes, (uint)1);
     #else
-    float pool_res = (float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
+    float not_fused_result = (float)result / (int)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
     #endif
 #endif  // ENABLE_ROUND
 #else  // defined AVG_POOLING
-    int pool_res = result;
+    int not_fused_result = result;
 #endif  // defined AVG_POOLING
 
+    OUTPUT_TYPE final_result;
+    ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(not_fused_result);
+
 #if HAS_FUSED_OPS
       FUSED_OPS;
-      OUTPUT_TYPE dst = FUSED_OPS_RESULT;
+      final_result = FUSED_OPS_RESULT;
 #else  // HAS_FUSED_OPS
-      OUTPUT_TYPE dst = TO_OUTPUT_TYPE(pool_res);
+      final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
 #endif  // HAS_FUSED_OPS
 
 #if OUTPUT_DIMS == 5
@@ -221,7 +224,7 @@ KERNEL(pooling_gpu_int8_ref)(
 #else
     const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
 #endif
-    output[output_pos] = ACTIVATION(dst, ACTIVATION_PARAMS);
+    output[output_pos] = final_result;
 }
 
 #undef INIT_VAL
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl
index 9d260c2eedb352..999ea6eb3220b9 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/cl_kernels/pooling_gpu_ref.cl
@@ -1,4 +1,4 @@
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -16,38 +16,42 @@
 #include "include/include_all.cl"
 
 #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_MIN
+    #define INIT_VAL ACCUMULATOR_VAL_MIN
 #elif AVG_POOLING
-    #define UNIT_INIT_VAL UNIT_VAL_ZERO
+    #define INIT_VAL ACCUMULATOR_VAL_ZERO
 #else
-#error
+    #error
 #endif
 
-
 inline ACCUMULATOR_TYPE FUNC(apply_pooling)(ACCUMULATOR_TYPE tmp, ACCUMULATOR_TYPE in)
 {
 #if MAX_POOLING || MAX_WITH_ARGMAX_POOLING
-    return max(tmp, in);
+    return ACCUMULATOR_MAX_FUNC(tmp, in);  // max modes; NOTE(review): presumably the max() flavour matching ACCUMULATOR_TYPE - confirm where the macro is defined
 #elif AVG_POOLING
     return tmp + in;
 #endif
 }
 
-KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
+KERNEL(pooling_gpu)(
+    const __global INPUT0_TYPE* input,
+    __global OUTPUT_TYPE* output
 #if MAX_WITH_ARGMAX_POOLING
 , __global float* arg_max
 #endif
+#if HAS_FUSED_OPS_DECLS
+    , FUSED_OPS_DECLS
+#endif
 )
 {
-#if OUTPUT_LAYOUT_BFYX  || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX || OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16 || \
-    OUTPUT_LAYOUT_B_FS_YX_FSV32 || OUTPUT_LAYOUT_B_FS_ZYX_FSV32
+#if OUTPUT_LAYOUT_BFYX  || OUTPUT_LAYOUT_BYXF || OUTPUT_LAYOUT_BFZYX ||\
+    OUTPUT_LAYOUT_B_FS_ZYX_FSV16 || OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16
     const uint x    = (uint)get_global_id(0);
-#if  OUTPUT_DIMS < 5
-    const uint y    = (uint)get_global_id(1);
-    const uint z = 0;
+#if OUTPUT_DIMS == 5
+    const uint y   = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
+    const uint z   = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
 #else
-    const uint y = (uint)get_global_id(1) % OUTPUT_SIZE_Y;
-    const uint z = (uint)get_global_id(1) / OUTPUT_SIZE_Y;
+    const uint y   = (uint)get_global_id(1);
+    const uint z   = 0;
 #endif
     const uint bf   = (uint)get_global_id(2);
     const uint f    = bf % INPUT0_FEATURE_NUM;
@@ -57,32 +61,51 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
     {
         return;
     }
+#elif OUTPUT_LAYOUT_B_FS_YX_FSV32 || OUTPUT_LAYOUT_B_FS_ZYX_FSV32
+    const uint fsv = get_global_id(0);
+    const uint zyx = get_global_id(1);
+    const uint fsb = get_global_id(2);
+
+    const uint x = zyx % OUTPUT_SIZE_X;
+#if OUTPUT_DIMS == 5
+    const uint y = zyx / OUTPUT_SIZE_X % OUTPUT_SIZE_Y;
+    const uint z = zyx / OUTPUT_SIZE_X / OUTPUT_SIZE_Y;
+#else
+    const uint y = zyx / OUTPUT_SIZE_X;
+    const uint z = 0;
+#endif
+    const uint fs = fsb % ((OUTPUT_FEATURE_NUM + 32 - 1) / 32);
+    const uint b = fsb / ((OUTPUT_FEATURE_NUM + 32 - 1) / 32);
+    const uint f = fs * 32 + fsv;
+
+    if (f >= OUTPUT_FEATURE_NUM) {
+        return;
+    }
 #elif OUTPUT_LAYOUT_YXFB
     const uint x    = (uint)get_global_id(1);
     const uint y    = (uint)get_global_id(2);
-    const uint z    = 0;
     const uint bf   = (uint)get_global_id(0);
     const uint f    = bf / INPUT0_BATCH_NUM;
     const uint b    = bf % INPUT0_BATCH_NUM;
+    const uint z    = 0;
+#else
+    #error "pooling_gpu_ref: unsupported layout"
 #endif
 
     const int offset_x = (int)x*STRIDE_SIZE_X - PADDING_SIZE_X;
     const int offset_y = (int)y*STRIDE_SIZE_Y - PADDING_SIZE_Y;
     const int offset_z = (int)z*STRIDE_SIZE_Z - PADDING_SIZE_Z;
 
-    ACCUMULATOR_TYPE result = UNIT_INIT_VAL;
+    ACCUMULATOR_TYPE result = INIT_VAL;
 
 #if MAX_WITH_ARGMAX_POOLING
     uint arg_max_idx = 0;
 #endif
 
 #ifdef CHECK_BOUNDRY
-    bool out_of_boundry = offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
-        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y;
-    #if  INPUT0_SIZE_Z != 1
-        out_of_boundry = out_of_boundry || offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z;
-    #endif
-    if (out_of_boundry)
+    if (offset_x + POOL_SIZE_X < 0 || offset_x >= INPUT0_SIZE_X ||
+        offset_y + POOL_SIZE_Y < 0 || offset_y >= INPUT0_SIZE_Y ||
+        offset_z + POOL_SIZE_Z < 0 || offset_z >= INPUT0_SIZE_Z)
     {
         return;
     }
@@ -91,122 +114,140 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
     uint num_elementes = 0;
 #endif
 
-    const uint batch_and_feature_offset = GET_DATA_INDEX(INPUT0, b, f, 0, 0);
-#if  OUTPUT_DIMS == 5  // 3D
-    for(uint k = 0; k < POOL_SIZE_Z; k++)
+#if OUTPUT_DIMS == 5
+    const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0, 0);
+#else
+    const uint batch_and_feature_offset = INPUT0_GET_INDEX(b, f, 0, 0);
+#endif
+
+#if OUTPUT_DIMS == 5
+    for(uint l = 0; l < POOL_SIZE_Z; l++)
     {
-        int input_offset_z = offset_z + k;
+        int input_offset_z = offset_z + l;
         bool zero_z = input_offset_z >= INPUT0_SIZE_Z || input_offset_z < 0;
-        if(!zero_z)
+        if (!zero_z)
         {
 #endif
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        int input_offset_y = offset_y + j;
-        bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
-        if(!zero_y)
-        {
-            for(uint i = 0; i < POOL_SIZE_X; i++)
+            for(uint j = 0; j < POOL_SIZE_Y; j++)
             {
-                int input_offset_x = offset_x + i;
-                bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
-                if(!zero)
+                int input_offset_y = offset_y + j;
+                bool zero_y = input_offset_y >= INPUT0_SIZE_Y || input_offset_y < 0;
+                if(!zero_y)
                 {
-#if  OUTPUT_DIMS < 5
-                    const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
+                    for(uint i = 0; i < POOL_SIZE_X; i++)
+                    {
+                        int input_offset_x = offset_x + i;
+                        bool zero = input_offset_x >= INPUT0_SIZE_X || input_offset_x < 0;
+                        if(!zero)
+                        {
+#if OUTPUT_DIMS == 5
+    #if !INPUT0_SIMPLE
+                            const uint input_idx = INPUT0_GET_INDEX(b, f, input_offset_z, input_offset_y, input_offset_x);
+    #else
+                            const uint input_idx = batch_and_feature_offset + input_offset_z*INPUT0_Z_PITCH + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
+    #endif
 #else
-  #if OUTPUT_LAYOUT_B_FS_ZYX_FSV16
-                    const uint input_idx = GET_DATA_B_FS_ZYX_FSV16_INDEX(INPUT0, b, f, input_offset_z, input_offset_y, input_offset_x);
-  #elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16
-                    const uint input_idx = GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(INPUT0, b, f, input_offset_z, input_offset_y, input_offset_x);
-  #else
-                    const uint input_idx = batch_and_feature_offset + input_offset_z*INPUT0_Z_PITCH + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
-  #endif
+    #if !INPUT0_SIMPLE
+                            const uint input_idx = INPUT0_GET_INDEX(b, f, input_offset_y, input_offset_x);
+    #else
+                            const uint input_idx = batch_and_feature_offset + input_offset_y*INPUT0_Y_PITCH + input_offset_x*INPUT0_X_PITCH;
+    #endif
 #endif
 
 #if MAX_WITH_ARGMAX_POOLING
-                    if(input[input_idx] > result)
-                    {
+                            if(input[input_idx] > result)
+                            {
 #if  OUTPUT_DIMS < 5
-                        const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b));
+                                const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b));
 #else
-                        const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y *
+                                const uint input_idx_bfyx_no_padding = input_offset_x + INPUT0_SIZE_X * (input_offset_y + INPUT0_SIZE_Y *
                                                                (input_offset_z + INPUT0_SIZE_Z * (f + INPUT0_FEATURE_NUM * b)));
 #endif
-                        arg_max_idx = input_idx_bfyx_no_padding;
-                    }
+                                arg_max_idx = input_idx_bfyx_no_padding;
+                            }
 #endif
-                    result = FUNC_CALL(apply_pooling)(result, input[input_idx]);
+                            result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
 
 #ifdef DYNAMIC_KERNEL_DIVIDER
-                    num_elementes++;
+                            num_elementes++;
 #endif
+                        }
+                    }
                 }
             }
-        }
-    }
-#if  OUTPUT_DIMS == 5 // 3D
+#if OUTPUT_DIMS == 5
         }
     }
 #endif
+
 #ifdef DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
-#if  INPUT0_SIZE_Z != 1
-    const int dend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z);
-#endif
     const int hend = min(offset_y + POOL_SIZE_Y, INPUT0_SIZE_Y + PADDING_SIZE_Y);
     const int wend = min(offset_x + POOL_SIZE_X, INPUT0_SIZE_X + PADDING_SIZE_X);
-#if  INPUT0_SIZE_Z == 1
-    const uint num_elementes = (hend - offset_y) * (wend - offset_x);
+#if OUTPUT_DIMS == 5
+    const int zend = min(offset_z + POOL_SIZE_Z, INPUT0_SIZE_Z + PADDING_SIZE_Z);
+    const uint num_elementes = (hend - offset_y) * (wend - offset_x) * (zend - offset_z);
 #else
-    const uint num_elementes = (dend - offset_z) * (hend - offset_y) * (wend - offset_x);
-#endif
+    const uint num_elementes = (hend - offset_y) * (wend - offset_x);
 #endif
-#else
+
+#endif  // DYNAMIC_WITH_PADDING_KERNEL_DIVIDER
+
+#else  // CHECK_BOUNDRY
+
 #if  OUTPUT_DIMS == 5  // 3D
-    uint input_idx = GET_DATA_INDEX_5D(INPUT0, b, f, offset_z, offset_y, offset_x);
+    uint input_idx = INPUT0_GET_INDEX(b, f, offset_z, offset_y, offset_x);
 #else
-    uint input_idx = GET_DATA_INDEX(INPUT0, b, f, offset_y, offset_x);
+    uint input_idx = INPUT0_GET_INDEX(b, f, offset_y, offset_x);
 #endif
 
 #if MAX_WITH_ARGMAX_POOLING
-#if  OUTPUT_DIMS < 5
-    uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b));
-#else
-    uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (offset_z + INPUT0_SIZE_Z *(f + INPUT0_FEATURE_NUM * b)));
-#endif
+    #if  OUTPUT_DIMS < 5
+        uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (f + INPUT0_FEATURE_NUM * b));
+    #else
+        uint input_idx_bfyx_no_padding = offset_x + INPUT0_SIZE_X * (offset_y + INPUT0_SIZE_Y * (offset_z + INPUT0_SIZE_Z *(f + INPUT0_FEATURE_NUM * b)));
+    #endif
 #endif
 
-#if  OUTPUT_DIMS == 5  // 3D
-    for(uint k = 0; k < POOL_SIZE_Z; k++)
+#if OUTPUT_DIMS == 5
+    for(uint l = 0; l < POOL_SIZE_Z; l++)
     {
 #endif
-    for(uint j = 0; j < POOL_SIZE_Y; j++)
-    {
-        for(uint i = 0; i < POOL_SIZE_X; i++)
+        for(uint j = 0; j < POOL_SIZE_Y; j++)
         {
-
+            for(uint i = 0; i < POOL_SIZE_X; i++)
+            {
 #if MAX_WITH_ARGMAX_POOLING
             if(input[input_idx] > result)
                 arg_max_idx = input_idx_bfyx_no_padding;
 #endif
 
-#if INPUT0_LAYOUT_B_FS_ZYX_FSV16
-            uint input1_idx = INPUT0_GET_INDEX(b, f, offset_z+k, offset_y+j, offset_x+i);
-            result = FUNC_CALL(apply_pooling)(result, input[input1_idx]);
+#if OUTPUT_DIMS == 5
+    #if !INPUT0_SIMPLE
+                uint input_idx = INPUT0_GET_INDEX(b, f, offset_z + l, offset_y + j, offset_x + i);
+                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+    #else
+                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+                input_idx += INPUT0_X_PITCH;
+    #endif
 #else
-            result = FUNC_CALL(apply_pooling)(result, input[input_idx]);
+    #if !INPUT0_SIMPLE
+                uint input_idx = INPUT0_GET_INDEX(b, f, offset_y + j, offset_x + i);
+                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+    #else
+                result = FUNC_CALL(apply_pooling)(result, TO_ACCUMULATOR_TYPE(input[input_idx]));
+                input_idx += INPUT0_X_PITCH;
+    #endif
 #endif
 
-            input_idx += INPUT0_X_PITCH;
 #if MAX_WITH_ARGMAX_POOLING
-            input_idx_bfyx_no_padding++;
+                input_idx_bfyx_no_padding++;
 #endif
-        }
-        input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH);
+            }
+            input_idx += (INPUT0_Y_PITCH - POOL_SIZE_X*INPUT0_X_PITCH);
 #if MAX_WITH_ARGMAX_POOLING
-        input_idx_bfyx_no_padding += (INPUT0_SIZE_X - POOL_SIZE_X);
+            input_idx_bfyx_no_padding += (INPUT0_SIZE_X - POOL_SIZE_X);
 #endif
-    }
+        }
 #if  OUTPUT_DIMS == 5  // 3D
         input_idx += (INPUT0_Z_PITCH - POOL_SIZE_Y*INPUT0_Y_PITCH);
 #if MAX_WITH_ARGMAX_POOLING
@@ -218,7 +259,8 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
 #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
     const uint num_elementes = POOL_SIZE_X*POOL_SIZE_Y*POOL_SIZE_Z;
 #endif
-#endif
+
+#endif // CHECK_BOUNDRY
 
 #if defined AVG_POOLING
     #if defined(DYNAMIC_KERNEL_DIVIDER) || defined(DYNAMIC_WITH_PADDING_KERNEL_DIVIDER)
@@ -226,23 +268,30 @@ KERNEL(pooling_gpu)(const __global UNIT_TYPE* input, __global UNIT_TYPE* output
     #else
         result /= (ACCUMULATOR_TYPE)(POOL_SIZE_Z * POOL_SIZE_Y * POOL_SIZE_X);
     #endif
-#endif
+#endif  // defined AVG_POOLING
 
-#if OUTPUT_LAYOUT_B_FS_ZYX_FSV16
-    const uint output_pos = GET_DATA_B_FS_ZYX_FSV16_INDEX(OUTPUT, b, f, z, y, x);
-#elif OUTPUT_LAYOUT_BS_FS_ZYX_BSV16_FSV16
-    const uint output_pos = GET_DATA_BS_FS_ZYX_BSV16_FSV16_INDEX(OUTPUT, b, f, z, y, x);
+    OUTPUT_TYPE final_result;
+    ACTIVATION_TYPE pool_result = TO_ACTIVATION_TYPE(result);
+    
+#if HAS_FUSED_OPS
+      FUSED_OPS;
+      final_result = FUSED_OPS_RESULT;
+#else  // HAS_FUSED_OPS
+      final_result = TO_OUTPUT_TYPE(ACTIVATION(pool_result, ACTIVATION_PARAMS));
+#endif  // HAS_FUSED_OPS
+
+#if OUTPUT_DIMS == 5
+    const uint output_pos = OUTPUT_GET_INDEX(b, f, z, y, x);
 #else
-    const uint output_pos = GET_DATA_INDEX_5D(OUTPUT, b, f, z, y, x);
+    const uint output_pos = OUTPUT_GET_INDEX(b, f, y, x);
 #endif
-    output[output_pos] = ACTIVATION(TO_UNIT_TYPE(result), ACTIVATION_PARAMS);
+    output[output_pos] = final_result;
 
 #if MAX_WITH_ARGMAX_POOLING
     //INPUT1 macro stands for Argmax
     const uint arg_max_pos = GET_DATA_INDEX_5D(INPUT1, b, f, z, y, x);
     arg_max[arg_max_pos] = convert_float(arg_max_idx);
 #endif
-
 }
 
-#undef UNIT_INIT_VAL
+#undef INIT_VAL
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp
index d2e08023ac2135..a0e6533b2dcab2 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/pooling_gpu.cpp
@@ -115,13 +115,13 @@ struct pooling_gpu : typed_primitive_gpu_impl<pooling> {
         }
 
         // check if last pooling window goes outside of input size + padding. If so the avg pooling size will be
-        // adjusted to that.
+        // adjusted to that. To work properly, this calculation must take pad_end into account.
         auto dynamic_mode = (((output_sizes.spatial[0] - 1) * stride.spatial[0]) + primitive->size.spatial[0]) >
-                                -2 * input_offset.spatial[0] + input_sizes.spatial[0] ||
+                                 (-input_offset.spatial[0] - primitive->pad_end.spatial[0]) + input_sizes.spatial[0] ||
                             (((output_sizes.spatial[1] - 1) * stride.spatial[1]) + primitive->size.spatial[1]) >
-                                -2 * input_offset.spatial[1] + input_sizes.spatial[1] ||
+                                 (-input_offset.spatial[1] - primitive->pad_end.spatial[1]) + input_sizes.spatial[1] ||
                             (((output_sizes.spatial[2] - 1) * stride.spatial[2]) + primitive->size.spatial[2]) >
-                                -2 * input_offset.spatial[2] + input_sizes.spatial[2];
+                                 (-input_offset.spatial[2] - primitive->pad_end.spatial[2]) + input_sizes.spatial[2];
 
         if (primitive->mode == pooling_mode::average && dynamic_mode)
             pp.divMode = kernel_selector::kernel_divider_mode::DYNAMIC_WITH_PADDING;
@@ -196,6 +196,7 @@ attach_pooling_gpu::attach_pooling_gpu() {
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
+    implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), pooling_gpu::create);
     // MMAD
@@ -214,6 +215,9 @@ attach_pooling_gpu::attach_pooling_gpu() {
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_zyx_fsv32), pooling_gpu::create);
     //
     implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), pooling_gpu::create);
+    implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::fs_b_yx_fsv32), pooling_gpu::create);
+    implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::fs_b_yx_fsv32), pooling_gpu::create);
+    implementation_map<pooling>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_b_yx_fsv32), pooling_gpu::create);
 }
 
 }  // namespace detail
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp b/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp
index 34db0348382a31..2823f73fcd5fda 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/quantize_gpu.cpp
@@ -104,6 +104,10 @@ attach_quantize_gpu::attach_quantize_gpu() {
     auto val_fw = quantize_gpu::create;
 
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::fs_b_yx_fsv32), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::fs_b_yx_fsv32), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_b_yx_fsv32), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::fs_b_yx_fsv32), val_fw);
+
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::b_fs_yx_fsv16), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::b_fs_yx_fsv16), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv16), val_fw);
@@ -134,12 +138,28 @@ attach_quantize_gpu::attach_quantize_gpu() {
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_zyx_fsv32), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_zyx_fsv32), val_fw);
 
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_yx_bsv16_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_yx_bsv16_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_yx_bsv16_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_yx_bsv16_fsv16), val_fw);
+
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bs_fs_zyx_bsv16_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bs_fs_zyx_bsv16_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bs_fs_zyx_bsv16_fsv16), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bs_fs_zyx_bsv16_fsv16), val_fw);
+
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::bfyx), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::bfyx), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), val_fw);
 
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i32, format::byxf), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::byxf), val_fw);
+    implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), val_fw);
+
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), val_fw);
     implementation_map<quantize>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), val_fw);
 
diff --git a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
index 66cba243eff45d..591efacc8a737e 100644
--- a/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
+++ b/inference-engine/thirdparty/clDNN/src/graph_optimizer/prepare_primitive_fusing.cpp
@@ -36,6 +36,7 @@
 #include "lrn_inst.h"
 #include "mutable_data_inst.h"
 #include "mvn_inst.h"
+#include "pooling_inst.h"
 #include "normalize_inst.h"
 #include "permute_inst.h"
 #include "reshape_inst.h"
@@ -328,6 +329,15 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
             return false;
         };
 
+        // Decides whether a pooling node may take fused post-ops: every pooling mode is
+        // accepted except max_with_argmax.
+        auto pooling_supports_fusings = [](pooling_node& node) -> bool {
+            const auto primitive = node.as<pooling>().get_primitive();
+            const auto mode = primitive->mode;
+
+            return mode != cldnn::pooling_mode::max_with_argmax;
+        };
+
         auto fuse_activation_f = [&](activation_node& activation_node) {
             auto& input_data = activation_node.get_dependency(0);
             if (input_data.get_users().size() != 1 || activation_node.get_dependencies().size() >= 3)
@@ -341,13 +351,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<gemm>() && gemm_supports_fusings(input_data.as<gemm>());
 
-            should_fuse |= input_data.is_type<lrn>();
-
-            should_fuse |= input_data.is_type<pooling>() &&
-                (input_data.get_dependency(0).get_output_layout().data_type == data_types::i8 ||
-                 input_data.get_dependency(0).get_output_layout().data_type == data_types::u8) &&
-                (input_data.as<pooling>().get_primitive()->mode == pooling_mode::average ||
-                 input_data.as<pooling>().get_primitive()->mode == pooling_mode::average_no_padding);
+            should_fuse |= input_data.is_type<pooling>() && pooling_supports_fusings(input_data.as<pooling>());
 
             should_fuse |= input_data.is_type<resample>();
 
@@ -357,6 +361,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<activation>();
 
+            should_fuse |= input_data.is_type<lrn>();
+
             if (!should_fuse)
                 return;
 
@@ -380,13 +386,7 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<gemm>() && gemm_supports_fusings(input_data.as<gemm>());
 
-            should_fuse |= input_data.is_type<lrn>();
-
-            should_fuse |= input_data.is_type<pooling>() &&
-                (input_data.get_dependency(0).get_output_layout().data_type == data_types::i8 ||
-                 input_data.get_dependency(0).get_output_layout().data_type == data_types::u8) &&
-                (input_data.as<pooling>().get_primitive()->mode == pooling_mode::average ||
-                 input_data.as<pooling>().get_primitive()->mode == pooling_mode::average_no_padding);
+            should_fuse |= input_data.is_type<pooling>() && pooling_supports_fusings(input_data.as<pooling>());
 
             should_fuse |= input_data.is_type<resample>();
 
@@ -396,6 +396,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
 
             should_fuse |= input_data.is_type<activation>();
 
+            should_fuse |= input_data.is_type<lrn>();
+
             if (!should_fuse)
                 return;
 
@@ -434,13 +436,8 @@ void prepare_primitive_fusing::fuse_simple_primitives(program_impl &p) {
                            input_data.get_dependency(0).get_output_layout().data_type == data_types::i8) &&
                            (out_layout.data_type == data_types::u8 || out_layout.data_type == data_types::i8)));
 
-            should_fuse |= input_data.is_type<pooling>() &&
-                           quantize_node.get_scale_shift_opt() &&
-                          // TODO: unify pooling ref and ref_int8 kernels and remove this restriction on precision
-                          (input_data.get_dependency(0).get_output_layout().data_type == data_types::u8 ||
-                           input_data.get_dependency(0).get_output_layout().data_type == data_types::i8) &&
-                          (input_data.as<pooling>().get_primitive()->mode == pooling_mode::average ||
-                           input_data.as<pooling>().get_primitive()->mode == pooling_mode::average_no_padding);
+            should_fuse |= input_data.is_type<pooling>() && quantize_node.get_scale_shift_opt() &&
+                           pooling_supports_fusings(input_data.as<pooling>());
 
             should_fuse |= input_data.is_type<fully_connected>() && fc_supports_fusings(input_data.as<fully_connected>()) &&
                            quantize_node.get_scale_shift_opt() &&
diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp
index 14dedbe7c06c16..d5a2f3b531ce06 100644
--- a/inference-engine/thirdparty/clDNN/src/program.cpp
+++ b/inference-engine/thirdparty/clDNN/src/program.cpp
@@ -375,6 +375,11 @@ void program_impl::build_program(bool is_internal) {
 void program_impl::init_graph() {
     apply_opt_pass<graph_initializations>();
 
+    for (auto& node : processing_order) {
+        if (!node->is_type<internal_primitive>() && !node->is_type<data>())
+            node->get_output_layout();
+    }
+
     apply_opt_pass<calculate_prior_boxes>();
 
     apply_opt_pass<mark_nodes>();
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
index a3bbae5806b8c3..1d81204c3f7df6 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/fusings_gpu_test.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2019 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -2554,117 +2554,6 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, mvn_scale_activation_quantize_u8_eltwise_fp
         mvn_test_params{ CASE_MVN_3D_U8_2, 2, 7 },
 }), );
 
-/* ----------------------------------------------------------------------------------------------------- */
-/* --------------------------------------- Pooling cases ----------------------------------------------- */
-/* ----------------------------------------------------------------------------------------------------- */
-struct pooling_test_params {
-    tensor input_size;
-    data_types input_type;
-    format input_format;
-    pooling_mode mode;
-    tensor kernel_size;
-    tensor stride;
-    tensor offset;
-    data_types default_type;
-    format default_format;
-    size_t expected_fused_primitives;
-    size_t expected_not_fused_primitives;
-};
-
-#define CASE_POOLING_F32_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-#define CASE_POOLING_F32_2 {2, 16, 8, 8}, data_types::f32, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-
-#define CASE_POOLING_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-#define CASE_POOLING_F16_2 {2, 16, 8, 8}, data_types::f16, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-
-#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-#define CASE_POOLING_U8_3 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-
-#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, pooling_mode::max, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::bfyx, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-#define CASE_POOLING_I8_3 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, pooling_mode::average, tensor{1,1,3,3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}, data_types::f32, format::bfyx
-
-class PoolingFusingTest : public ::BaseFusingTest<pooling_test_params> {
-public:
-    void execute(pooling_test_params& p) {
-        auto input_prim = get_mem(get_input_layout(p));
-
-        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
-        network network_fused(this->engine, this->topology_fused, bo_fused);
-
-        network_fused.set_input_data("input", input_prim);
-        network_not_fused.set_input_data("input", input_prim);
-
-        compare(network_not_fused, network_fused, p);
-    }
-
-    layout get_input_layout(pooling_test_params& p) {
-        return layout{ p.input_type, p.input_format, p.input_size };
-    }
-
-    layout get_per_channel_layout(pooling_test_params& p) {
-        return layout{ p.default_type, p.default_format, tensor{1, p.input_size.feature[0], 1, 1} };
-    }
-};
-
-class pooling_activation : public PoolingFusingTest {};
-TEST_P(pooling_activation, basic) {
-    auto p = GetParam();
-    create_topologies(
-        input_layout("input", get_input_layout(p)),
-        pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset),
-        activation("act", "pooling", activation_func::relu),
-        reorder("reorder_bfyx", "act", format::bfyx, data_types::f32)
-    );
-
-    tolerance = 1e-5f;
-    execute(p);
-}
-
-INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_activation,
-    ::testing::ValuesIn(std::vector<pooling_test_params>{
-                        pooling_test_params{ CASE_POOLING_F32_1, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_F32_2, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_F16_1, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_F16_2, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_1, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_U8_2, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_U8_3, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_1, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_2, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_3, 2, 3 },
-}), );
-
-class pooling_scale : public PoolingFusingTest {};
-TEST_P(pooling_scale, basic) {
-    auto p = GetParam();
-    create_topologies(
-        input_layout("input", get_input_layout(p)),
-        data("scale_data", get_mem(get_per_channel_layout(p), 1.0f/p.kernel_size.count())),
-        pooling("pooling", "input", p.mode, p.kernel_size, p.stride, p.offset),
-        scale("scale", "pooling", "scale_data"),
-        reorder("reorder_bfyx", "scale", format::bfyx, data_types::f32)
-    );
-
-    tolerance = 1e-5f;
-    execute(p);
-}
-
-INSTANTIATE_TEST_CASE_P(fusings_gpu, pooling_scale,
-    ::testing::ValuesIn(std::vector<pooling_test_params>{
-                        pooling_test_params{ CASE_POOLING_F32_1, 3, 3 },
-                        pooling_test_params{ CASE_POOLING_F32_2, 3, 3 },
-                        pooling_test_params{ CASE_POOLING_F16_1, 3, 3 },
-                        pooling_test_params{ CASE_POOLING_F16_2, 3, 3 },
-                        pooling_test_params{ CASE_POOLING_U8_1, 3, 3 },
-                        pooling_test_params{ CASE_POOLING_U8_2, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_U8_3, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_1, 3, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_2, 2, 3 },
-                        pooling_test_params{ CASE_POOLING_I8_3, 2, 3 },
-}), );
-
 /* ----------------------------------------------------------------------------------------------------- */
 /* ---------------------------------------- LRN cases -------------------------------------------------- */
 /* ----------------------------------------------------------------------------------------------------- */
@@ -3707,3 +3596,451 @@ INSTANTIATE_TEST_CASE_P(fusings_gpu, deconv_scale_actv_quant_u8_eltw_scale_actv_
         deconv_test_params{ CASE_DECONV_S8S8_3D_7, 2, 9 },
         deconv_test_params{ CASE_DECONV_S8S8_3D_8, 2, 9 },
 }), );
+
+/* ----------------------------------------------------------------------------------------------------- */
+/* --------------------------------------- Pooling cases ----------------------------------------------- */
+/* ----------------------------------------------------------------------------------------------------- */
+struct pooling_test_params {  // one parameter set for the pooling fusing tests
+    tensor in_shape;  // shape of the "input" layout; feature[0] also sizes the per-channel layout
+    data_types data_type;  // element type of the input layout
+    format input_format;  // format of the input layout; also the format given to a forced implementation
+    data_types default_type;  // element type of the per-channel layout
+    format default_format;  // format of the per-channel layout
+    size_t expected_fused_primitives;  // presumably checked inside compare() — TODO confirm against BaseFusingTest
+    size_t expected_not_fused_primitives;  // presumably checked inside compare() — TODO confirm against BaseFusingTest
+    pooling_mode pool_mode;  // pooling mode the tests pass to the pooling primitive
+    std::string kernel_name;  // when non-empty, execute() forces this implementation for "pooling"
+};
+
+#define CASE_POOLING_F32_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f32, format::bfyx  // fields in order: in_shape, data_type, input_format, default_type, default_format
+#define CASE_POOLING_F32_2 {2, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_3 {1, 32, 10, 10}, data_types::f32, format::bfyx, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_4 {1, 32, 10, 10}, data_types::f32, format::fs_b_yx_fsv32, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_5 {1, 32, 10, 10}, data_types::f32, format::byxf, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_6 {1, 32, 40, 40}, data_types::f32, format::byxf, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_7 {16, 32, 10, 10}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_8 {16, 32, 10, 10}, data_types::f32, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_POOLING_F32_9 {16, 32, 10, 10}, data_types::f32, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx  // NOTE(review): 4-value shape with a zyx format, while CASE_POOLING_F16_9 lists 5 values — confirm intended
+#define CASE_POOLING_F32_10 {16, 32, 10, 10, 10}, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx
+
+#define CASE_POOLING_F32_F16_1 {1, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_2 {2, 16, 8, 8}, data_types::f32, format::bfyx, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_3 {1, 32, 10, 10}, data_types::f32, format::bfyx, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_4 {1, 32, 10, 10}, data_types::f32, format::fs_b_yx_fsv32, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_5 {1, 32, 10, 10}, data_types::f32, format::byxf, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_6 {1, 32, 40, 40}, data_types::f32, format::byxf, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_7 {16, 32, 10, 10}, data_types::f32, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_8 {16, 32, 10, 10}, data_types::f32, format::b_fs_yx_fsv16, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_9 {16, 32, 10, 10}, data_types::f32, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx
+#define CASE_POOLING_F32_F16_10 {16, 32, 10, 10, 10}, data_types::f32, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx
+
+#define CASE_POOLING_F16_1 {1, 16, 8, 8}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_3 {1, 32, 10, 10}, data_types::f16, format::bfyx, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_4 {1, 32, 10, 10}, data_types::f16, format::fs_b_yx_fsv32, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_5 {1, 32, 10, 10}, data_types::f16, format::byxf, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_6 {1, 32, 40, 40}, data_types::f16, format::byxf, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_7 {16, 32, 10, 10}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_8 {16, 32, 10, 10}, data_types::f16, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_POOLING_F16_9 {16, 32, 10, 10, 10}, data_types::f16, format::b_fs_zyx_fsv16, data_types::f32, format::bfyx  // f16 input, like the other CASE_POOLING_F16_* cases
+#define CASE_POOLING_F16_10 {16, 32, 10, 10, 10}, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f32, format::bfyx
+
+#define CASE_POOLING_F16_FP16_1 {1, 32, 10, 10}, data_types::f16, format::bfyx, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_2 {1, 32, 10, 10}, data_types::f16, format::fs_b_yx_fsv32, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_3 {1, 32, 10, 10}, data_types::f16, format::byxf, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_4 {1, 32, 40, 40}, data_types::f16, format::byxf, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_5 {16, 32, 10, 10}, data_types::f16, format::bs_fs_yx_bsv16_fsv16, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_6 {16, 32, 10, 10}, data_types::f16, format::b_fs_yx_fsv16, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_7 {16, 32, 10, 10, 10}, data_types::f16, format::b_fs_zyx_fsv16, data_types::f16, format::bfyx
+#define CASE_POOLING_F16_FP16_8 {16, 32, 10, 10, 10}, data_types::f16, format::bs_fs_zyx_bsv16_fsv16, data_types::f16, format::bfyx
+
+#define CASE_POOLING_U8_1 {1, 16, 8, 8}, data_types::u8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_POOLING_U8_2 {2, 16, 8, 8}, data_types::u8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_POOLING_U8_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
+#define CASE_POOLING_U8_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_POOLING_U8_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
+#define CASE_POOLING_U8_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx  // NOTE(review): identical to CASE_POOLING_U8_5 — confirm whether a different case was intended
+
+#define CASE_POOLING_U8_FP16_3 {1, 32, 10, 10}, data_types::u8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
+#define CASE_POOLING_U8_FP16_4 {1, 32, 10, 10}, data_types::u8, format::byxf_af32, data_types::f16, format::bfyx
+#define CASE_POOLING_U8_FP16_5 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
+#define CASE_POOLING_U8_FP16_6 {16, 32, 10, 10, 10}, data_types::u8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
+
+#define CASE_POOLING_I8_1 {1, 16, 8, 8}, data_types::i8, format::bfyx, data_types::f32, format::bfyx
+#define CASE_POOLING_I8_2 {2, 16, 8, 8}, data_types::i8, format::b_fs_yx_fsv16, data_types::f32, format::bfyx
+#define CASE_POOLING_I8_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f32, format::bfyx
+#define CASE_POOLING_I8_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f32, format::b_fs_yx_fsv4
+#define CASE_POOLING_I8_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f32, format::bfyx
+
+#define CASE_POOLING_I8_FP16_4 {1, 32, 10, 10}, data_types::i8, format::byxf_af32, data_types::f16, format::bfyx
+#define CASE_POOLING_I8_FP16_5 {1, 32, 10, 10}, data_types::i8, format::b_fs_yx_fsv4, data_types::f16, format::b_fs_yx_fsv4
+#define CASE_POOLING_I8_FP16_6 {16, 32, 10, 10, 10}, data_types::i8, format::b_fs_zyx_fsv32, data_types::f16, format::bfyx
+
+// Disabled
+#define CASE_POOLING_I8_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f32, format::bfyx
+#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
+#define CASE_POOLING_I8_FP16_3 {4, 32, 10, 10}, data_types::i8, format::fs_bs_yx_bsv4_fsv32, data_types::f16, format::bfyx
+
+class PoolingFusingTest : public ::BaseFusingTest<pooling_test_params> {
+public:
+    // Builds both networks (forcing p.kernel_name for "pooling" in the fused one when it is non-empty),
+    // feeds them the same input, requires a "pooling" or "output_reorder" primitive in each, then compares.
+    void execute(pooling_test_params& p) {
+        auto input_mem = get_mem(get_input_layout(p));
+        build_options fused_options;
+        fused_options.set_option(build_option::optimize_data(true));
+        if (!p.kernel_name.empty()) {
+            implementation_desc forced_impl = {p.input_format, p.kernel_name};
+            fused_options.set_option(build_option::force_implementations({{"pooling", forced_impl}}));
+        }
+        network network_not_fused(this->engine, this->topology_non_fused, bo_not_fused);
+        network network_fused(this->engine, this->topology_fused, fused_options);
+
+        network_fused.set_input_data("input", input_mem);
+        network_not_fused.set_input_data("input", input_mem);
+
+        ASSERT_FALSE(network_fused.get_primitives_info().empty());
+        ASSERT_FALSE(network_not_fused.get_primitives_info().empty());
+
+        auto is_expected_primitive = [](primitive_info& info) -> bool {
+            return info.original_id == "pooling" || info.original_id == "output_reorder";
+        };
+
+        auto fused_infos = network_fused.get_primitives_info();
+        auto not_fused_infos = network_not_fused.get_primitives_info();
+        auto fused_match = std::find_if(fused_infos.begin(), fused_infos.end(), is_expected_primitive);
+        auto not_fused_match = std::find_if(not_fused_infos.begin(), not_fused_infos.end(), is_expected_primitive);
+
+        ASSERT_TRUE(fused_match != fused_infos.end());
+        ASSERT_TRUE(not_fused_match != not_fused_infos.end());
+
+        compare(network_not_fused, network_fused, p);
+    }
+
+    layout get_input_layout(pooling_test_params& p) { return layout{p.data_type, p.input_format, p.in_shape}; }
+    layout get_per_channel_layout(pooling_test_params& p) {
+        return layout{p.default_type, p.default_format, tensor{1, p.in_shape.feature[0], 1, 1}};
+    }
+};
+
+// Topology under test: input -> pooling -> relu activation -> reorder to bfyx / f32.
+class pooling_f32_activation : public PoolingFusingTest {};
+TEST_P(pooling_f32_activation, basic) {
+    auto params = GetParam();
+    create_topologies(input_layout("input", get_input_layout(params)),
+                      pooling("pooling", "input", params.pool_mode, tensor{1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}),
+                      activation("act", "pooling", activation_func::relu),
+                      reorder("output_reorder", "act", format::bfyx, data_types::f32));
+
+    tolerance = 1e-05f;
+    execute(params);
+}
+
+// Cases for pooling_f32_activation. After the CASE_* macro each entry lists: expected fused primitive
+// count, expected non-fused primitive count, pooling mode, forced kernel name ("" forces none).
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+                        pooling_f32_activation,
+                        ::testing::ValuesIn(std::vector<pooling_test_params>{
+                            pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::average, ""},
+                        }), );
+
+// Topology under test: input -> pooling -> scale (per-channel scale_data) -> reorder to bfyx / f32.
+class pooling_f32_scale : public PoolingFusingTest {};
+TEST_P(pooling_f32_scale, basic) {
+    auto params = GetParam();
+    create_topologies(input_layout("input", get_input_layout(params)),
+                      data("scale_data", get_mem(get_per_channel_layout(params), 1.0f / tensor{1, 1, 3, 3}.count())),
+                      pooling("pooling", "input", params.pool_mode, tensor{1, 1, 3, 3}, tensor{1}, tensor{0, 0, -1, -1, 0, 0}),
+                      scale("scale", "pooling", "scale_data"),
+                      reorder("output_reorder", "scale", format::bfyx, data_types::f32));
+
+    tolerance = 1e-05f;
+    execute(params);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+                        pooling_f32_scale,  // per entry: CASE_* macro, expected fused count, expected non-fused count, pooling mode, kernel name ("" forces none)
+                        ::testing::ValuesIn(std::vector<pooling_test_params>{
+                            pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_F32_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_F16_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_U8_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_U8_2, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_I8_1, 2, 3, pooling_mode::average, ""},
+                            pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::max, ""},
+                            pooling_test_params{CASE_POOLING_I8_2, 2, 3, pooling_mode::average, ""},
+                        }), );
+
+class pooling_scale_activation_quantize : public PoolingFusingTest {};
+// basic: input -> pooling -> scale -> relu -> quantize to u8 -> f32 reorder; in_lo / in_hi are single-element.
+TEST_P(pooling_scale_activation_quantize, basic) {
+    auto params = GetParam();
+    create_topologies(input_layout("input", get_input_layout(params)),
+                      data("in_lo", get_mem(get_single_element_layout(params), min_random, 0)),
+                      data("in_hi", get_mem(get_single_element_layout(params), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(params), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(params), 255)),
+                      data("scale_data", get_mem(get_per_channel_layout(params), 1.0f / tensor{1, 1, 4, 4}.count())),
+                      pooling("pooling", "input", "", params.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)),
+                      scale("scale", "pooling", "scale_data"),
+                      activation("activation", "scale", activation_func::relu),
+                      quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8),
+                      reorder("output_reorder", "quantize", params.default_format, data_types::f32));
+
+    tolerance = 1.0f;
+    execute(params);
+}
+
+// i8_output_data_type: same chain as the u8 tests but quantize emits i8; in_lo / in_hi are per-channel.
+TEST_P(pooling_scale_activation_quantize, i8_output_data_type) {
+    auto params = GetParam();
+    create_topologies(input_layout("input", get_input_layout(params)),
+                      data("in_lo", get_mem(get_per_channel_layout(params), min_random, 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(params), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(params), -127, 127)),
+                      data("out_hi", get_mem(get_single_element_layout(params), -127, 127)),  // NOTE(review): out_lo and out_hi get the same (-127, 127) arguments, unlike the fixed 0 / 255 of the u8 tests — confirm intended
+                      data("scale_data", get_mem(get_per_channel_layout(params), 1.0f / tensor{1, 1, 4, 4}.count())),
+                      pooling("pooling", "input", "", params.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)),
+                      scale("scale", "pooling", "scale_data"),
+                      activation("activation", "scale", activation_func::relu),
+                      quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::i8),
+                      reorder("output_reorder", "quantize", params.default_format, data_types::f32));
+
+    tolerance = 1.0f;
+    execute(params);
+}
+
+// per_channel: input -> pooling -> scale -> atan -> quantize to u8 -> f32 reorder; in_lo / in_hi are per-channel.
+TEST_P(pooling_scale_activation_quantize, per_channel) {
+    auto params = GetParam();
+    create_topologies(input_layout("input", get_input_layout(params)),
+                      data("in_lo", get_mem(get_per_channel_layout(params), min_random, 0)),
+                      data("in_hi", get_mem(get_per_channel_layout(params), 1, max_random)),
+                      data("out_lo", get_mem(get_single_element_layout(params), 0)),
+                      data("out_hi", get_mem(get_single_element_layout(params), 255)),
+                      data("scale_data", get_mem(get_per_channel_layout(params), 1.0f / tensor{1, 1, 4, 4}.count())),
+                      pooling("pooling", "input", "", params.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)),
+                      scale("scale", "pooling", "scale_data"),
+                      activation("activation", "scale", activation_func::atan),
+                      quantize("quantize", "activation", "in_lo", "in_hi", "out_lo", "out_hi", 255, data_types::u8),
+                      reorder("output_reorder", "quantize", params.default_format, data_types::f32));
+
+    tolerance = 1.0f;
+    execute(params);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+                         pooling_scale_activation_quantize,
+                         ::testing::ValuesIn(std::vector<pooling_test_params>{
+                            // Input type: FP32
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_4, 2, 5, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_4, 2, 5, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_5, 2, 5, pooling_mode::average, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F32_5, 2, 5, pooling_mode::max, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F32_6, 2, 5, pooling_mode::average, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F32_6, 2, 5, pooling_mode::max, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F32_7, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_7, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_8, 2, 5, pooling_mode::average, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F32_8, 2, 5, pooling_mode::max, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F32_9, 2, 5, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_9, 2, 5, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_10, 2, 5, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+
+                            // Input type: INT8
+                            pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_I8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_I8_5, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_I8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
+
+                            // Input type: UINT8
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 5, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_5, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::average, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_U8_4, 2, 5, pooling_mode::max, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_6, 2, 5, pooling_mode::max, "pooling_gpu_int8_ref"},
+                        }), );
+
+INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
+                         pooling_scale_activation_quantize,
+                         ::testing::ValuesIn(std::vector<pooling_test_params>{
+                            pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
+                            pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
+                            pooling_test_params{CASE_POOLING_I8_3, 2, 5, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 5, pooling_mode::average, "pooling_gpu_average_opt"},  // currently not enabled, fusing not supported
+                        }), );
+
+class pooling_scale_activation : public PoolingFusingTest {};
+TEST_P(pooling_scale_activation, basic) {
+    auto p = GetParam();
+
+    create_topologies(input_layout("input", get_input_layout(p)),
+                      data("scale_data", get_mem(get_per_channel_layout(p), 1.0f / tensor{1, 1, 4, 4}.count())),
+                      pooling("pooling", "input", "", p.pool_mode, tensor(1, 1, 4, 4), tensor(1, 1, 2, 2)),
+                      scale("scale", "pooling", "scale_data"),
+                      activation("activation", "scale", activation_func::relu),
+                      reorder("output_reorder", "activation", p.default_format, data_types::f32));
+
+    tolerance = 1e-05f;
+    execute(p);
+}
+
+INSTANTIATE_TEST_CASE_P(fusings_gpu,
+                        pooling_scale_activation,
+                        ::testing::ValuesIn(std::vector<pooling_test_params>{
+                            // Input type: F32
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_3, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F32_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F32_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F32_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F32_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F32_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F32_9, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_9, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+
+                            // Input type: INT8
+                            pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_I8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_I8_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_I8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+
+                            // Input type: UINT8
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_U8_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_U8_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+
+                            // Input type: FP16  Output type: F32
+                            pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+
+                            // Input type: FP16
+                            pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_1, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_2, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_7, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F16_FP16_8, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+
+                            // Input type: FP32
+                            pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_bfyx_block_opt"},
+                            pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_F16_3, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::average, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_F16_4, 2, 4, pooling_mode::max, "pooling_gpu_fs_b_yx_fsv32"},
+                            pooling_test_params{CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::average, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F32_F16_5, 2, 4, pooling_mode::max, "pooling_gpu_byxf_padding_opt"},
+                            pooling_test_params{CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::average, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F32_F16_6, 2, 4, pooling_mode::max, "pooling_gpu_byxf_opt"},
+                            pooling_test_params{CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_F16_7, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::average, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F32_F16_8, 2, 4, pooling_mode::max, "pooling_gpu_blocked"},
+                            pooling_test_params{CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::average, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_F16_9, 2, 4, pooling_mode::max, "pooling_gpu_ref"},
+                            pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::average, "pooling_gpu_bsv16_fsv16"},
+                            pooling_test_params{CASE_POOLING_F32_F16_10, 2, 4, pooling_mode::max, "pooling_gpu_bsv16_fsv16"},
+
+                            // Input type: INT8
+                            pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_I8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_I8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_I8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+
+                            // Input type: UINT8
+                            pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::average, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_b_fs_yx_fsv4"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::average, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_4, 2, 4, pooling_mode::max, "pooling_gpu_byxf_af32"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_5, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::average, "pooling_gpu_int8_ref"},
+                            pooling_test_params{CASE_POOLING_U8_FP16_6, 2, 4, pooling_mode::max, "pooling_gpu_int8_ref"},
+                     }), );
+
+INSTANTIATE_TEST_CASE_P(DISABLED_fusings_gpu,
+                        pooling_scale_activation,
+                        ::testing::ValuesIn(std::vector<pooling_test_params>{
+                            pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
+                            pooling_test_params{CASE_POOLING_I8_FP16_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
+                            pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32_simd32"},
+                            pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::max, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
+                            pooling_test_params{CASE_POOLING_I8_3, 2, 4, pooling_mode::average, "pooling_gpu_fs_bs_yx_bsv4_fsv32"},
+                     }), );
diff --git a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp
index aae7df75ca356f..78b1fa84c0778e 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp
+++ b/inference-engine/thirdparty/clDNN/tests/test_cases/pooling_gpu_test.cpp
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2016-2019 Intel Corporation
+// Copyright (c) 2016-2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -31,6 +31,14 @@
 using namespace cldnn;
 using namespace tests;
 
+namespace cldnn {
+template <>
+struct type_to_data_type<FLOAT16> {
+    static const data_types value = data_types::f16;
+};
+}  // namespace cldnn
+
+
 template <typename InputT, pooling_mode Mode>
 struct pooling_mode_output {
     using type = InputT;
@@ -71,7 +79,7 @@ template <typename InputT>
 struct pooling_accumulator<InputT, pooling_mode::max> {
     using output_t = typename pooling_mode_output<InputT, pooling_mode::max>::type;
 
-    pooling_accumulator() : _acc(std::numeric_limits<InputT>::min()) {}
+    pooling_accumulator() : _acc(std::numeric_limits<InputT>::lowest()) {}
 
     void accumulate(const InputT& val) {
         using std::max;
@@ -82,7 +90,7 @@ struct pooling_accumulator<InputT, pooling_mode::max> {
         return static_cast<output_t>(_acc);
     }
 
-    void reset() { _acc = std::numeric_limits<InputT>::min(); }
+    void reset() { _acc = std::numeric_limits<InputT>::lowest(); }
 
     InputT _acc;
 };
@@ -121,7 +129,7 @@ struct pooling_accumulator<InputT, pooling_mode::average> {
     }
 
     output_t get(size_t pool_x, size_t pool_y) {
-        return static_cast<output_t>(_acc / (pool_x * pool_y));
+        return static_cast<output_t>(_acc / static_cast<InputT>(pool_x * pool_y));
     }
 
     void reset() {
@@ -2351,6 +2359,7 @@ class pooling_test_base {
         auto input_lay = layout(input_type(),
                                 input_format(),
                                 input_size);
+
         auto topo = topology(
             input_layout("input", input_lay),
             pooling("pool",
@@ -2397,12 +2406,23 @@ class pooling_test_base {
         auto out_lay = out_mem.get_layout();
         auto out_ptr = out_mem.cldnn::memory::template pointer<output_t>();
 
+        std::string kernel;
+        for (auto i : net.get_primitives_info()) {
+            if (i.original_id == "pool") {
+                kernel = i.kernel_id;
+            }
+        }
+        std::cout << kernel << std::endl;
+        SCOPED_TRACE("\nkernel: " + kernel);
+
         ASSERT_EQ(out_lay.data_type, output_type());
         ASSERT_EQ(out_lay.size.batch[0], expected.size());
         ASSERT_EQ(out_lay.size.feature[0], expected[0].size());
         ASSERT_EQ(out_lay.size.spatial[1], expected[0][0].size());
         ASSERT_EQ(out_lay.size.spatial[0], expected[0][0][0].size());
 
+        bool compare_with_tolerance = input_type() == data_types::f16;
+
         for (size_t bi = 0; bi < batch_num(); ++bi)
             for (size_t fi = 0; fi < expected[0].size(); ++fi)
                 for (size_t yi = 0; yi < expected[0][0].size(); ++yi)
@@ -2411,9 +2431,14 @@ class pooling_test_base {
                         size_t offset = out_lay.get_linear_offset(coords);
                         auto ref_val = static_cast<float>(expected[bi][fi][yi][xi]);
                         auto actual_val = static_cast<float>(out_ptr[offset]);
-
-                        EXPECT_TRUE(are_equal(ref_val, actual_val))
-                            << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+                        if (compare_with_tolerance) {
+                            auto tolerance = 1;
+                            ASSERT_NEAR(ref_val, actual_val, tolerance)
+                                << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+                        } else {
+                            EXPECT_TRUE(are_equal(ref_val, actual_val))
+                                << "at b= " << bi << ", f= " << fi << ", y= " << yi << ", x= " << xi;
+                        }
                     }
 
     }
@@ -2560,16 +2585,19 @@ TEST_P(pooling_random_test, avg_u8) {
 INSTANTIATE_TEST_CASE_P(
     smoke_low_precision,
     pooling_random_test,
-    testing::Combine(
-        testing::Values(1, 2),
-        testing::Values(3, 32),
-        testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
-        testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
-        testing::Values(std::tuple<int, int>(1, 1)),
-        testing::Values(std::tuple<int, int>(0, 0)),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::byxf_af32, format::b_fs_yx_fsv32)
-    ),
-    testing::internal::DefaultParamName<pooling_random_test_params>);
+    testing::Combine(testing::Values(1, 2),
+                     testing::Values(3, 8),
+                     testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
+                     testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
+                     testing::Values(std::tuple<int, int>(2, 2)),
+                     testing::Values(std::tuple<int, int>(0, 0)),
+                     testing::Values(format::yxfb,
+                                     format::bfyx,
+                                     format::byxf_af32,
+                                     format::b_fs_yx_fsv4,
+                                     format::b_fs_yx_fsv16,
+                                     format::b_fs_yx_fsv32)),
+                    testing::internal::DefaultParamName<pooling_random_test_params>);
 
 template <typename InputT, pooling_mode Mode>
 class pooling_scale_random_test_base : public pooling_random_test_base<InputT, Mode> {
@@ -2619,30 +2647,44 @@ class pooling_scale_random_test_base : public pooling_random_test_base<InputT, M
     VF<output_t> _shift;
 };
 
-using pooling_scale_random_test = pooling_random_test;
+using pooling_random_test_fp16_fp32 = pooling_random_test;
+
+TEST_P(pooling_random_test_fp16_fp32, avg_fp16) {
+    auto test_case = pooling_random_test_base<FLOAT16, pooling_mode::average>();
+    ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam()));
+}
+
+TEST_P(pooling_random_test_fp16_fp32, max_fp16) {
+    auto test_case = pooling_random_test_base<FLOAT16, pooling_mode::max>();
+    ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam()));
+}
 
-TEST_P(pooling_scale_random_test, avg_i8) {
-    auto test_case = pooling_scale_random_test_base<int8_t, pooling_mode::average>();
+TEST_P(pooling_random_test_fp16_fp32, avg_fp32) {
+    auto test_case = pooling_random_test_base<float, pooling_mode::average>();
     ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam()));
 }
 
-TEST_P(pooling_scale_random_test, avg_u8) {
-    auto test_case = pooling_scale_random_test_base<uint8_t, pooling_mode::average>();
+TEST_P(pooling_random_test_fp16_fp32, max_fp32) {
+    auto test_case = pooling_random_test_base<float, pooling_mode::max>();
     ASSERT_NO_FATAL_FAILURE(test_case.run_random(GetParam()));
 }
 
 INSTANTIATE_TEST_CASE_P(
     smoke_low_precision,
-    pooling_scale_random_test,
-    testing::Combine(
-        testing::Values(1, 2),
-        testing::Values(3, 32),
-        testing::Values(std::tuple<size_t, size_t>(3, 3), std::tuple<size_t, size_t>(8, 8)),
-        testing::Values(std::tuple<size_t, size_t>(1, 1), std::tuple<size_t, size_t>(3, 3)),
-        testing::Values(std::tuple<int, int>(1, 1)),
-        testing::Values(std::tuple<int, int>(0, 0)),
-        testing::Values(format::bfyx, format::b_fs_yx_fsv4, format::byxf_af32, format::b_fs_yx_fsv32)
-    ),
+    pooling_random_test_fp16_fp32,
+    testing::Combine(testing::Values(1, 2),
+                     testing::Values(3, 8),
+                     testing::Values(std::tuple<size_t, size_t>(12, 12), std::tuple<size_t, size_t>(24, 24)),
+                     testing::Values(std::tuple<size_t, size_t>(4, 4), std::tuple<size_t, size_t>(2, 2)),
+                     testing::Values(std::tuple<int, int>(2, 2)),
+                     testing::Values(std::tuple<int, int>(0, 0)),
+                     testing::Values(format::yxfb,
+                                     format::bfyx,
+                                     format::byxf,
+                                     format::b_fs_yx_fsv16,
+                                     format::fs_b_yx_fsv32,
+                                     format::b_fs_yx_fsv32,
+                                     format::fs_bs_yx_bsv4_fsv32)),
     testing::internal::DefaultParamName<pooling_random_test_params>);
 
 TEST(pooling_forward_gpu, bsv16_fsv16_max_16x16x8x8_input_2x2_pool_2x2_stride)
diff --git a/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h b/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h
index 5438037c814582..607bda0d62ac87 100644
--- a/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h
+++ b/inference-engine/thirdparty/clDNN/tests/test_utils/float16.h
@@ -1,5 +1,5 @@
 /*
-// Copyright (c) 2017 Intel Corporation
+// Copyright (c) 2020 Intel Corporation
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -17,120 +17,96 @@
 #pragma once
 #include "include/math_utils.h"
 
-struct FLOAT16
-{
-    struct representation
-    {
-        uint16_t significand : 10;
-        uint16_t exponent : 5;
+struct FLOAT16 {
+    struct representation {
+        uint16_t significand : 10;  // declared first: little-endian GCC/Clang/MSVC allocate bit-fields from bit 0 upward
+        uint16_t exponent : 5;      // next 5 bits above the significand under that allocation order
         uint16_t sign : 1;
     };
 
-    union
-    {
-        uint16_t v = 0;
-        representation format; // added this struct for the .natvis file (for debug)
+    union {
+        uint16_t v;
+        representation format;  // added this struct for the .natvis file (for debug)
     };
 
-    static FLOAT16 min_val()
-    {
-        FLOAT16 f16;
-        f16.v = 0xFC00;
-        return f16;
-    }
+    static constexpr FLOAT16 min_val() { return FLOAT16((uint16_t)(0x0400)); }
+
+    static constexpr FLOAT16 lowest_val() { return FLOAT16((uint16_t)(0xfbff)); }
 
-    operator double() const { double d = (double)float16_to_float32(v); return d; }
-    operator float() const { float f = float16_to_float32(v); return f; }
-    operator int16_t() const { return *(int16_t*)(&v); }
+    operator double() const {
+        double d = (double)float16_to_float32(v);
+        return d;
+    }
+    operator float() const {
+        float f = float16_to_float32(v);
+        return f;
+    }
+    operator int16_t() const { return *(int16_t *)(&v); }
     operator long long int() const { return v; }
     operator uint32_t() const { return v; }
     FLOAT16(float f) { v = float32_to_float16(f); }
+    FLOAT16(size_t s) { v = float32_to_float16(float(s)); }
     FLOAT16(int i) { v = float32_to_float16(float(i)); }
-    explicit FLOAT16(int16_t d) : v(d) {}
-    friend FLOAT16 operator +(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend FLOAT16 operator -(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend FLOAT16 operator *(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend FLOAT16 operator /(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend bool operator >(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend bool operator >=(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend bool operator <(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend bool operator >(const FLOAT16 &v1, const float &v2);
-    friend bool operator <(const FLOAT16 &v1, const float &v2);
-    friend bool operator ==(const FLOAT16 &v1, const FLOAT16 &v2);
-    friend bool operator !=(const FLOAT16 &v1, const FLOAT16 &v2);
-
-    FLOAT16() {}
-
-    FLOAT16& operator +=(const FLOAT16 &v1)
-    {
-            *this = (float)*this + (float)v1;
-            return *this;
+    // TODO: The constructors below should take a tag argument to avoid ambiguous behaviour, e.g. FLOAT16(16.f) != FLOAT16((uint16_t)16)
+    explicit constexpr FLOAT16(int16_t d) : v(d) {}
+    explicit constexpr FLOAT16(uint16_t d) : v(d) {}
+    friend FLOAT16 operator+(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend FLOAT16 operator-(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend FLOAT16 operator*(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend FLOAT16 operator/(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend bool operator>(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend bool operator>=(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend bool operator<(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend bool operator>(const FLOAT16 &v1, const float &v2);
+    friend bool operator<(const FLOAT16 &v1, const float &v2);
+    friend bool operator==(const FLOAT16 &v1, const FLOAT16 &v2);
+    friend bool operator!=(const FLOAT16 &v1, const FLOAT16 &v2);
+
+    FLOAT16() { v = 0; }
+
+    FLOAT16 &operator+=(const FLOAT16 &v1) {
+        *this = (float)*this + (float)v1;
+        return *this;
     }
 
-    FLOAT16& operator /=(const FLOAT16 &v1)
-    {
-            *this = (float)*this / (float)v1;
-            return *this;
+    FLOAT16 &operator/=(const FLOAT16 &v1) {
+        *this = (float)*this / (float)v1;
+        return *this;
     }
 
-    FLOAT16& operator *=(const FLOAT16 &v1)
-    {
+    FLOAT16 &operator*=(const FLOAT16 &v1) {
         *this = (float)*this * (float)v1;
         return *this;
     }
 };
 
-inline FLOAT16 operator +(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 + (float)v2;
-}
-
-inline FLOAT16 operator -(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 - (float)v2;
-}
-
-inline FLOAT16 operator *(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 * (float)v2;
-}
-
-inline FLOAT16 operator /(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 / (float)v2;
-}
-
-inline bool operator >(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 > (float)v2;
-}
-
-inline bool operator >=(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 >= (float)v2;
-}
-
-inline bool operator <(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return (float)v1 < (float)v2;
-}
-
-inline bool operator >(const FLOAT16 &v1, const float &v2)
-{
-    return (float)v1 > v2;
-}
-
-inline bool operator <(const FLOAT16 &v1, const float &v2)
-{
-    return (float)v1 < v2;
-}
-
-inline bool operator ==(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return v1.v == v2.v;
-}
-
-inline bool operator !=(const FLOAT16 &v1, const FLOAT16 &v2)
-{
-    return v1.v != v2.v;
-}
+inline FLOAT16 operator+(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 + (float)v2; }
+
+inline FLOAT16 operator-(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 - (float)v2; }
+
+inline FLOAT16 operator*(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 * (float)v2; }
+
+inline FLOAT16 operator/(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 / (float)v2; }
+
+inline bool operator>(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 > (float)v2; }
+
+inline bool operator>=(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 >= (float)v2; }
+
+inline bool operator<(const FLOAT16 &v1, const FLOAT16 &v2) { return (float)v1 < (float)v2; }
+
+inline bool operator>(const FLOAT16 &v1, const float &v2) { return (float)v1 > v2; }
+
+inline bool operator<(const FLOAT16 &v1, const float &v2) { return (float)v1 < v2; }
+
+inline bool operator==(const FLOAT16 &v1, const FLOAT16 &v2) { return v1.v == v2.v; }
+
+inline bool operator!=(const FLOAT16 &v1, const FLOAT16 &v2) { return v1.v != v2.v; }
+
+namespace std {
+
+template <>
+struct numeric_limits<FLOAT16> {
+    static constexpr FLOAT16 lowest() { return FLOAT16::lowest_val(); }
+};
+
+}  // namespace std
diff --git a/install_dependencies.sh b/install_dependencies.sh
index 6fae78066ebd27..ca31972c3ba1bf 100755
--- a/install_dependencies.sh
+++ b/install_dependencies.sh
@@ -22,6 +22,13 @@ yes_or_no() {
 # install dependencies
 if [ -f /etc/lsb-release ]; then
     # Ubuntu
+    host_cpu=$(uname -m)  # machine hardware name; decides whether the multilib packages are requested
+    if [ "$host_cpu" = x86_64 ]; then  # quoted so an empty uname result cannot break the test expression
+        x86_64_specific_packages="gcc-multilib g++-multilib"  # expanded unquoted in the apt-get call below so it splits into two package names
+    else
+        x86_64_specific_packages=""  # expands to nothing in the apt-get call on other hosts
+    fi
+
     sudo -E apt update
     sudo -E apt-get install -y \
             build-essential \
@@ -32,8 +39,7 @@ if [ -f /etc/lsb-release ]; then
             ca-certificates \
             git \
             libboost-regex-dev \
-            gcc-multilib \
-            g++-multilib \
+            $x86_64_specific_packages \
             libgtk2.0-dev \
             pkg-config \
             unzip \
diff --git a/model-optimizer/extensions/front/caffe/lrn_ext.py b/model-optimizer/extensions/front/caffe/lrn_ext.py
index a8ee3f59c6924e..1c0a72ca06eec3 100644
--- a/model-optimizer/extensions/front/caffe/lrn_ext.py
+++ b/model-optimizer/extensions/front/caffe/lrn_ext.py
@@ -29,7 +29,7 @@ def extract(cls, node):
         AttributedLRN.update_node_stat(node, {
             'alpha': param.alpha,
             'beta': param.beta,
-            'bias': 1,
+            'bias': param.k,
             'local_size': param.local_size,
             'region': region,
         })
diff --git a/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py b/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py
index 0ae58ca77e6642..9c070aa8a58a90 100644
--- a/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py
+++ b/model-optimizer/extensions/front/onnx/roifeatureextractor_ext.py
@@ -31,6 +31,7 @@ def extract(cls, node):
                      sampling_ratio=onnx_attr(node, 'sampling_ratio', 'i', 2),
                      distribute_rois_between_levels=onnx_attr(node, 'distribute_rois_between_levels', 'i', 1),
                      preserve_rois_order=onnx_attr(node, 'preserve_rois_order', 'i', 1),
+                     aligned=onnx_attr(node, 'aligned', 'i', 0),
                      num_classes=onnx_attr(node, 'num_classes', 'i', 81),
                      post_nms_count=onnx_attr(node, 'post_nms_count', 'i', 2000),
                      score_threshold=onnx_attr(node, 'score_threshold', 'f', 0.05),
diff --git a/model-optimizer/extensions/ops/roifeatureextractor_onnx.py b/model-optimizer/extensions/ops/roifeatureextractor_onnx.py
index 87d6a06d7dfe49..26c2e63b4978c3 100644
--- a/model-optimizer/extensions/ops/roifeatureextractor_onnx.py
+++ b/model-optimizer/extensions/ops/roifeatureextractor_onnx.py
@@ -41,7 +41,8 @@ def backend_attrs(self):
             'image_id',
             'output_size',
             'sampling_ratio',
-            'preserve_rois_order']
+            'preserve_rois_order',
+            'aligned']
 
     @staticmethod
     def infer(node):
diff --git a/ngraph/python/src/ngraph/__init__.py b/ngraph/python/src/ngraph/__init__.py
index 6055ba92e59c59..d644fda3758b86 100644
--- a/ngraph/python/src/ngraph/__init__.py
+++ b/ngraph/python/src/ngraph/__init__.py
@@ -29,6 +29,7 @@
 from ngraph.ops import acos
 from ngraph.ops import add
 from ngraph.ops import asin
+from ngraph.ops import assign
 from ngraph.ops import atan
 from ngraph.ops import avg_pool
 from ngraph.ops import batch_norm_inference
@@ -59,6 +60,7 @@
 from ngraph.ops import embedding_bag_offsets_sum
 from ngraph.ops import embedding_bag_packed_sum
 from ngraph.ops import embedding_segments_sum
+from ngraph.ops import extract_image_patches
 from ngraph.ops import equal
 from ngraph.ops import erf
 from ngraph.ops import exp
@@ -108,6 +110,7 @@
 from ngraph.ops import prior_box_clustered
 from ngraph.ops import psroi_pooling
 from ngraph.ops import proposal
+from ngraph.ops import read_value
 from ngraph.ops import reduce_logical_and
 from ngraph.ops import reduce_logical_or
 from ngraph.ops import reduce_max
diff --git a/ngraph/python/src/ngraph/ops.py b/ngraph/python/src/ngraph/ops.py
index 51a299ef8f97c8..8d5fe41dfb4eae 100644
--- a/ngraph/python/src/ngraph/ops.py
+++ b/ngraph/python/src/ngraph/ops.py
@@ -3438,3 +3438,53 @@ def proposal(
     return _get_node_factory().create(
         "Proposal", [class_probs, box_logits, as_node(image_shape)], attrs
     )
+
+
+@nameable_op
+def assign(new_value: NodeInput, variable_id: str, name: Optional[str] = None) -> Node:
+    """Return a node which produces the Assign operation.
+
+    :param new_value:    Node producing a value to be assigned to a variable.
+    :param variable_id:  Id of a variable to be updated.
+    :param name:         Optional name for output node.
+    :return: Assign node
+    """
+    return _get_node_factory().create("Assign", [as_node(new_value)], {"variable_id": variable_id})
+
+
+@nameable_op
+def read_value(init_value: NodeInput, variable_id: str, name: Optional[str] = None) -> Node:
+    """Return a node which produces the ReadValue operation.
+
+    :param init_value:   Node producing a value to be returned instead of an unassigned variable.
+    :param variable_id:  Id of a variable to be read.
+    :param name:         Optional name for output node.
+    :return: ReadValue node
+    """
+    return _get_node_factory().create("ReadValue", [as_node(init_value)], {"variable_id": variable_id})
+
+
+@nameable_op
+def extract_image_patches(
+    image: NodeInput,
+    sizes: TensorShape,
+    strides: List[int],
+    rates: TensorShape,
+    auto_pad: str,
+    name: Optional[str] = None,
+) -> Node:
+    """Return a node which produces the ExtractImagePatches operation.
+
+    :param image:     4-D Input data to extract image patches.
+    :param sizes:     Patch size in the format of [size_rows, size_cols].
+    :param strides:   Patch movement stride in the format of [stride_rows, stride_cols].
+    :param rates:     Element selection rate for creating a patch.
+    :param auto_pad:  Padding type.
+    :param name:      Optional name for output node.
+    :return: ExtractImagePatches node
+    """
+    return _get_node_factory().create(
+        "ExtractImagePatches",
+        [as_node(image)],
+        {"sizes": sizes, "strides": strides, "rates": rates, "auto_pad": auto_pad},
+    )
diff --git a/ngraph/python/test/ngraph/test_create_op.py b/ngraph/python/test/ngraph/test_create_op.py
index abb50adce9e26d..9b041d8fc8d61c 100644
--- a/ngraph/python/test/ngraph/test_create_op.py
+++ b/ngraph/python/test/ngraph/test_create_op.py
@@ -845,3 +845,39 @@ def test_proposal(int_dtype, fp_dtype):
     assert node.get_type_name() == "Proposal"
     assert node.get_output_size() == 1
     assert list(node.get_output_shape(0)) == [batch_size * attributes["attrs.post_nms_topn"], 5]
+
+
+def test_read_value():
+    init_value = ng.parameter([2, 2], name="init_value", dtype=np.int32)
+
+    node = ng.read_value(init_value, "var_id_667")
+
+    assert node.get_type_name() == "ReadValue"
+    assert node.get_output_size() == 1
+    assert list(node.get_output_shape(0)) == [2, 2]
+    assert node.get_output_element_type(0) == Type.i32
+
+
+def test_assign():
+    input_data = ng.parameter([5, 7], name="input_data", dtype=np.int32)
+    rv = ng.read_value(input_data, "var_id_667")
+    node = ng.assign(rv, "var_id_667")
+
+    assert node.get_type_name() == "Assign"
+    assert node.get_output_size() == 1
+    assert list(node.get_output_shape(0)) == [5, 7]
+    assert node.get_output_element_type(0) == Type.i32
+
+
+def test_extract_image_patches():
+    image = ng.parameter([64, 3, 10, 10], name="image", dtype=np.int32)
+    sizes = [3, 3]
+    strides = [5, 5]
+    rates = [1, 1]
+    padding = "VALID"
+    node = ng.extract_image_patches(image, sizes, strides, rates, padding)
+
+    assert node.get_type_name() == "ExtractImagePatches"
+    assert node.get_output_size() == 1
+    assert list(node.get_output_shape(0)) == [64, 27, 2, 2]  # presumably 27 = 3 channels * 3 * 3 patch and 2 = (10 - 3) // 5 + 1 per spatial dim with VALID padding
+    assert node.get_output_element_type(0) == Type.i32
diff --git a/ngraph/src/ngraph/op/acosh.cpp b/ngraph/src/ngraph/op/acosh.cpp
index a6fea9542ef6c2..514bc2c2f0c06d 100644
--- a/ngraph/src/ngraph/op/acosh.cpp
+++ b/ngraph/src/ngraph/op/acosh.cpp
@@ -71,6 +71,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, out);
             break;
+            TYPE_CASE(bf16)(arg0, out);
+            break;
+            TYPE_CASE(f16)(arg0, out);
+            break;
             TYPE_CASE(f32)(arg0, out);
             break;
             TYPE_CASE(f64)(arg0, out);
diff --git a/ngraph/src/ngraph/op/add.cpp b/ngraph/src/ngraph/op/add.cpp
index a39d4213352163..9e65778db867c1 100644
--- a/ngraph/src/ngraph/op/add.cpp
+++ b/ngraph/src/ngraph/op/add.cpp
@@ -108,6 +108,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/and.cpp b/ngraph/src/ngraph/op/and.cpp
index e309cb80a43a3a..4e00373450a680 100644
--- a/ngraph/src/ngraph/op/and.cpp
+++ b/ngraph/src/ngraph/op/and.cpp
@@ -87,6 +87,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/asinh.cpp b/ngraph/src/ngraph/op/asinh.cpp
index b9ae4c16f659ab..2efc0a341b2b0d 100644
--- a/ngraph/src/ngraph/op/asinh.cpp
+++ b/ngraph/src/ngraph/op/asinh.cpp
@@ -71,6 +71,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, out);
             break;
+            TYPE_CASE(bf16)(arg0, out);
+            break;
+            TYPE_CASE(f16)(arg0, out);
+            break;
             TYPE_CASE(f32)(arg0, out);
             break;
             TYPE_CASE(f64)(arg0, out);
diff --git a/ngraph/src/ngraph/op/atanh.cpp b/ngraph/src/ngraph/op/atanh.cpp
index ed33af21f93a38..ee0ba9539a805b 100644
--- a/ngraph/src/ngraph/op/atanh.cpp
+++ b/ngraph/src/ngraph/op/atanh.cpp
@@ -71,6 +71,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, out);
             break;
+            TYPE_CASE(bf16)(arg0, out);
+            break;
+            TYPE_CASE(f16)(arg0, out);
+            break;
             TYPE_CASE(f32)(arg0, out);
             break;
             TYPE_CASE(f64)(arg0, out);
diff --git a/ngraph/src/ngraph/op/convert.cpp b/ngraph/src/ngraph/op/convert.cpp
index 12cfaa7110ebdb..9695c26b307546 100644
--- a/ngraph/src/ngraph/op/convert.cpp
+++ b/ngraph/src/ngraph/op/convert.cpp
@@ -99,6 +99,8 @@ namespace
             break;
             TYPE_OUT_CASE(bf16)(arg, out);
             break;
+            TYPE_OUT_CASE(f16)(arg, out);
+            break;
             TYPE_OUT_CASE(f32)(arg, out);
             break;
             TYPE_OUT_CASE(f64)(arg, out);
@@ -132,6 +134,8 @@ namespace
             break;
             TYPE_CASE(bf16)(arg, out);
             break;
+            TYPE_CASE(f16)(arg, out);
+            break;
             TYPE_CASE(f32)(arg, out);
             break;
             TYPE_CASE(f64)(arg, out);
diff --git a/ngraph/src/ngraph/op/divide.cpp b/ngraph/src/ngraph/op/divide.cpp
index 1b4e4bfda37eaa..de125abe76bfdb 100644
--- a/ngraph/src/ngraph/op/divide.cpp
+++ b/ngraph/src/ngraph/op/divide.cpp
@@ -125,6 +125,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec, pythondiv);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec, pythondiv);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec, pythondiv);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec, pythondiv);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec, pythondiv);
diff --git a/ngraph/src/ngraph/op/equal.cpp b/ngraph/src/ngraph/op/equal.cpp
index 972ac43a13f435..a4084210f7b94b 100644
--- a/ngraph/src/ngraph/op/equal.cpp
+++ b/ngraph/src/ngraph/op/equal.cpp
@@ -83,6 +83,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/fused/matmul.cpp b/ngraph/src/ngraph/op/fused/matmul.cpp
index cfe6426ff7bc70..b9680e80c10672 100644
--- a/ngraph/src/ngraph/op/fused/matmul.cpp
+++ b/ngraph/src/ngraph/op/fused/matmul.cpp
@@ -226,6 +226,8 @@ namespace
             break;
             TYPE_CASE(bf16)(arg0, arg1, output, transpose_a, transpose_b);
             break;
+            TYPE_CASE(f16)(arg0, arg1, output, transpose_a, transpose_b);
+            break;
             TYPE_CASE(f32)(arg0, arg1, output, transpose_a, transpose_b);
             break;
             TYPE_CASE(f64)(arg0, arg1, output, transpose_a, transpose_b);
diff --git a/ngraph/src/ngraph/op/fused/squeeze.cpp b/ngraph/src/ngraph/op/fused/squeeze.cpp
index 6e7977c018dcc7..f1038ebefbbed1 100644
--- a/ngraph/src/ngraph/op/fused/squeeze.cpp
+++ b/ngraph/src/ngraph/op/fused/squeeze.cpp
@@ -201,6 +201,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, out);
             break;
+            TYPE_CASE(bf16)(arg0, out);
+            break;
+            TYPE_CASE(f16)(arg0, out);
+            break;
             TYPE_CASE(f32)(arg0, out);
             break;
             TYPE_CASE(f64)(arg0, out);
diff --git a/ngraph/src/ngraph/op/fused/unsqueeze.cpp b/ngraph/src/ngraph/op/fused/unsqueeze.cpp
index 8f8cc569a4f134..ced8d8952e6ae8 100644
--- a/ngraph/src/ngraph/op/fused/unsqueeze.cpp
+++ b/ngraph/src/ngraph/op/fused/unsqueeze.cpp
@@ -161,6 +161,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, out);
             break;
+            TYPE_CASE(bf16)(arg0, out);
+            break;
+            TYPE_CASE(f16)(arg0, out);
+            break;
             TYPE_CASE(f32)(arg0, out);
             break;
             TYPE_CASE(f64)(arg0, out);
diff --git a/ngraph/src/ngraph/op/gather.cpp b/ngraph/src/ngraph/op/gather.cpp
index 6ad7b1641e98a3..a40ff71533db2a 100644
--- a/ngraph/src/ngraph/op/gather.cpp
+++ b/ngraph/src/ngraph/op/gather.cpp
@@ -292,6 +292,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, axis);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, axis);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, axis);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, axis);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, axis);
diff --git a/ngraph/src/ngraph/op/greater.cpp b/ngraph/src/ngraph/op/greater.cpp
index a8d483fb56da5c..cc5b70ceae7995 100644
--- a/ngraph/src/ngraph/op/greater.cpp
+++ b/ngraph/src/ngraph/op/greater.cpp
@@ -83,6 +83,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/greater_eq.cpp b/ngraph/src/ngraph/op/greater_eq.cpp
index f9144afb3918e0..b211e72b331bb9 100644
--- a/ngraph/src/ngraph/op/greater_eq.cpp
+++ b/ngraph/src/ngraph/op/greater_eq.cpp
@@ -83,6 +83,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/less.cpp b/ngraph/src/ngraph/op/less.cpp
index 217780a4d84fd4..23e15be240414a 100644
--- a/ngraph/src/ngraph/op/less.cpp
+++ b/ngraph/src/ngraph/op/less.cpp
@@ -83,6 +83,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/less_eq.cpp b/ngraph/src/ngraph/op/less_eq.cpp
index 0107c33a380c01..47a66bc891a3e1 100644
--- a/ngraph/src/ngraph/op/less_eq.cpp
+++ b/ngraph/src/ngraph/op/less_eq.cpp
@@ -83,6 +83,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/max.cpp b/ngraph/src/ngraph/op/max.cpp
index 747452dfe52745..73f7a138ba5e43 100644
--- a/ngraph/src/ngraph/op/max.cpp
+++ b/ngraph/src/ngraph/op/max.cpp
@@ -117,6 +117,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/max_pool.cpp b/ngraph/src/ngraph/op/max_pool.cpp
index 3cca67b3863aa4..1d35f620b65d0d 100644
--- a/ngraph/src/ngraph/op/max_pool.cpp
+++ b/ngraph/src/ngraph/op/max_pool.cpp
@@ -562,6 +562,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, out_shape, kernel, strides, pad_begin, pad_end);
             break;
+            TYPE_CASE(bf16)(arg, out, out_shape, kernel, strides, pad_begin, pad_end);
+            break;
+            TYPE_CASE(f16)(arg, out, out_shape, kernel, strides, pad_begin, pad_end);
+            break;
             TYPE_CASE(f32)(arg, out, out_shape, kernel, strides, pad_begin, pad_end);
             break;
             TYPE_CASE(f64)(arg, out, out_shape, kernel, strides, pad_begin, pad_end);
diff --git a/ngraph/src/ngraph/op/maximum.cpp b/ngraph/src/ngraph/op/maximum.cpp
index be9607a0e858fe..83443a0496213e 100644
--- a/ngraph/src/ngraph/op/maximum.cpp
+++ b/ngraph/src/ngraph/op/maximum.cpp
@@ -106,6 +106,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/min.cpp b/ngraph/src/ngraph/op/min.cpp
index 8ba8a35502fb0e..c3f41930be1496 100644
--- a/ngraph/src/ngraph/op/min.cpp
+++ b/ngraph/src/ngraph/op/min.cpp
@@ -117,6 +117,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/minimum.cpp b/ngraph/src/ngraph/op/minimum.cpp
index 0e3be4474572e8..a3365e5c26a0c1 100644
--- a/ngraph/src/ngraph/op/minimum.cpp
+++ b/ngraph/src/ngraph/op/minimum.cpp
@@ -105,6 +105,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/multiply.cpp b/ngraph/src/ngraph/op/multiply.cpp
index 92843b6ac48b9f..956e0c75f1e9fe 100644
--- a/ngraph/src/ngraph/op/multiply.cpp
+++ b/ngraph/src/ngraph/op/multiply.cpp
@@ -97,6 +97,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/non_zero.cpp b/ngraph/src/ngraph/op/non_zero.cpp
index e3ae0ea3f64797..ab4c719cf823c7 100644
--- a/ngraph/src/ngraph/op/non_zero.cpp
+++ b/ngraph/src/ngraph/op/non_zero.cpp
@@ -156,6 +156,8 @@ namespace
             break;
             TYPE_CASE(bf16)(input, output);
             break;
+            TYPE_CASE(f16)(input, output);
+            break;
             TYPE_CASE(f32)(input, output);
             break;
             TYPE_CASE(f64)(input, output);
diff --git a/ngraph/src/ngraph/op/not_equal.cpp b/ngraph/src/ngraph/op/not_equal.cpp
index dd6f470eca4fb6..034690c1ca7ebd 100644
--- a/ngraph/src/ngraph/op/not_equal.cpp
+++ b/ngraph/src/ngraph/op/not_equal.cpp
@@ -83,6 +83,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/or.cpp b/ngraph/src/ngraph/op/or.cpp
index 29f4a4beffb267..587e44fd25cc7e 100644
--- a/ngraph/src/ngraph/op/or.cpp
+++ b/ngraph/src/ngraph/op/or.cpp
@@ -81,6 +81,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/power.cpp b/ngraph/src/ngraph/op/power.cpp
index b05ad1f6878e13..18d88d7e7d5ad5 100644
--- a/ngraph/src/ngraph/op/power.cpp
+++ b/ngraph/src/ngraph/op/power.cpp
@@ -102,6 +102,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/product.cpp b/ngraph/src/ngraph/op/product.cpp
index 05397656ce1e46..9aa94898145913 100644
--- a/ngraph/src/ngraph/op/product.cpp
+++ b/ngraph/src/ngraph/op/product.cpp
@@ -80,6 +80,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/read_value.cpp b/ngraph/src/ngraph/op/read_value.cpp
index 9f7abb6ed7dc75..f6581a6b6ed479 100644
--- a/ngraph/src/ngraph/op/read_value.cpp
+++ b/ngraph/src/ngraph/op/read_value.cpp
@@ -21,8 +21,8 @@ using namespace ngraph;
 
 constexpr NodeTypeInfo op::ReadValue::type_info;
 
-op::ReadValue::ReadValue(const Output<Node>& new_value, const std::string& variable_id)
-    : Op({new_value})
+op::ReadValue::ReadValue(const Output<Node>& init_value, const std::string& variable_id)
+    : Op({init_value})
     , m_variable_id(variable_id)
 {
     constructor_validate_and_infer_types();
diff --git a/ngraph/src/ngraph/op/read_value.hpp b/ngraph/src/ngraph/op/read_value.hpp
index ea451f3ed45064..ca3f5325f0dcb0 100644
--- a/ngraph/src/ngraph/op/read_value.hpp
+++ b/ngraph/src/ngraph/op/read_value.hpp
@@ -36,9 +36,9 @@ namespace ngraph
 
                 /// \brief Constructs a ReadValue operation.
                 ///
-                /// \param new_value   Node that produces the input tensor.
-                /// \param variable_id identificator of the variable to create.
-                ReadValue(const Output<Node>& new_value, const std::string& variable_id);
+                /// \param init_value   Node that produces the input tensor.
+                /// \param variable_id  identifier of the variable to create.
+                ReadValue(const Output<Node>& init_value, const std::string& variable_id);
 
                 void validate_and_infer_types() override;
 
diff --git a/ngraph/src/ngraph/op/reduce_mean.cpp b/ngraph/src/ngraph/op/reduce_mean.cpp
index 4ed0ba07c325a3..d527d25487817f 100644
--- a/ngraph/src/ngraph/op/reduce_mean.cpp
+++ b/ngraph/src/ngraph/op/reduce_mean.cpp
@@ -72,6 +72,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/reduce_prod.cpp b/ngraph/src/ngraph/op/reduce_prod.cpp
index dfe249fde628a8..cd709378d442e7 100644
--- a/ngraph/src/ngraph/op/reduce_prod.cpp
+++ b/ngraph/src/ngraph/op/reduce_prod.cpp
@@ -76,6 +76,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/reduce_sum.cpp b/ngraph/src/ngraph/op/reduce_sum.cpp
index 068a84edad1878..8032a3350d0744 100644
--- a/ngraph/src/ngraph/op/reduce_sum.cpp
+++ b/ngraph/src/ngraph/op/reduce_sum.cpp
@@ -87,6 +87,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/scatter_elements_update.cpp b/ngraph/src/ngraph/op/scatter_elements_update.cpp
index c7a2542d1657e8..93dc899381c2af 100644
--- a/ngraph/src/ngraph/op/scatter_elements_update.cpp
+++ b/ngraph/src/ngraph/op/scatter_elements_update.cpp
@@ -267,6 +267,8 @@ namespace
             break;
             TYPE_CASE(bf16)(arg0, arg1, arg2, arg3, out, normalized_axis);
             break;
+            TYPE_CASE(f16)(arg0, arg1, arg2, arg3, out, normalized_axis);
+            break;
             TYPE_CASE(f32)(arg0, arg1, arg2, arg3, out, normalized_axis);
             break;
             TYPE_CASE(f64)(arg0, arg1, arg2, arg3, out, normalized_axis);
diff --git a/ngraph/src/ngraph/op/softmax.cpp b/ngraph/src/ngraph/op/softmax.cpp
index a22a3a43e02803..6f4e2e5d4d0bf9 100644
--- a/ngraph/src/ngraph/op/softmax.cpp
+++ b/ngraph/src/ngraph/op/softmax.cpp
@@ -173,7 +173,8 @@ namespace
     bool evaluate_softmax(const HostTensorPtr& arg, const HostTensorPtr& out, const AxisSet& axes)
     {
         auto shape = out->get_shape();
-        return try_evaluate_softmax<element::Type_t::f32>(arg, out, shape, axes) ||
+        return try_evaluate_softmax<element::Type_t::f16>(arg, out, shape, axes) ||
+               try_evaluate_softmax<element::Type_t::f32>(arg, out, shape, axes) ||
                try_evaluate_softmax<element::Type_t::f64>(arg, out, shape, axes);
     }
 }
diff --git a/ngraph/src/ngraph/op/strided_slice.cpp b/ngraph/src/ngraph/op/strided_slice.cpp
index 28412a9a1e66d2..9a8acec0671a2d 100644
--- a/ngraph/src/ngraph/op/strided_slice.cpp
+++ b/ngraph/src/ngraph/op/strided_slice.cpp
@@ -291,6 +291,8 @@ namespace
             break;
             TYPE_CASE(bf16)(in, slice_plan, out);
             break;
+            TYPE_CASE(f16)(in, slice_plan, out);
+            break;
             TYPE_CASE(f32)(in, slice_plan, out);
             break;
             TYPE_CASE(f64)(in, slice_plan, out);
diff --git a/ngraph/src/ngraph/op/subtract.cpp b/ngraph/src/ngraph/op/subtract.cpp
index 50fb2624ba875f..7d638acc9fa6ab 100644
--- a/ngraph/src/ngraph/op/subtract.cpp
+++ b/ngraph/src/ngraph/op/subtract.cpp
@@ -103,6 +103,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/op/sum.cpp b/ngraph/src/ngraph/op/sum.cpp
index 4b8a56dd9bb752..24643889f97846 100644
--- a/ngraph/src/ngraph/op/sum.cpp
+++ b/ngraph/src/ngraph/op/sum.cpp
@@ -91,6 +91,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg, out, axes);
             break;
+            TYPE_CASE(bf16)(arg, out, axes);
+            break;
+            TYPE_CASE(f16)(arg, out, axes);
+            break;
             TYPE_CASE(f32)(arg, out, axes);
             break;
             TYPE_CASE(f64)(arg, out, axes);
diff --git a/ngraph/src/ngraph/op/xor.cpp b/ngraph/src/ngraph/op/xor.cpp
index 40a0019d8e0612..3c144500951017 100644
--- a/ngraph/src/ngraph/op/xor.cpp
+++ b/ngraph/src/ngraph/op/xor.cpp
@@ -87,6 +87,10 @@ namespace
             break;
             TYPE_CASE(u64)(arg0, arg1, out, broadcast_spec);
             break;
+            TYPE_CASE(bf16)(arg0, arg1, out, broadcast_spec);
+            break;
+            TYPE_CASE(f16)(arg0, arg1, out, broadcast_spec);
+            break;
             TYPE_CASE(f32)(arg0, arg1, out, broadcast_spec);
             break;
             TYPE_CASE(f64)(arg0, arg1, out, broadcast_spec);
diff --git a/ngraph/src/ngraph/type/float16.cpp b/ngraph/src/ngraph/type/float16.cpp
index acca521b36da6e..8d6f748c0b2c25 100644
--- a/ngraph/src/ngraph/type/float16.cpp
+++ b/ngraph/src/ngraph/type/float16.cpp
@@ -138,38 +138,6 @@ size_t float16::size() const
     return sizeof(m_value);
 }
 
-bool float16::operator==(const float16& other) const
-{
-#if defined(__GNUC__)
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wfloat-equal"
-#endif
-    return (static_cast<float>(*this) == static_cast<float>(other));
-#if defined(__GNUC__)
-#pragma GCC diagnostic pop
-#endif
-}
-
-bool float16::operator<(const float16& other) const
-{
-    return (static_cast<float>(*this) < static_cast<float>(other));
-}
-
-bool float16::operator<=(const float16& other) const
-{
-    return (static_cast<float>(*this) <= static_cast<float>(other));
-}
-
-bool float16::operator>(const float16& other) const
-{
-    return (static_cast<float>(*this) > static_cast<float>(other));
-}
-
-bool float16::operator>=(const float16& other) const
-{
-    return (static_cast<float>(*this) >= static_cast<float>(other));
-}
-
 float16::operator float() const
 {
     union {
diff --git a/ngraph/src/ngraph/type/float16.hpp b/ngraph/src/ngraph/type/float16.hpp
index 9028e30a4558f0..12d69574d73516 100644
--- a/ngraph/src/ngraph/type/float16.hpp
+++ b/ngraph/src/ngraph/type/float16.hpp
@@ -50,12 +50,25 @@ namespace ngraph
 
         std::string to_string() const;
         size_t size() const;
-        bool operator==(const float16& other) const;
-        bool operator!=(const float16& other) const { return !(*this == other); }
-        bool operator<(const float16& other) const;
-        bool operator<=(const float16& other) const;
-        bool operator>(const float16& other) const;
-        bool operator>=(const float16& other) const;
+        // Comparisons convert both operands to float first, so T can be any
+        // type that static_cast<float> accepts (including float16 itself).
+        template<typename T> bool operator==(const T& other) const;
+        template<typename T> bool operator!=(const T& other) const { return !(*this == other); }
+        template<typename T> bool operator<(const T& other) const;
+        template<typename T> bool operator<=(const T& other) const;
+        template<typename T> bool operator>(const T& other) const;
+        template<typename T> bool operator>=(const T& other) const;
+        // Arithmetic is evaluated in float and the result is converted back to
+        // float16. The compound forms (+=, -=, *=, /=) store the result in this
+        // object and return a reference to it.
+        template<typename T> float16 operator+(const T& other) const;
+        template<typename T> float16& operator+=(const T& other);
+        template<typename T> float16 operator-(const T& other) const;
+        template<typename T> float16& operator-=(const T& other);
+        template<typename T> float16 operator*(const T& other) const;
+        template<typename T> float16& operator*=(const T& other);
+        template<typename T> float16 operator/(const T& other) const;
+        template<typename T> float16& operator/=(const T& other);
         operator float() const;
 
         static constexpr float16 from_bits(uint16_t bits) { return float16(bits, true); }
@@ -86,6 +99,108 @@ namespace ngraph
 
         uint16_t m_value;
     };
+
+    // Equality on the float values of the two operands. The exact comparison
+    // would trigger -Wfloat-equal, so that warning is disabled around it on
+    // GCC-compatible compilers.
+    template<typename T>
+    bool float16::operator==(const T& other) const
+    {
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wfloat-equal"
+#endif
+        return (static_cast<float>(*this) == static_cast<float>(other));
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
+    }
+
+    // Less-than on the float values of the two operands.
+    template<typename T>
+    bool float16::operator<(const T& other) const
+    {
+        return (static_cast<float>(*this) < static_cast<float>(other));
+    }
+
+    // Less-than-or-equal on the float values of the two operands.
+    template<typename T>
+    bool float16::operator<=(const T& other) const
+    {
+        return (static_cast<float>(*this) <= static_cast<float>(other));
+    }
+
+    // Greater-than on the float values of the two operands.
+    template<typename T>
+    bool float16::operator>(const T& other) const
+    {
+        return (static_cast<float>(*this) > static_cast<float>(other));
+    }
+
+    // Greater-than-or-equal on the float values of the two operands.
+    template<typename T>
+    bool float16::operator>=(const T& other) const
+    {
+        return (static_cast<float>(*this) >= static_cast<float>(other));
+    }
+
+    // Sum computed in float, returned as a new float16.
+    template<typename T>
+    float16 float16::operator+(const T& other) const
+    {
+        return {static_cast<float>(*this) + static_cast<float>(other)};
+    }
+
+    // Adds other to this object in place. Returns *this by reference (the
+    // conventional form for compound assignment) so no copy is made.
+    template<typename T>
+    float16& float16::operator+=(const T& other)
+    {
+        return *this = *this + other;
+    }
+
+    // Difference computed in float, returned as a new float16.
+    template<typename T>
+    float16 float16::operator-(const T& other) const
+    {
+        return {static_cast<float>(*this) - static_cast<float>(other)};
+    }
+
+    // Subtracts other from this object in place and returns *this by reference.
+    template<typename T>
+    float16& float16::operator-=(const T& other)
+    {
+        return *this = *this - other;
+    }
+
+    // Product computed in float, returned as a new float16.
+    template<typename T>
+    float16 float16::operator*(const T& other) const
+    {
+        return {static_cast<float>(*this) * static_cast<float>(other)};
+    }
+
+    // Multiplies this object by other in place and returns *this by reference.
+    template<typename T>
+    float16& float16::operator*=(const T& other)
+    {
+        return *this = *this * other;
+    }
+
+    // Quotient computed in float, returned as a new float16. The divisor is
+    // not checked for zero; the float division result is converted as-is.
+    template<typename T>
+    float16 float16::operator/(const T& other) const
+    {
+        return {static_cast<float>(*this) / static_cast<float>(other)};
+    }
+
+    // Divides this object by other in place and returns *this by reference.
+    template<typename T>
+    float16& float16::operator/=(const T& other)
+    {
+        return *this = *this / other;
+    }
 }
 
 namespace std
diff --git a/tests/stress_tests/scripts/memcheck_upload.py b/tests/stress_tests/scripts/memcheck_upload.py
index d23bc4d5277c79..0f6474b737036d 100644
--- a/tests/stress_tests/scripts/memcheck_upload.py
+++ b/tests/stress_tests/scripts/memcheck_upload.py
@@ -165,8 +165,6 @@ def query_timeline(records, db_url, db_collection, max_items=20, similarity=TIME
 def create_memcheck_report(records, db_url, db_collection, output_path):
     """ Create memcheck timeline HTML report for records.
     """
-    if db_collection == 'pre_commit':
-        db_collection = 'commit'  # pre-commit jobs building report from past commits
     records.sort(
         key=lambda item: f"{item['status']}{item['device']}{item['model']}{item['test_name']}")
     timelines = query_timeline(records, db_url, db_collection)
@@ -203,7 +201,8 @@ def main():
     parser.add_argument('--db_url', required=not is_dryrun,
                         help='MongoDB URL in a for "mongodb://server:port".')
     parser.add_argument('--db_collection', required=not is_dryrun,
-                        help=f'Collection name in {DATABASE} database to upload')
+                        help=f'Collection name in {DATABASE} database to upload.',
+                        choices=["commit", "nightly", "weekly"])
     parser.add_argument('--artifact_root', required=True,
                         help=f'A root directory to strip from log path before upload.')
     parser.add_argument('--append', help='JSON to append to each item.')