diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92ab0d44..d225580e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.8.0 FATAL_ERROR) # change version also in configure.ac
-project(gpujpeg VERSION 0.21.0 LANGUAGES C)
+project(gpujpeg VERSION 0.21.0 LANGUAGES C CXX)
 
 include(CheckLanguage)
 check_language(CUDA)
@@ -157,7 +157,7 @@ message(STATUS "Configured options: ${COMPILED_OPTIONS}")
 
 # GPUJPEG library
 file(GLOB H_FILES libgpujpeg/*.h ${CMAKE_CURRENT_BINARY_DIR}/libgpujpeg/gpujpeg_version.h)
-file(GLOB_RECURSE C_FILES src/*.c src/*.cu)
+file(GLOB_RECURSE C_FILES src/*.c src/*.cu src/*.cpp)
 list(REMOVE_ITEM C_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/main.c")
 add_library(gpujpeg ${H_FILES} ${C_FILES})
 target_include_directories(${PROJECT_NAME}
diff --git a/src/gpujpeg_colorspace.h b/src/gpujpeg_colorspace.h
index 40c9464c..c948dbd1 100644
--- a/src/gpujpeg_colorspace.h
+++ b/src/gpujpeg_colorspace.h
@@ -34,6 +34,33 @@
 #include
 #include "../libgpujpeg/gpujpeg_type.h"
 
+// TODO: NEED IMPLEMENTATION
+#ifndef GPUJPEG_USE_CUDA
+#include <cmath>
+#include <cstdint>
+
+#define __device__
+#define round std::round
+
+/// Create uchar4 structure
+struct uchar4 {
+    uint8_t x;
+    uint8_t y;
+    uint8_t z;
+    uint8_t w;
+};
+typedef struct uchar4 uchar4;
+
+/// Create int4 structure (members are signed, matching CUDA's int4)
+struct int4 {
+    int32_t x;
+    int32_t y;
+    int32_t z;
+    int32_t w;
+};
+typedef struct int4 int4;
+#endif
+
 /**
  * Color transform debug info
  */
diff --git a/src/gpujpeg_common.c b/src/gpujpeg_common.c
index 0a883adb..f476051e 100644
--- a/src/gpujpeg_common.c
+++ b/src/gpujpeg_common.c
@@ -466,7 +466,17 @@ gpujpeg_component_print8(struct gpujpeg_component* component, uint8_t* d_data)
     cudaFreeHost(data);
 #else
     // TODO: NEED IMPLEMENTATION
-    printf("[WARNING] gpujpeg_component_print8(): NOT YET IMPLEMENTED\n");
+    data = malloc(data_size * sizeof(uint8_t));
+    memcpy(data, d_data, data_size * sizeof(uint8_t));
+
+    printf("Print Data\n");
+    for ( int y = 0; y < component->data_height; y++ ) {
+        for ( int x = 0; x < component->data_width; x++ ) {
+            printf("%3u ", data[y * component->data_width + x]);
+        }
+        printf("\n");
+    }
+    free(data);
 #endif
 }
 
@@ -490,7 +500,17 @@ gpujpeg_component_print16(struct gpujpeg_component* component, int16_t* d_data)
     cudaFreeHost(data);
 #else
     // TODO: NEED IMPLEMENTATION
-    printf("[WARNING] gpujpeg_component_print16(): NOT YET IMPLEMENTED\n");
+    data = malloc(data_size * sizeof(int16_t));
+    memcpy(data, d_data, data_size * sizeof(int16_t));
+
+    printf("Print Data\n");
+    for ( int y = 0; y < component->data_height; y++ ) {
+        for ( int x = 0; x < component->data_width; x++ ) {
+            printf("%3d ", data[y * component->data_width + x]);
+        }
+        printf("\n");
+    }
+    free(data);
 #endif
 }
 
@@ -1193,7 +1213,34 @@ gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
     cudaFree(coder->d_block_list);
 #else
     // TODO: NEED IMPLEMENTATION
-    printf("[WARNING] gpujpeg_coder_deinit(): NOT YET IMPLEMENTED\n");
+    if (coder->component != NULL)
+        free(coder->component);
+    if (coder->d_component != NULL)
+        free(coder->d_component);
+    if ( coder->data_raw != NULL )
+        free(coder->data_raw);
+    if ( coder->d_data_raw_allocated != NULL )
+        free(coder->d_data_raw_allocated);
+    if ( coder->d_data != NULL )
+        free(coder->d_data);
+    if ( coder->data_quantized != NULL )
+        free(coder->data_quantized);
+    if ( coder->d_data_quantized != NULL )
+        free(coder->d_data_quantized);
+    if ( coder->data_compressed != NULL )
+        free(coder->data_compressed);
+    if ( coder->d_data_compressed != NULL )
+        free(coder->d_data_compressed);
+ if ( coder->segment != NULL ) + free(coder->segment); + if ( coder->d_segment != NULL ) + free(coder->d_segment); + if ( coder->d_temp_huffman != NULL ) + free(coder->d_temp_huffman); + if ( coder->block_list != NULL ) + free(coder->block_list); + if ( coder->d_block_list != NULL ) + free(coder->d_block_list); #endif GPUJPEG_CUSTOM_TIMER_DESTROY(coder->duration_memory_to, return -1); GPUJPEG_CUSTOM_TIMER_DESTROY(coder->duration_memory_from, return -1); @@ -1367,7 +1414,7 @@ gpujpeg_image_destroy(uint8_t* image) cudaFreeHost(image); #else // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_image_destroy(): NOT YET IMPLEMENTED\n"); + free(image); #endif return 0; } diff --git a/src/gpujpeg_decoder.c b/src/gpujpeg_decoder.c index f8aea27a..540a45c6 100644 --- a/src/gpujpeg_decoder.c +++ b/src/gpujpeg_decoder.c @@ -33,11 +33,12 @@ #include "gpujpeg_decoder_internal.h" #include "gpujpeg_huffman_cpu_decoder.h" #include "gpujpeg_util.h" +#include "gpujpeg_postprocessor.h" #ifdef GPUJPEG_USE_CUDA #include "gpujpeg_dct_gpu.h" #include "gpujpeg_huffman_gpu_decoder.h" - #include "gpujpeg_postprocessor.h" + #endif /* Documented at declaration */ @@ -214,14 +215,10 @@ gpujpeg_decoder_init(struct gpujpeg_decoder* decoder, const struct gpujpeg_param } // Init postprocessor -#ifdef GPUJPEG_USE_CUDA if ( gpujpeg_preprocessor_decoder_init(&decoder->coder) != 0 ) { fprintf(stderr, "[GPUJPEG] [Error] Failed to init postprocessor!\n"); return -1; } -#else - // TODO: NOT YET IMPLEMENTED\n -#endif return 0; } @@ -373,7 +370,6 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i } // Preprocessing -#ifdef GPUJPEG_USE_CUDA GPUJPEG_CUSTOM_TIMER_START(coder->duration_preprocessor, coder->param.perf_stats, decoder->stream, return -1); rc = gpujpeg_preprocessor_decode(&decoder->coder, decoder->stream); if (rc != GPUJPEG_NOERR) { @@ -381,11 +377,27 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i } GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_preprocessor, coder->param.perf_stats, decoder->stream, return -1); +#ifdef GPUJPEG_USE_CUDA // Wait for async operations before copying from the device cudaStreamSynchronize(decoder->stream); #else - // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_decoder_decode(): NOT YET IMPLEMENTED\n"); + // // TODO: NEED IMPLEMENTATION + // printf("[WARNING] gpujpeg_decoder_decode(): NOT YET IMPLEMENTED\n"); + // const int height = coder->param_image.height; + // const int width = coder->param_image.width; + // for (int y = 0; y < height; y++) { + // for (int x = 0; x < width; x++) { + // int Y = coder->component[0].d_data[y * width + x]; + // int U = coder->component[1].d_data[y * width + x] - 128; + // int V = coder->component[2].d_data[y * width + x] - 128; + // int R = Y + 1.402 * V; + // int G = Y - 0.344136 * U - 0.714136 * V; + // int B = Y + 1.772 * U; + // coder->d_data_raw[3 * (y * width + x)] = R; + // coder->d_data_raw[3 * (y * width + x) + 1] = G; + // coder->d_data_raw[3 * (y * width + x) + 2] = B; + // } + // } #endif GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_in_gpu, coder->param.perf_stats, decoder->stream, return -1); @@ -518,7 +530,27 @@ gpujpeg_decoder_destroy(struct gpujpeg_decoder* decoder) } #else // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_decoder_destroy(): NOT YET IMPLEMENTED\n"); + for (int comp_type = 0; comp_type < GPUJPEG_MAX_COMPONENT_COUNT; comp_type++) { + if (decoder->table_quantization[comp_type].d_table != NULL) { + 
free(decoder->table_quantization[comp_type].d_table); + } + } + + for ( int comp_type = 0; comp_type < GPUJPEG_MAX_COMPONENT_COUNT; comp_type++ ) { + for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) { + free(decoder->d_table_huffman[comp_type][huff_type]); + } + } + + if (decoder->reader != NULL) { + gpujpeg_reader_destroy(decoder->reader); + } + + // ?? + // if (decoder->huffman_gpu_decoder != NULL) { + // gpujpeg_huffman_gpu_decoder_destroy(decoder->huffman_gpu_decoder); + // } + #endif free(decoder); diff --git a/src/gpujpeg_encoder.c b/src/gpujpeg_encoder.c index 2822ff6a..87a19246 100644 --- a/src/gpujpeg_encoder.c +++ b/src/gpujpeg_encoder.c @@ -35,11 +35,11 @@ #include "gpujpeg_huffman_cpu_encoder.h" #include "gpujpeg_marker.h" #include "gpujpeg_util.h" +#include "gpujpeg_preprocessor.h" #ifdef GPUJPEG_USE_CUDA #include "gpujpeg_dct_gpu.h" #include "gpujpeg_huffman_gpu_encoder.h" - #include "gpujpeg_preprocessor.h" #endif /* Documented at declaration */ @@ -109,7 +109,10 @@ gpujpeg_encoder_create(cudaStream_t stream) gpujpeg_cuda_check_error("Encoder table allocation", return NULL); #else // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_encoder_create(): NOT YET IMPLEMENTED\n"); + for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) { + encoder->table_quantization[comp_type].d_table = malloc(64 * sizeof(uint16_t)); + encoder->table_quantization[comp_type].d_table_forward = malloc(64 * sizeof(float)); + } #endif // Init huffman tables for encoder diff --git a/src/gpujpeg_postprocessor.cpp b/src/gpujpeg_postprocessor.cpp new file mode 100644 index 00000000..812c03c0 --- /dev/null +++ b/src/gpujpeg_postprocessor.cpp @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2011-2021, CESNET z.s.p.o + * Copyright (c) 2011, Silicon Genome, LLC. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/** + * @file + * @brief + * This file contains postprocessors a common format for computational kernels + * to raw image. It also does color space transformations. 
+ */ + +#include "gpujpeg_colorspace.h" +#include "gpujpeg_preprocessor_common.h" +#include "gpujpeg_postprocessor.h" +#include "gpujpeg_util.h" + +/** + * Store value to component data buffer in specified position by buffer size and subsampling + * + * @param value + * @param position_x + * @param position_y + * @param comp + */ +template< + uint8_t s_samp_factor_h = GPUJPEG_DYNAMIC, + uint8_t s_samp_factor_v = GPUJPEG_DYNAMIC +> +struct gpujpeg_preprocessor_comp_to_raw_load_comp +{ + static __device__ void + perform(uint8_t & value, int position_x, int position_y, struct gpujpeg_preprocessor_data_component & comp) + { + uint8_t samp_factor_h = s_samp_factor_h; + if ( samp_factor_h == GPUJPEG_DYNAMIC ) { + samp_factor_h = comp.sampling_factor.horizontal; + } + uint8_t samp_factor_v = s_samp_factor_v; + if ( samp_factor_v == GPUJPEG_DYNAMIC ) { + samp_factor_v = comp.sampling_factor.vertical; + } + + position_x = position_x / samp_factor_h; + position_y = position_y / samp_factor_v; + + int data_position = position_y * comp.data_width + position_x; + value = comp.d_data[data_position]; + } +}; +template<> +struct gpujpeg_preprocessor_comp_to_raw_load_comp<1, 1> +{ + static __device__ void + perform(uint8_t & value, int position_x, int position_y, struct gpujpeg_preprocessor_data_component & comp) + { + int data_position = position_y * comp.data_width + position_x; + value = comp.d_data[data_position]; + } +}; + +template +struct gpujpeg_preprocessor_comp_to_raw_load { +}; + +template< + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v, + uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v +> +struct gpujpeg_preprocessor_comp_to_raw_load <3, s_comp1_samp_factor_h, s_comp1_samp_factor_v, + s_comp2_samp_factor_h, s_comp2_samp_factor_v, + s_comp3_samp_factor_h, s_comp3_samp_factor_v, + s_comp4_samp_factor_h, s_comp4_samp_factor_v> { + static __device__ void perform(uchar4 & value, int position_x, int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.z, position_x, position_y, data.comp[2]); + } +}; + +template< + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v, + uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v +> +struct gpujpeg_preprocessor_comp_to_raw_load <4, s_comp1_samp_factor_h, s_comp1_samp_factor_v, + s_comp2_samp_factor_h, s_comp2_samp_factor_v, + s_comp3_samp_factor_h, s_comp3_samp_factor_v, + s_comp4_samp_factor_h, s_comp4_samp_factor_v> { + static __device__ void perform(uchar4 & value, int position_x, int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.z, position_x, position_y, data.comp[2]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.w, position_x, position_y, data.comp[3]); + } +}; + +template +inline __device__ void 
gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r); + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 3; + d_data_raw[image_position + 0] = r.x; + d_data_raw[image_position + 1] = r.y; + d_data_raw[image_position + 2] = r.z; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 4; + d_data_raw[image_position + 0] = r.x; + d_data_raw[image_position + 1] = r.y; + d_data_raw[image_position + 2] = r.z; + d_data_raw[image_position + 3] = r.w; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 4; + d_data_raw[image_position + 0] = r.x; + d_data_raw[image_position + 1] = r.y; + d_data_raw[image_position + 2] = r.z; + d_data_raw[image_position + 3] = 0x0; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; + d_data_raw[image_width * image_height + image_position] = r.y; + d_data_raw[2 * image_width * image_height + image_position] = r.z; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; + if ( (x % 2) == 0 ) { + d_data_raw[image_width * image_height + image_position / 2] = r.y; + d_data_raw[image_width * image_height + image_height * ((image_width + 1) / 2) + image_position / 2] = r.z; + } +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 2; + d_data_raw[image_position + 1] = r.x; + if ( (x % 2) == 0 ) + d_data_raw[image_position + 0] = r.y; + else + d_data_raw[image_position + 0] = r.z; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; + if ( (image_position % 2) == 0 && (y % 2) == 0 ) { + d_data_raw[image_width * image_height + y / 2 * ((image_width + 1) / 2) + x / 2] = r.y; + d_data_raw[image_width * image_height + ((image_height + 1) / 2 + y / 2) * ((image_width + 1) / 2) + x / 2] = r.z; + } +} + +/** + * Kernel - Copy three separated component buffers into target image data + * + * @param d_c1 First component buffer + * @param d_c2 Second component buffer + * @param d_c3 Third component buffer + * @param d_target Image target data + * @param pixel_count Number of pixels to copy + * @return void + */ +typedef void (*gpujpeg_preprocessor_decode_kernel)(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw, int image_width, int image_height); + +template< + enum gpujpeg_color_space 
color_space_internal, + enum gpujpeg_color_space color_space, + enum gpujpeg_pixel_format pixel_format, + int comp_count, + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v, + uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v +> +void +gpujpeg_preprocessor_comp_to_raw_kernel(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw, int image_width, int image_height) +{ + for (int gX = 0; gX < image_width * image_height; ++gX) { + int image_position = gX; + if ( image_position >= (image_width * image_height) ) + return; + int image_position_x = image_position % image_width; + int image_position_y = image_position / image_width; + + // Load + uchar4 r; + gpujpeg_preprocessor_comp_to_raw_load::perform(r, image_position_x, image_position_y, data); + + // Color transform + gpujpeg_color_transform::perform(r); + + // Save + if (pixel_format == GPUJPEG_444_U8_P012A && comp_count == 3) { + r.w = 0xFF; + } + gpujpeg_comp_to_raw_store(d_data_raw, image_width, image_height, image_position, image_position_x, image_position_y, r); + } +} + +/** + * Select preprocessor decode kernel + * + * @param decoder + * @return kernel + */ +template +gpujpeg_preprocessor_decode_kernel +gpujpeg_preprocessor_select_decode_kernel(struct gpujpeg_coder* coder) +{ + gpujpeg_preprocessor_sampling_factor_t sampling_factor = gpujpeg_preprocessor_make_sampling_factor( + coder->sampling_factor.horizontal / coder->component[0].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[0].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[1].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[1].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[2].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[2].sampling_factor.vertical, + coder->param_image.comp_count == 4 ? coder->sampling_factor.horizontal / coder->component[3].sampling_factor.horizontal : 1, + coder->param_image.comp_count == 4 ? coder->sampling_factor.vertical / coder->component[3].sampling_factor.vertical : 1 + ); + +#define RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6, P7, P8) \ + switch ( PIXEL_FORMAT ) { \ + case GPUJPEG_U8: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P012: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P012A: return coder->param_image.comp_count == 4 ? 
&gpujpeg_preprocessor_comp_to_raw_kernel : &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P012Z: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_422_U8_P1020: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P0P1P2: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_422_U8_P0P1P2: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_420_U8_P0P1P2: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_PIXFMT_NONE: GPUJPEG_ASSERT(0 && "Postprocess to GPUJPEG_PIXFMT_NONE not allowed"); \ + } \ + +#define RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6, P7, P8) \ + if ( sampling_factor == gpujpeg_preprocessor_make_sampling_factor(P1, P2, P3, P4, P5, P6, P7, P8) ) { \ + int max_h = std::max(P1, std::max(P3, P5)); \ + int max_v = std::max(P2, std::max(P4, P6)); \ + if ( coder->param.verbose >= 1 ) { \ + printf("Using faster kernel for postprocessor (precompiled %dx%d, %dx%d, %dx%d).\n", max_h / P1, max_v / P2, max_h / P3, max_v / P4, max_h / P5, max_v / P6); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6, P7, P8) \ + } + +#define RETURN_KERNEL(PIXEL_FORMAT, COLOR) \ + RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 1, 1, 1, 1, 1) /* 4:4:4 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 2, 2, 2, 1, 1) /* 4:2:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 2, 1, 2, 1, 1) /* 4:4:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 1, 2, 1, 1, 1) /* 4:2:2 */ \ + else { \ + if ( coder->param.verbose >= 0 ) { \ + printf("Using slower kernel for postprocessor (dynamic %dx%d, %dx%d, %dx%d).\n", coder->component[0].sampling_factor.horizontal, coder->component[0].sampling_factor.vertical, coder->component[1].sampling_factor.horizontal, coder->component[1].sampling_factor.vertical, coder->component[2].sampling_factor.horizontal, coder->component[2].sampling_factor.vertical); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC) \ + } \ + + // None color space + if ( coder->param_image.color_space == GPUJPEG_NONE ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_NONE) + } + // RGB color space + else if ( coder->param_image.color_space == GPUJPEG_RGB ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_RGB) + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601 ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601) + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601_256LVLS ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601_256LVLS) + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT709 ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT709) + } +#ifndef ENABLE_YUV + // YUV color space + else if ( coder->param_image.color_space == GPUJPEG_YUV ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YUV) + } +#endif + // Unknown color space + else { + assert(false); + } + +#undef RETURN_KERNEL_SWITCH +#undef RETURN_KERNEL_IF +#undef RETURN_KERNEL + + return NULL; +} + +static int gpujpeg_preprocessor_decode_no_transform(struct gpujpeg_coder * coder) +{ + if (coder->param_image.comp_count == 3 && coder->param_image.color_space != coder->param.color_space_internal) { + /*fprintf(stderr, "Decoding JPEG to a planar 
pixel format is supported only when no color transformation is required. " + "JPEG internal color space is set to \"%s\", image is \"%s\".\n", + gpujpeg_color_space_get_name(coder->param.color_space_internal), + gpujpeg_color_space_get_name(coder->param_image.color_space));*/ + return 0; + } + + const int *sampling_factors = gpujpeg_pixel_format_get_sampling_factor(coder->param_image.pixel_format); + for (int i = 0; i < coder->param_image.comp_count; ++i) { + if (coder->component[i].sampling_factor.horizontal != sampling_factors[i * 2] + || coder->component[i].sampling_factor.vertical != sampling_factors[i * 2 + 1]) { + //const char *name = gpujpeg_pixel_format_get_name(coder->param_image.pixel_format); + /*fprintf(stderr, "Decoding JPEG to a planar pixel format cannot change subsampling (%s to %s).\n", + gpujpeg_subsampling_get_name(coder->param_image.comp_count, coder->component), + gpujpeg_pixel_format_get_name(coder->param_image.pixel_format));*/ + return 0; + } + } + return 1; +} + +static int gpujpeg_preprocessor_decode_aligned(struct gpujpeg_coder * coder) +{ + for (int i = 0; i < coder->param_image.comp_count; ++i) { + if (coder->component[i].data_width != coder->component[i].width) { + return 0; + } + } + return 1; +} + +/* Documented at declaration */ +int +gpujpeg_preprocessor_decoder_init(struct gpujpeg_coder* coder) +{ + coder->preprocessor = NULL; + + if (!gpujpeg_pixel_format_is_interleaved(coder->param_image.pixel_format) && + gpujpeg_preprocessor_decode_no_transform(coder) && + gpujpeg_preprocessor_decode_aligned(coder)) { + if ( coder->param.verbose >= 2 ) { + printf("Matching format detected - not using postprocessor, using memcpy instead."); + } + return 0; + } + + if (coder->param_image.comp_count == 1 && gpujpeg_pixel_format_get_comp_count(coder->param_image.pixel_format) > 1) { + coder->param.verbose >= 0 && fprintf(stderr, "[GPUJPEG] [Error] Decoding single component JPEG allowed only to single component output format!\n"); + return -1; + } + + assert(coder->param_image.comp_count == 3 || coder->param_image.comp_count == 4); + + if (coder->param.color_space_internal == GPUJPEG_NONE) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_RGB) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601_256LVLS) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT709) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else { + assert(false); + } + if (coder->preprocessor == NULL) { + return -1; + } + return 0; +} + +/** + * Copies raw data GPU memory without running any postprocessor kernel. + * + * This assumes that the JPEG has same color space as input raw image and + * currently also that the component subsampling correspond between raw and + * JPEG (although at least different horizontal subsampling can be quite + * easily done). 
+ *
+ * @invariant gpujpeg_preprocessor_decode_no_transform(coder) != 0
+ */
+static int
+gpujpeg_preprocessor_decoder_copy_planar_data(struct gpujpeg_coder * coder, cudaStream_t stream)
+{
+    assert(coder->param_image.comp_count == 1 ||
+            coder->param_image.comp_count == 3);
+    size_t data_raw_offset = 0;
+    bool needs_stride = false; // true if width is not divisible by MCU width
+    for (int i = 0; i < coder->param_image.comp_count; ++i) {
+        needs_stride = needs_stride || coder->component[i].width != coder->component[i].data_width;
+    }
+    if (!needs_stride) {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            size_t component_size = coder->component[i].width * coder->component[i].height;
+            memcpy(coder->d_data_raw + data_raw_offset, coder->component[i].d_data, component_size);
+            data_raw_offset += component_size;
+        }
+    } else {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            int spitch = coder->component[i].data_width;
+            int dpitch = coder->component[i].width;
+            size_t component_size = spitch * coder->component[i].height;
+
+            // copy row by row, dropping the padding between the aligned rows
+            for (size_t y = 0; y < (size_t) coder->component[i].height; ++y) {
+                size_t dest_offset = data_raw_offset + y * dpitch;
+                size_t src_offset = y * spitch;
+                memcpy(coder->d_data_raw + dest_offset, coder->component[i].d_data + src_offset, coder->component[i].width);
+            }
+
+            data_raw_offset += component_size;
+        }
+    }
+    return 0;
+}
+
+/* Documented at declaration */
+int
+gpujpeg_preprocessor_decode(struct gpujpeg_coder* coder, cudaStream_t stream)
+{
+    if (!coder->preprocessor) {
+        return gpujpeg_preprocessor_decoder_copy_planar_data(coder, stream);
+    }
+
+    // Select kernel
+    gpujpeg_preprocessor_decode_kernel kernel = (gpujpeg_preprocessor_decode_kernel)coder->preprocessor;
+    assert(kernel != NULL);
+
+    int image_width = coder->param_image.width;
+    int image_height = coder->param_image.height;
+
+    // When saving 4:2:2 data of odd width, the data should have even width, so round it
+    if (coder->param_image.pixel_format == GPUJPEG_422_U8_P1020) {
+        image_width = gpujpeg_div_and_round_up(coder->param_image.width, 2) * 2;
+    }
+
+    // Prepare unit size
+    /// @todo this stuff doesn't look correct - we multiply by unitSize and then divide by it
+    int unitSize = gpujpeg_pixel_format_get_unit_size(coder->param_image.pixel_format);
+    if (unitSize == 0) {
+        unitSize = 1;
+    }
+
+    // Prepare kernel
+    // int alignedSize = gpujpeg_div_and_round_up(image_width * image_height, RGB_8BIT_THREADS) * RGB_8BIT_THREADS * unitSize;
+    // dim3 threads (RGB_8BIT_THREADS);
+    // dim3 grid (alignedSize / (RGB_8BIT_THREADS * unitSize));
+    // assert(alignedSize % (RGB_8BIT_THREADS * unitSize) == 0);
+    // if ( grid.x > GPUJPEG_CUDA_MAXIMUM_GRID_SIZE ) {
+    //     grid.y = gpujpeg_div_and_round_up(grid.x, GPUJPEG_CUDA_MAXIMUM_GRID_SIZE);
+    //     grid.x = GPUJPEG_CUDA_MAXIMUM_GRID_SIZE;
+    // }
+
+    // Run kernel
+    struct gpujpeg_preprocessor_data data;
+    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
+        assert(coder->sampling_factor.horizontal % coder->component[comp].sampling_factor.horizontal == 0);
+        assert(coder->sampling_factor.vertical % coder->component[comp].sampling_factor.vertical == 0);
+        data.comp[comp].d_data = coder->component[comp].d_data;
+        data.comp[comp].sampling_factor.horizontal = coder->sampling_factor.horizontal / coder->component[comp].sampling_factor.horizontal;
+        data.comp[comp].sampling_factor.vertical = coder->sampling_factor.vertical / coder->component[comp].sampling_factor.vertical;
+        data.comp[comp].data_width =
coder->component[comp].data_width; + } + kernel( + data, + coder->d_data_raw, + image_width, + image_height + ); + + return 0; +} + +/* vi: set expandtab sw=4: */ diff --git a/src/gpujpeg_preprocessor.cpp b/src/gpujpeg_preprocessor.cpp new file mode 100644 index 00000000..34f9e5b8 --- /dev/null +++ b/src/gpujpeg_preprocessor.cpp @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2011-2020, CESNET z.s.p.o + * Copyright (c) 2011, Silicon Genome, LLC. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/** + * @file + * @brief + * This file contains preprocessors from raw image to a common format for + * computational kernels. It also does color space transformations. + */ + +#include "gpujpeg_colorspace.h" +#include "gpujpeg_preprocessor_common.h" +#include "gpujpeg_preprocessor.h" +#include "gpujpeg_util.h" + +/** + * Store value to component data buffer in specified position by buffer size and subsampling + */ +template< + unsigned int s_samp_factor_h, + unsigned int s_samp_factor_v +> +static void +gpujpeg_preprocessor_raw_to_comp_store_comp(uint8_t value, unsigned int position_x, unsigned int position_y, struct gpujpeg_preprocessor_data_component & comp) +{ + const unsigned int samp_factor_h = ( s_samp_factor_h == GPUJPEG_DYNAMIC ) ? comp.sampling_factor.horizontal : s_samp_factor_h; + const unsigned int samp_factor_v = ( s_samp_factor_v == GPUJPEG_DYNAMIC ) ? 
comp.sampling_factor.vertical : s_samp_factor_v; + + if ( (position_x % samp_factor_h) || (position_y % samp_factor_v) ) + return; + + position_x = position_x / samp_factor_h; + position_y = position_y / samp_factor_v; + + const unsigned int data_position = position_y * comp.data_width + position_x; + comp.d_data[data_position] = value; +} + +template< + enum gpujpeg_pixel_format pixel_format, + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v +> +struct gpujpeg_preprocessor_raw_to_comp_store { + static void perform(uchar4 value, unsigned int position_x, unsigned int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_raw_to_comp_store_comp(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.z, position_x, position_y, data.comp[2]); + } +}; + +template< + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v +> +struct gpujpeg_preprocessor_raw_to_comp_store { + static void perform (uchar4 value, unsigned int position_x, unsigned int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_raw_to_comp_store_comp(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.z, position_x, position_y, data.comp[2]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.w, position_x, position_y, data.comp[3]); + } +}; + +template +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r); + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = 128; + r.z = 128; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = d_data_raw[image_width * image_height + image_position]; + r.z = d_data_raw[2 * image_width * image_height + image_position]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = d_data_raw[image_width * image_height + image_position / 2]; + r.z = d_data_raw[image_width * image_height + image_height * ((image_width + 1) / 2) + image_position / 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = d_data_raw[image_width * image_height + y / 2 * ((image_width + 1) / 2) + x / 2]; + r.z = d_data_raw[image_width * image_height + ((image_height + 1) / 2 + y / 2) * ((image_width + 1) / 2) + x / 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 3; + r.x = 
d_data_raw[offset]; + r.y = d_data_raw[offset + 1]; + r.z = d_data_raw[offset + 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 4; + r.x = d_data_raw[offset]; + r.y = d_data_raw[offset + 1]; + r.z = d_data_raw[offset + 2]; + r.w = d_data_raw[offset + 3]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 4; + r.x = d_data_raw[offset]; + r.y = d_data_raw[offset + 1]; + r.z = d_data_raw[offset + 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 2; + r.x = d_data_raw[offset + 1]; + if ( image_position % 2 == 0 ) { + r.y = d_data_raw[offset]; + r.z = d_data_raw[offset + 2]; + } else { + r.y = d_data_raw[offset - 2]; + r.z = d_data_raw[offset]; + } +} + +/** + * Kernel - Copy raw image source data into three separated component buffers + */ +typedef void (*gpujpeg_preprocessor_encode_kernel)(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, const uint8_t* d_data_raw_end, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift); + +/** + * @note + * In previous versions, there was an optimalization with aligned preloading to shared memory. + * This was, however, removed because it didn't exhibit any performance improvement anymore + * (actually removing that yields slight performance gain). + */ +template< + enum gpujpeg_color_space color_space_internal, + enum gpujpeg_color_space color_space, + enum gpujpeg_pixel_format pixel_format, + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v +> +void +gpujpeg_preprocessor_raw_to_comp_kernel(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, const uint8_t* d_data_raw_end, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift) +{ + for (int gX = 0; gX < image_width * image_height; ++gX) { + int image_position = gX; + int image_position_y = gpujpeg_const_div_divide(image_position, width_div_mul, width_div_shift); + int image_position_x = image_position - (image_position_y * image_width); + + if ( image_position >= (image_width * image_height) ) { + return; + } + + // Load + uchar4 r; + raw_to_comp_load(d_data_raw, image_width, image_height, image_position, image_position_x, image_position_y, r); + + // Color transform + gpujpeg_color_transform::perform(r); + + // Store + gpujpeg_preprocessor_raw_to_comp_store::perform(r, image_position_x, image_position_y, data); + } +} + +/** + * Select preprocessor encode kernel + * + * @param encoder + * @return kernel + */ +template +gpujpeg_preprocessor_encode_kernel +gpujpeg_preprocessor_select_encode_kernel(struct gpujpeg_coder* coder) +{ + gpujpeg_preprocessor_sampling_factor_t sampling_factor = gpujpeg_preprocessor_make_sampling_factor( + coder->sampling_factor.horizontal / coder->component[0].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[0].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[1].sampling_factor.horizontal, + 
coder->sampling_factor.vertical / coder->component[1].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[2].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[2].sampling_factor.vertical, + 0, 0 // it's for comparison only, not kernel selection - use dummy value to accept any val for those components if comp_count < 4 + ); + + /// @todo allow also different susbsampling for 4rd channel than for first + assert(coder->param_image.comp_count != 4 || + (coder->component[0].sampling_factor.horizontal == coder->component[3].sampling_factor.horizontal && + coder->component[0].sampling_factor.vertical == coder->component[3].sampling_factor.vertical)); + +#define RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6) \ + switch ( PIXEL_FORMAT ) { \ + case GPUJPEG_444_U8_P012: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_444_U8_P012A: return coder->param_image.comp_count == 4 ? &gpujpeg_preprocessor_raw_to_comp_kernel : &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_444_U8_P012Z: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_422_U8_P1020: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_444_U8_P0P1P2: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_422_U8_P0P1P2: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_420_U8_P0P1P2: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_U8: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_PIXFMT_NONE: GPUJPEG_ASSERT(0 && "Preprocess from GPUJPEG_PIXFMT_NONE not allowed" ); \ + } + +#define RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6) \ + if ( sampling_factor == gpujpeg_preprocessor_make_sampling_factor(P1, P2, P3, P4, P5, P6, 0, 0) ) { \ + int max_h = std::max(P1, std::max(P3, P5)); \ + int max_v = std::max(P2, std::max(P4, P6)); \ + if ( coder->param.verbose >= 1 ) { \ + printf("Using faster kernel for preprocessor (precompiled %dx%d, %dx%d, %dx%d).\n", max_h / P1, max_v / P2, max_h / P3, max_v / P4, max_h / P5, max_v / P6); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6) \ + } + +#define RETURN_KERNEL(PIXEL_FORMAT, COLOR) \ + RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 1, 1, 1) /* 4:4:4 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 2, 2, 2) /* 4:2:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 2, 1, 2) /* 4:4:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 1, 2, 1) /* 4:2:2 */ \ + else { \ + if ( coder->param.verbose >= 0 ) { \ + printf("Using slower kernel for preprocessor (dynamic %dx%d, %dx%d, %dx%d).\n", coder->component[0].sampling_factor.horizontal, coder->component[0].sampling_factor.vertical, coder->component[1].sampling_factor.horizontal, coder->component[1].sampling_factor.vertical, coder->component[2].sampling_factor.horizontal, coder->component[2].sampling_factor.vertical); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC) \ + } \ + + // None color space + if ( coder->param_image.color_space == GPUJPEG_NONE ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_NONE); + } + // RGB color space + else if ( coder->param_image.color_space == GPUJPEG_RGB ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_RGB); + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601 ) { + 
RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601); + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601_256LVLS ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601_256LVLS); + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT709 ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT709); + } +#ifdef ENABLE_YUV + // YUV color space + else if ( coder->param_image.color_space == GPUJPEG_YUV ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YUV); + } +#endif + // Unknown color space + else { + assert(false); + } + +#undef RETURN_KERNEL_IF +#undef RETURN_KERNEL + + return NULL; +} + +static int gpujpeg_preprocessor_encode_no_transform(struct gpujpeg_coder * coder) +{ + if (gpujpeg_pixel_format_is_interleaved(coder->param_image.pixel_format)) { + return 0; + } + + if (coder->param_image.comp_count == 3 && coder->param_image.color_space != coder->param.color_space_internal) { + return 0; + } + + const int *sampling_factors = gpujpeg_pixel_format_get_sampling_factor(coder->param_image.pixel_format); + for (int i = 0; i < coder->param_image.comp_count; ++i) { + if (coder->component[i].sampling_factor.horizontal != sampling_factors[i * 2] + || coder->component[i].sampling_factor.vertical != sampling_factors[i * 2 + 1]) { + return 0; + } + } + return 1; +} + +/* Documented at declaration */ +int +gpujpeg_preprocessor_encoder_init(struct gpujpeg_coder* coder) +{ + coder->preprocessor = NULL; + + if ( coder->param_image.comp_count == 1 ) { + return 0; + } + + if ( gpujpeg_preprocessor_encode_no_transform(coder) ) { + if ( coder->param.verbose >= 2 ) { + printf("Matching format detected - not using preprocessor, using memcpy instead."); + } + return 0; + } + + assert(coder->param_image.comp_count == 3 || coder->param_image.comp_count == 4); + + if (coder->param.color_space_internal == GPUJPEG_NONE) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_RGB) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601_256LVLS) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT709) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + + if ( coder->preprocessor == NULL ) { + return -1; + } + + return 0; +} + +int +gpujpeg_preprocessor_encode_interlaced(struct gpujpeg_encoder * encoder) +{ + struct gpujpeg_coder* coder = &encoder->coder; + + // Select kernel + gpujpeg_preprocessor_encode_kernel kernel = (gpujpeg_preprocessor_encode_kernel) coder->preprocessor; + assert(kernel != NULL); + + int image_width = coder->param_image.width; + int image_height = coder->param_image.height; + + // When loading 4:2:2 data of odd width, the data in fact has even width, so round it + // (at least imagemagick convert tool generates data stream in this way) + if (coder->param_image.pixel_format == GPUJPEG_422_U8_P1020) { + image_width = (coder->param_image.width + 1) & ~1; + } + + // Prepare unit size + /// @todo this stuff doesn't look correct - we multiply by unitSize and then divide by it + int unitSize = 
gpujpeg_pixel_format_get_unit_size(coder->param_image.pixel_format); + if (unitSize == 0) { + unitSize = 1; + } + + // Prepare kernel + // int alignedSize = gpujpeg_div_and_round_up(image_width * image_height, RGB_8BIT_THREADS) * RGB_8BIT_THREADS * unitSize; + // dim3 threads (RGB_8BIT_THREADS); + // dim3 grid (alignedSize / (RGB_8BIT_THREADS * unitSize)); + // assert(alignedSize % (RGB_8BIT_THREADS * unitSize) == 0); + // while ( grid.x > GPUJPEG_CUDA_MAXIMUM_GRID_SIZE ) { + // grid.y *= 2; + // grid.x = gpujpeg_div_and_round_up(grid.x, 2); + // } + + // Decompose input image width for faster division using multiply-high and right shift + uint32_t width_div_mul, width_div_shift; + gpujpeg_const_div_prepare(image_width, width_div_mul, width_div_shift); + + // Run kernel + struct gpujpeg_preprocessor_data data; + for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) { + assert(coder->sampling_factor.horizontal % coder->component[comp].sampling_factor.horizontal == 0); + assert(coder->sampling_factor.vertical % coder->component[comp].sampling_factor.vertical == 0); + data.comp[comp].d_data = coder->component[comp].d_data; + data.comp[comp].sampling_factor.horizontal = coder->sampling_factor.horizontal / coder->component[comp].sampling_factor.horizontal; + data.comp[comp].sampling_factor.vertical = coder->sampling_factor.vertical / coder->component[comp].sampling_factor.vertical; + data.comp[comp].data_width = coder->component[comp].data_width; + } + //kernel<<stream>>>( + kernel( + data, + coder->d_data_raw, + coder->d_data_raw + coder->data_raw_size, + image_width, + image_height, + width_div_mul, + width_div_shift + ); + + return 0; +} + +/** + * Copies raw data from source image to GPU memory without running + * any preprocessor kernel. + * + * This assumes that the JPEG has same color space as input raw image and + * currently also that the component subsampling correspond between raw and + * JPEG (although at least different horizontal subsampling can be quite + * easily done). 
+ *
+ * @invariant gpujpeg_preprocessor_encode_no_transform(coder) != 0
+ */
+static int
+gpujpeg_preprocessor_encoder_copy_planar_data(struct gpujpeg_encoder * encoder)
+{
+    struct gpujpeg_coder * coder = &encoder->coder;
+    assert(coder->param_image.comp_count == 1 ||
+            coder->param_image.comp_count == 3);
+
+    size_t data_raw_offset = 0;
+    bool needs_stride = false; // true if width is not divisible by MCU width
+    for (int i = 0; i < coder->param_image.comp_count; ++i) {
+        needs_stride = needs_stride || coder->component[i].width != coder->component[i].data_width;
+    }
+    if (!needs_stride) {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            size_t component_size = coder->component[i].width * coder->component[i].height;
+            memcpy(coder->component[i].d_data, coder->d_data_raw + data_raw_offset, component_size);
+            data_raw_offset += component_size;
+        }
+    } else {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            int spitch = coder->component[i].width;
+            int dpitch = coder->component[i].data_width;
+            size_t component_size = spitch * coder->component[i].height;
+            // copy row by row into the padded (aligned) component buffer
+            for (size_t y = 0; y < (size_t) coder->component[i].height; ++y) {
+                memcpy((char*)coder->component[i].d_data + y * dpitch, (const char*)coder->d_data_raw + data_raw_offset + y * spitch, spitch);
+            }
+
+            data_raw_offset += component_size;
+        }
+    }
+
+    return 0;
+}
+
+/* Documented at declaration */
+int
+gpujpeg_preprocessor_encode(struct gpujpeg_encoder * encoder)
+{
+    struct gpujpeg_coder * coder = &encoder->coder;
+    if (coder->preprocessor) {
+        return gpujpeg_preprocessor_encode_interlaced(encoder);
+    } else {
+        return gpujpeg_preprocessor_encoder_copy_planar_data(encoder);
+    }
+}
+
+/* vi: set expandtab sw=4: */
diff --git a/src/gpujpeg_preprocessor_common.h b/src/gpujpeg_preprocessor_common.h
index 048f46b7..93e6cbab 100644
--- a/src/gpujpeg_preprocessor_common.h
+++ b/src/gpujpeg_preprocessor_common.h
@@ -102,10 +102,17 @@ gpujpeg_const_div_prepare(const uint32_t d, uint32_t & pre_div_mul, uint32_t & p
 /**
  * Divides unsigned numerator (up to 2^31) by precomputed constant denominator.
  */
+#ifdef GPUJPEG_USE_CUDA
 __device__ static uint32_t
 gpujpeg_const_div_divide(const uint32_t numerator, const uint32_t pre_div_mul, const uint32_t pre_div_shift) {
     return pre_div_mul ? __umulhi(numerator, pre_div_mul) >> pre_div_shift : numerator;
 }
+#else
+static uint32_t
+gpujpeg_const_div_divide(const uint32_t numerator, const uint32_t pre_div_mul, const uint32_t pre_div_shift) {
+    return pre_div_mul ? (uint32_t) (((uint64_t) numerator * pre_div_mul) >> 32) >> pre_div_shift : numerator;
+}
+#endif
 
 /**
  * Compose sampling factor for all components to single type
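
Note on the host-side gpujpeg_const_div_divide() fallback in the last hunk: CUDA's __umulhi() returns the upper 32 bits of the full 64-bit product, so a plain 32-bit multiply would silently drop exactly the bits that the precomputed-constant division relies on; the fallback therefore widens to 64 bits before shifting. Below is a minimal standalone C++ sketch (not part of the patch; the helper name umulhi_host is made up for illustration) showing the equivalence used there.

// umulhi_host(): host stand-in for CUDA's __umulhi() - upper 32 bits of the 64-bit product.
#include <cassert>
#include <cstdint>

static uint32_t umulhi_host(uint32_t a, uint32_t b)
{
    return (uint32_t) (((uint64_t) a * b) >> 32);
}

int main()
{
    assert(umulhi_host(0x80000000u, 4u) == 2u);                   // product 0x200000000 -> high word 2
    assert(umulhi_host(0xFFFFFFFFu, 0xFFFFFFFFu) == 0xFFFFFFFEu); // (2^32-1)^2 -> high word 0xFFFFFFFE
    assert(umulhi_host(123u, 456u) == 0u);                        // small products have an empty high word
    return 0;
}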