diff --git a/CMakeLists.txt b/CMakeLists.txt
index 92ab0d44..d225580e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.8.0 FATAL_ERROR) # change version also in configure.ac
-project(gpujpeg VERSION 0.21.0 LANGUAGES C)
+project(gpujpeg VERSION 0.21.0 LANGUAGES C CXX)
 
 include(CheckLanguage)
 check_language(CUDA)
@@ -157,7 +157,7 @@ message(STATUS "Configured options: ${COMPILED_OPTIONS}")
 
 # GPUJPEG library
 file(GLOB H_FILES libgpujpeg/*.h ${CMAKE_CURRENT_BINARY_DIR}/libgpujpeg/gpujpeg_version.h)
-file(GLOB_RECURSE C_FILES src/*.c src/*.cu)
+file(GLOB_RECURSE C_FILES src/*.c src/*.cu src/*.cpp)
 list(REMOVE_ITEM C_FILES "${CMAKE_CURRENT_SOURCE_DIR}/src/main.c")
 add_library(gpujpeg ${H_FILES} ${C_FILES})
 target_include_directories(${PROJECT_NAME}
diff --git a/src/gpujpeg_colorspace.h b/src/gpujpeg_colorspace.h
index 40c9464c..c948dbd1 100644
--- a/src/gpujpeg_colorspace.h
+++ b/src/gpujpeg_colorspace.h
@@ -34,6 +34,33 @@
 #include
 #include "../libgpujpeg/gpujpeg_type.h"
 
+// TODO: NEED IMPLEMENTATION
+#ifndef GPUJPEG_USE_CUDA
+#include <cmath>
+#include <cstdint>
+
+#define __device__
+#define round std::round
+
+/// Create uchar4 structure
+struct uchar4 {
+    uint8_t x;
+    uint8_t y;
+    uint8_t z;
+    uint8_t w;
+};
+typedef struct uchar4 uchar4;
+
+/// Create int4 structure (members are signed, matching CUDA's int4)
+struct int4 {
+    int32_t x;
+    int32_t y;
+    int32_t z;
+    int32_t w;
+};
+typedef struct int4 int4;
+#endif
+
 /**
  * Color transform debug info
  */
diff --git a/src/gpujpeg_common.c b/src/gpujpeg_common.c
index 0a883adb..f476051e 100644
--- a/src/gpujpeg_common.c
+++ b/src/gpujpeg_common.c
@@ -466,7 +466,17 @@ gpujpeg_component_print8(struct gpujpeg_component* component, uint8_t* d_data)
     cudaFreeHost(data);
 #else
     // TODO: NEED IMPLEMENTATION
-    printf("[WARNING] gpujpeg_component_print8(): NOT YET IMPLEMENTED\n");
+    data = malloc(data_size * sizeof(uint8_t));
+    memcpy(data, d_data, data_size * sizeof(uint8_t));
+
+    printf("Print Data\n");
+    for ( int y = 0; y < component->data_height; y++ ) {
+        for ( int x = 0; x < component->data_width; x++ ) {
+            printf("%3u ", data[y * component->data_width + x]);
+        }
+        printf("\n");
+    }
+    free(data);
 #endif
 }
 
@@ -490,7 +500,17 @@ gpujpeg_component_print16(struct gpujpeg_component* component, int16_t* d_data)
     cudaFreeHost(data);
 #else
     // TODO: NEED IMPLEMENTATION
-    printf("[WARNING] gpujpeg_component_print16(): NOT YET IMPLEMENTED\n");
+    data = malloc(data_size * sizeof(int16_t));
+    memcpy(data, d_data, data_size * sizeof(int16_t));
+
+    printf("Print Data\n");
+    for ( int y = 0; y < component->data_height; y++ ) {
+        for ( int x = 0; x < component->data_width; x++ ) {
+            printf("%3d ", data[y * component->data_width + x]);
+        }
+        printf("\n");
+    }
+    free(data);
 #endif
 }
 
@@ -1193,7 +1213,34 @@ gpujpeg_coder_deinit(struct gpujpeg_coder* coder)
     cudaFree(coder->d_block_list);
 #else
     // TODO: NEED IMPLEMENTATION
-    printf("[WARNING] gpujpeg_coder_deinit(): NOT YET IMPLEMENTED\n");
+    if (coder->component != NULL)
+        free(coder->component);
+    if (coder->d_component != NULL)
+        free(coder->d_component);
+    if ( coder->data_raw != NULL )
+        free(coder->data_raw);
+    if ( coder->d_data_raw_allocated != NULL )
+        free(coder->d_data_raw_allocated);
+    if ( coder->d_data != NULL )
+        free(coder->d_data);
+    if ( coder->data_quantized != NULL )
+        free(coder->data_quantized);
+    if ( coder->d_data_quantized != NULL )
+        free(coder->d_data_quantized);
+    if ( coder->data_compressed != NULL )
+        free(coder->data_compressed);
+    if ( coder->d_data_compressed != NULL )
+        free(coder->d_data_compressed);
+ if ( coder->segment != NULL ) + free(coder->segment); + if ( coder->d_segment != NULL ) + free(coder->d_segment); + if ( coder->d_temp_huffman != NULL ) + free(coder->d_temp_huffman); + if ( coder->block_list != NULL ) + free(coder->block_list); + if ( coder->d_block_list != NULL ) + free(coder->d_block_list); #endif GPUJPEG_CUSTOM_TIMER_DESTROY(coder->duration_memory_to, return -1); GPUJPEG_CUSTOM_TIMER_DESTROY(coder->duration_memory_from, return -1); @@ -1367,7 +1414,7 @@ gpujpeg_image_destroy(uint8_t* image) cudaFreeHost(image); #else // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_image_destroy(): NOT YET IMPLEMENTED\n"); + free(image); #endif return 0; } diff --git a/src/gpujpeg_decoder.c b/src/gpujpeg_decoder.c index f8aea27a..540a45c6 100644 --- a/src/gpujpeg_decoder.c +++ b/src/gpujpeg_decoder.c @@ -33,11 +33,12 @@ #include "gpujpeg_decoder_internal.h" #include "gpujpeg_huffman_cpu_decoder.h" #include "gpujpeg_util.h" +#include "gpujpeg_postprocessor.h" #ifdef GPUJPEG_USE_CUDA #include "gpujpeg_dct_gpu.h" #include "gpujpeg_huffman_gpu_decoder.h" - #include "gpujpeg_postprocessor.h" + #endif /* Documented at declaration */ @@ -214,14 +215,10 @@ gpujpeg_decoder_init(struct gpujpeg_decoder* decoder, const struct gpujpeg_param } // Init postprocessor -#ifdef GPUJPEG_USE_CUDA if ( gpujpeg_preprocessor_decoder_init(&decoder->coder) != 0 ) { fprintf(stderr, "[GPUJPEG] [Error] Failed to init postprocessor!\n"); return -1; } -#else - // TODO: NOT YET IMPLEMENTED\n -#endif return 0; } @@ -373,7 +370,6 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i } // Preprocessing -#ifdef GPUJPEG_USE_CUDA GPUJPEG_CUSTOM_TIMER_START(coder->duration_preprocessor, coder->param.perf_stats, decoder->stream, return -1); rc = gpujpeg_preprocessor_decode(&decoder->coder, decoder->stream); if (rc != GPUJPEG_NOERR) { @@ -381,11 +377,27 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i } GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_preprocessor, coder->param.perf_stats, decoder->stream, return -1); +#ifdef GPUJPEG_USE_CUDA // Wait for async operations before copying from the device cudaStreamSynchronize(decoder->stream); #else - // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_decoder_decode(): NOT YET IMPLEMENTED\n"); + // // TODO: NEED IMPLEMENTATION + // printf("[WARNING] gpujpeg_decoder_decode(): NOT YET IMPLEMENTED\n"); + // const int height = coder->param_image.height; + // const int width = coder->param_image.width; + // for (int y = 0; y < height; y++) { + // for (int x = 0; x < width; x++) { + // int Y = coder->component[0].d_data[y * width + x]; + // int U = coder->component[1].d_data[y * width + x] - 128; + // int V = coder->component[2].d_data[y * width + x] - 128; + // int R = Y + 1.402 * V; + // int G = Y - 0.344136 * U - 0.714136 * V; + // int B = Y + 1.772 * U; + // coder->d_data_raw[3 * (y * width + x)] = R; + // coder->d_data_raw[3 * (y * width + x) + 1] = G; + // coder->d_data_raw[3 * (y * width + x) + 2] = B; + // } + // } #endif GPUJPEG_CUSTOM_TIMER_STOP(coder->duration_in_gpu, coder->param.perf_stats, decoder->stream, return -1); @@ -518,7 +530,27 @@ gpujpeg_decoder_destroy(struct gpujpeg_decoder* decoder) } #else // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_decoder_destroy(): NOT YET IMPLEMENTED\n"); + for (int comp_type = 0; comp_type < GPUJPEG_MAX_COMPONENT_COUNT; comp_type++) { + if (decoder->table_quantization[comp_type].d_table != NULL) { + 
free(decoder->table_quantization[comp_type].d_table); + } + } + + for ( int comp_type = 0; comp_type < GPUJPEG_MAX_COMPONENT_COUNT; comp_type++ ) { + for ( int huff_type = 0; huff_type < GPUJPEG_HUFFMAN_TYPE_COUNT; huff_type++ ) { + free(decoder->d_table_huffman[comp_type][huff_type]); + } + } + + if (decoder->reader != NULL) { + gpujpeg_reader_destroy(decoder->reader); + } + + // ?? + // if (decoder->huffman_gpu_decoder != NULL) { + // gpujpeg_huffman_gpu_decoder_destroy(decoder->huffman_gpu_decoder); + // } + #endif free(decoder); diff --git a/src/gpujpeg_encoder.c b/src/gpujpeg_encoder.c index 2822ff6a..87a19246 100644 --- a/src/gpujpeg_encoder.c +++ b/src/gpujpeg_encoder.c @@ -35,11 +35,11 @@ #include "gpujpeg_huffman_cpu_encoder.h" #include "gpujpeg_marker.h" #include "gpujpeg_util.h" +#include "gpujpeg_preprocessor.h" #ifdef GPUJPEG_USE_CUDA #include "gpujpeg_dct_gpu.h" #include "gpujpeg_huffman_gpu_encoder.h" - #include "gpujpeg_preprocessor.h" #endif /* Documented at declaration */ @@ -109,7 +109,10 @@ gpujpeg_encoder_create(cudaStream_t stream) gpujpeg_cuda_check_error("Encoder table allocation", return NULL); #else // TODO: NEED IMPLEMENTATION - printf("[WARNING] gpujpeg_encoder_create(): NOT YET IMPLEMENTED\n"); + for ( int comp_type = 0; comp_type < GPUJPEG_COMPONENT_TYPE_COUNT; comp_type++ ) { + encoder->table_quantization[comp_type].d_table = malloc(64 * sizeof(uint16_t)); + encoder->table_quantization[comp_type].d_table_forward = malloc(64 * sizeof(float)); + } #endif // Init huffman tables for encoder diff --git a/src/gpujpeg_postprocessor.cpp b/src/gpujpeg_postprocessor.cpp new file mode 100644 index 00000000..812c03c0 --- /dev/null +++ b/src/gpujpeg_postprocessor.cpp @@ -0,0 +1,528 @@ +/* + * Copyright (c) 2011-2021, CESNET z.s.p.o + * Copyright (c) 2011, Silicon Genome, LLC. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/** + * @file + * @brief + * This file contains postprocessors a common format for computational kernels + * to raw image. It also does color space transformations. 
+ */ + +#include "gpujpeg_colorspace.h" +#include "gpujpeg_preprocessor_common.h" +#include "gpujpeg_postprocessor.h" +#include "gpujpeg_util.h" + +/** + * Store value to component data buffer in specified position by buffer size and subsampling + * + * @param value + * @param position_x + * @param position_y + * @param comp + */ +template< + uint8_t s_samp_factor_h = GPUJPEG_DYNAMIC, + uint8_t s_samp_factor_v = GPUJPEG_DYNAMIC +> +struct gpujpeg_preprocessor_comp_to_raw_load_comp +{ + static __device__ void + perform(uint8_t & value, int position_x, int position_y, struct gpujpeg_preprocessor_data_component & comp) + { + uint8_t samp_factor_h = s_samp_factor_h; + if ( samp_factor_h == GPUJPEG_DYNAMIC ) { + samp_factor_h = comp.sampling_factor.horizontal; + } + uint8_t samp_factor_v = s_samp_factor_v; + if ( samp_factor_v == GPUJPEG_DYNAMIC ) { + samp_factor_v = comp.sampling_factor.vertical; + } + + position_x = position_x / samp_factor_h; + position_y = position_y / samp_factor_v; + + int data_position = position_y * comp.data_width + position_x; + value = comp.d_data[data_position]; + } +}; +template<> +struct gpujpeg_preprocessor_comp_to_raw_load_comp<1, 1> +{ + static __device__ void + perform(uint8_t & value, int position_x, int position_y, struct gpujpeg_preprocessor_data_component & comp) + { + int data_position = position_y * comp.data_width + position_x; + value = comp.d_data[data_position]; + } +}; + +template +struct gpujpeg_preprocessor_comp_to_raw_load { +}; + +template< + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v, + uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v +> +struct gpujpeg_preprocessor_comp_to_raw_load <3, s_comp1_samp_factor_h, s_comp1_samp_factor_v, + s_comp2_samp_factor_h, s_comp2_samp_factor_v, + s_comp3_samp_factor_h, s_comp3_samp_factor_v, + s_comp4_samp_factor_h, s_comp4_samp_factor_v> { + static __device__ void perform(uchar4 & value, int position_x, int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.z, position_x, position_y, data.comp[2]); + } +}; + +template< + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v, + uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v +> +struct gpujpeg_preprocessor_comp_to_raw_load <4, s_comp1_samp_factor_h, s_comp1_samp_factor_v, + s_comp2_samp_factor_h, s_comp2_samp_factor_v, + s_comp3_samp_factor_h, s_comp3_samp_factor_v, + s_comp4_samp_factor_h, s_comp4_samp_factor_v> { + static __device__ void perform(uchar4 & value, int position_x, int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.z, position_x, position_y, data.comp[2]); + gpujpeg_preprocessor_comp_to_raw_load_comp::perform(value.w, position_x, position_y, data.comp[3]); + } +}; + +template +inline __device__ void 
gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r); + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 3; + d_data_raw[image_position + 0] = r.x; + d_data_raw[image_position + 1] = r.y; + d_data_raw[image_position + 2] = r.z; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 4; + d_data_raw[image_position + 0] = r.x; + d_data_raw[image_position + 1] = r.y; + d_data_raw[image_position + 2] = r.z; + d_data_raw[image_position + 3] = r.w; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 4; + d_data_raw[image_position + 0] = r.x; + d_data_raw[image_position + 1] = r.y; + d_data_raw[image_position + 2] = r.z; + d_data_raw[image_position + 3] = 0x0; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; + d_data_raw[image_width * image_height + image_position] = r.y; + d_data_raw[2 * image_width * image_height + image_position] = r.z; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; + if ( (x % 2) == 0 ) { + d_data_raw[image_width * image_height + image_position / 2] = r.y; + d_data_raw[image_width * image_height + image_height * ((image_width + 1) / 2) + image_position / 2] = r.z; + } +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + image_position = image_position * 2; + d_data_raw[image_position + 1] = r.x; + if ( (x % 2) == 0 ) + d_data_raw[image_position + 0] = r.y; + else + d_data_raw[image_position + 0] = r.z; +} + +template<> +inline __device__ void gpujpeg_comp_to_raw_store(uint8_t *d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + d_data_raw[image_position] = r.x; + if ( (image_position % 2) == 0 && (y % 2) == 0 ) { + d_data_raw[image_width * image_height + y / 2 * ((image_width + 1) / 2) + x / 2] = r.y; + d_data_raw[image_width * image_height + ((image_height + 1) / 2 + y / 2) * ((image_width + 1) / 2) + x / 2] = r.z; + } +} + +/** + * Kernel - Copy three separated component buffers into target image data + * + * @param d_c1 First component buffer + * @param d_c2 Second component buffer + * @param d_c3 Third component buffer + * @param d_target Image target data + * @param pixel_count Number of pixels to copy + * @return void + */ +typedef void (*gpujpeg_preprocessor_decode_kernel)(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw, int image_width, int image_height); + +template< + enum gpujpeg_color_space 
color_space_internal, + enum gpujpeg_color_space color_space, + enum gpujpeg_pixel_format pixel_format, + int comp_count, + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v, + uint8_t s_comp4_samp_factor_h, uint8_t s_comp4_samp_factor_v +> +void +gpujpeg_preprocessor_comp_to_raw_kernel(struct gpujpeg_preprocessor_data data, uint8_t* d_data_raw, int image_width, int image_height) +{ + for (int gX = 0; gX < image_width * image_height; ++gX) { + int image_position = gX; + if ( image_position >= (image_width * image_height) ) + return; + int image_position_x = image_position % image_width; + int image_position_y = image_position / image_width; + + // Load + uchar4 r; + gpujpeg_preprocessor_comp_to_raw_load::perform(r, image_position_x, image_position_y, data); + + // Color transform + gpujpeg_color_transform::perform(r); + + // Save + if (pixel_format == GPUJPEG_444_U8_P012A && comp_count == 3) { + r.w = 0xFF; + } + gpujpeg_comp_to_raw_store(d_data_raw, image_width, image_height, image_position, image_position_x, image_position_y, r); + } +} + +/** + * Select preprocessor decode kernel + * + * @param decoder + * @return kernel + */ +template +gpujpeg_preprocessor_decode_kernel +gpujpeg_preprocessor_select_decode_kernel(struct gpujpeg_coder* coder) +{ + gpujpeg_preprocessor_sampling_factor_t sampling_factor = gpujpeg_preprocessor_make_sampling_factor( + coder->sampling_factor.horizontal / coder->component[0].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[0].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[1].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[1].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[2].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[2].sampling_factor.vertical, + coder->param_image.comp_count == 4 ? coder->sampling_factor.horizontal / coder->component[3].sampling_factor.horizontal : 1, + coder->param_image.comp_count == 4 ? coder->sampling_factor.vertical / coder->component[3].sampling_factor.vertical : 1 + ); + +#define RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6, P7, P8) \ + switch ( PIXEL_FORMAT ) { \ + case GPUJPEG_U8: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P012: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P012A: return coder->param_image.comp_count == 4 ? 
&gpujpeg_preprocessor_comp_to_raw_kernel : &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P012Z: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_422_U8_P1020: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_444_U8_P0P1P2: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_422_U8_P0P1P2: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_420_U8_P0P1P2: return &gpujpeg_preprocessor_comp_to_raw_kernel; \ + case GPUJPEG_PIXFMT_NONE: GPUJPEG_ASSERT(0 && "Postprocess to GPUJPEG_PIXFMT_NONE not allowed"); \ + } \ + +#define RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6, P7, P8) \ + if ( sampling_factor == gpujpeg_preprocessor_make_sampling_factor(P1, P2, P3, P4, P5, P6, P7, P8) ) { \ + int max_h = std::max(P1, std::max(P3, P5)); \ + int max_v = std::max(P2, std::max(P4, P6)); \ + if ( coder->param.verbose >= 1 ) { \ + printf("Using faster kernel for postprocessor (precompiled %dx%d, %dx%d, %dx%d).\n", max_h / P1, max_v / P2, max_h / P3, max_v / P4, max_h / P5, max_v / P6); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6, P7, P8) \ + } + +#define RETURN_KERNEL(PIXEL_FORMAT, COLOR) \ + RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 1, 1, 1, 1, 1) /* 4:4:4 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 2, 2, 2, 1, 1) /* 4:2:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 2, 1, 2, 1, 1) /* 4:4:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 1, 2, 1, 1, 1) /* 4:2:2 */ \ + else { \ + if ( coder->param.verbose >= 0 ) { \ + printf("Using slower kernel for postprocessor (dynamic %dx%d, %dx%d, %dx%d).\n", coder->component[0].sampling_factor.horizontal, coder->component[0].sampling_factor.vertical, coder->component[1].sampling_factor.horizontal, coder->component[1].sampling_factor.vertical, coder->component[2].sampling_factor.horizontal, coder->component[2].sampling_factor.vertical); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC) \ + } \ + + // None color space + if ( coder->param_image.color_space == GPUJPEG_NONE ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_NONE) + } + // RGB color space + else if ( coder->param_image.color_space == GPUJPEG_RGB ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_RGB) + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601 ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601) + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601_256LVLS ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601_256LVLS) + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT709 ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT709) + } +#ifndef ENABLE_YUV + // YUV color space + else if ( coder->param_image.color_space == GPUJPEG_YUV ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YUV) + } +#endif + // Unknown color space + else { + assert(false); + } + +#undef RETURN_KERNEL_SWITCH +#undef RETURN_KERNEL_IF +#undef RETURN_KERNEL + + return NULL; +} + +static int gpujpeg_preprocessor_decode_no_transform(struct gpujpeg_coder * coder) +{ + if (coder->param_image.comp_count == 3 && coder->param_image.color_space != coder->param.color_space_internal) { + /*fprintf(stderr, "Decoding JPEG to a planar 
pixel format is supported only when no color transformation is required. " + "JPEG internal color space is set to \"%s\", image is \"%s\".\n", + gpujpeg_color_space_get_name(coder->param.color_space_internal), + gpujpeg_color_space_get_name(coder->param_image.color_space));*/ + return 0; + } + + const int *sampling_factors = gpujpeg_pixel_format_get_sampling_factor(coder->param_image.pixel_format); + for (int i = 0; i < coder->param_image.comp_count; ++i) { + if (coder->component[i].sampling_factor.horizontal != sampling_factors[i * 2] + || coder->component[i].sampling_factor.vertical != sampling_factors[i * 2 + 1]) { + //const char *name = gpujpeg_pixel_format_get_name(coder->param_image.pixel_format); + /*fprintf(stderr, "Decoding JPEG to a planar pixel format cannot change subsampling (%s to %s).\n", + gpujpeg_subsampling_get_name(coder->param_image.comp_count, coder->component), + gpujpeg_pixel_format_get_name(coder->param_image.pixel_format));*/ + return 0; + } + } + return 1; +} + +static int gpujpeg_preprocessor_decode_aligned(struct gpujpeg_coder * coder) +{ + for (int i = 0; i < coder->param_image.comp_count; ++i) { + if (coder->component[i].data_width != coder->component[i].width) { + return 0; + } + } + return 1; +} + +/* Documented at declaration */ +int +gpujpeg_preprocessor_decoder_init(struct gpujpeg_coder* coder) +{ + coder->preprocessor = NULL; + + if (!gpujpeg_pixel_format_is_interleaved(coder->param_image.pixel_format) && + gpujpeg_preprocessor_decode_no_transform(coder) && + gpujpeg_preprocessor_decode_aligned(coder)) { + if ( coder->param.verbose >= 2 ) { + printf("Matching format detected - not using postprocessor, using memcpy instead."); + } + return 0; + } + + if (coder->param_image.comp_count == 1 && gpujpeg_pixel_format_get_comp_count(coder->param_image.pixel_format) > 1) { + coder->param.verbose >= 0 && fprintf(stderr, "[GPUJPEG] [Error] Decoding single component JPEG allowed only to single component output format!\n"); + return -1; + } + + assert(coder->param_image.comp_count == 3 || coder->param_image.comp_count == 4); + + if (coder->param.color_space_internal == GPUJPEG_NONE) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_RGB) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601_256LVLS) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT709) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_decode_kernel(coder); + } + else { + assert(false); + } + if (coder->preprocessor == NULL) { + return -1; + } + return 0; +} + +/** + * Copies raw data GPU memory without running any postprocessor kernel. + * + * This assumes that the JPEG has same color space as input raw image and + * currently also that the component subsampling correspond between raw and + * JPEG (although at least different horizontal subsampling can be quite + * easily done). 
+ *
+ * @invariant gpujpeg_preprocessor_decode_no_transform(coder) != 0
+ */
+static int
+gpujpeg_preprocessor_decoder_copy_planar_data(struct gpujpeg_coder * coder, cudaStream_t stream)
+{
+    assert(coder->param_image.comp_count == 1 ||
+            coder->param_image.comp_count == 3);
+    size_t data_raw_offset = 0;
+    bool needs_stride = false; // true if width is not divisible by MCU width
+    for (int i = 0; i < coder->param_image.comp_count; ++i) {
+        needs_stride = needs_stride || coder->component[i].width != coder->component[i].data_width;
+    }
+    if (!needs_stride) {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            size_t component_size = coder->component[i].width * coder->component[i].height;
+            memcpy(coder->d_data_raw + data_raw_offset, coder->component[i].d_data, component_size);
+            data_raw_offset += component_size;
+        }
+    } else {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            int spitch = coder->component[i].data_width;
+            int dpitch = coder->component[i].width;
+            size_t component_size = spitch * coder->component[i].height;
+
+            // copy row by row, dropping the padding between the aligned rows
+            for (size_t y = 0; y < (size_t) coder->component[i].height; ++y) {
+                size_t dest_offset = data_raw_offset + y * dpitch;
+                size_t src_offset = y * spitch;
+                memcpy(coder->d_data_raw + dest_offset, coder->component[i].d_data + src_offset, coder->component[i].width);
+            }
+
+            data_raw_offset += component_size;
+        }
+    }
+    return 0;
+}
+
+/* Documented at declaration */
+int
+gpujpeg_preprocessor_decode(struct gpujpeg_coder* coder, cudaStream_t stream)
+{
+    if (!coder->preprocessor) {
+        return gpujpeg_preprocessor_decoder_copy_planar_data(coder, stream);
+    }
+
+    // Select kernel
+    gpujpeg_preprocessor_decode_kernel kernel = (gpujpeg_preprocessor_decode_kernel)coder->preprocessor;
+    assert(kernel != NULL);
+
+    int image_width = coder->param_image.width;
+    int image_height = coder->param_image.height;
+
+    // When saving 4:2:2 data of odd width, the data should have even width, so round it
+    if (coder->param_image.pixel_format == GPUJPEG_422_U8_P1020) {
+        image_width = gpujpeg_div_and_round_up(coder->param_image.width, 2) * 2;
+    }
+
+    // Prepare unit size
+    /// @todo this stuff doesn't look correct - we multiply by unitSize and then divide by it
+    int unitSize = gpujpeg_pixel_format_get_unit_size(coder->param_image.pixel_format);
+    if (unitSize == 0) {
+        unitSize = 1;
+    }
+
+    // Prepare kernel
+    // int alignedSize = gpujpeg_div_and_round_up(image_width * image_height, RGB_8BIT_THREADS) * RGB_8BIT_THREADS * unitSize;
+    // dim3 threads (RGB_8BIT_THREADS);
+    // dim3 grid (alignedSize / (RGB_8BIT_THREADS * unitSize));
+    // assert(alignedSize % (RGB_8BIT_THREADS * unitSize) == 0);
+    // if ( grid.x > GPUJPEG_CUDA_MAXIMUM_GRID_SIZE ) {
+    //     grid.y = gpujpeg_div_and_round_up(grid.x, GPUJPEG_CUDA_MAXIMUM_GRID_SIZE);
+    //     grid.x = GPUJPEG_CUDA_MAXIMUM_GRID_SIZE;
+    // }
+
+    // Run kernel
+    struct gpujpeg_preprocessor_data data;
+    for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) {
+        assert(coder->sampling_factor.horizontal % coder->component[comp].sampling_factor.horizontal == 0);
+        assert(coder->sampling_factor.vertical % coder->component[comp].sampling_factor.vertical == 0);
+        data.comp[comp].d_data = coder->component[comp].d_data;
+        data.comp[comp].sampling_factor.horizontal = coder->sampling_factor.horizontal / coder->component[comp].sampling_factor.horizontal;
+        data.comp[comp].sampling_factor.vertical = coder->sampling_factor.vertical / coder->component[comp].sampling_factor.vertical;
+        data.comp[comp].data_width =
coder->component[comp].data_width; + } + kernel( + data, + coder->d_data_raw, + image_width, + image_height + ); + + return 0; +} + +/* vi: set expandtab sw=4: */ diff --git a/src/gpujpeg_preprocessor.cpp b/src/gpujpeg_preprocessor.cpp new file mode 100644 index 00000000..34f9e5b8 --- /dev/null +++ b/src/gpujpeg_preprocessor.cpp @@ -0,0 +1,491 @@ +/* + * Copyright (c) 2011-2020, CESNET z.s.p.o + * Copyright (c) 2011, Silicon Genome, LLC. + * + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ +/** + * @file + * @brief + * This file contains preprocessors from raw image to a common format for + * computational kernels. It also does color space transformations. + */ + +#include "gpujpeg_colorspace.h" +#include "gpujpeg_preprocessor_common.h" +#include "gpujpeg_preprocessor.h" +#include "gpujpeg_util.h" + +/** + * Store value to component data buffer in specified position by buffer size and subsampling + */ +template< + unsigned int s_samp_factor_h, + unsigned int s_samp_factor_v +> +static void +gpujpeg_preprocessor_raw_to_comp_store_comp(uint8_t value, unsigned int position_x, unsigned int position_y, struct gpujpeg_preprocessor_data_component & comp) +{ + const unsigned int samp_factor_h = ( s_samp_factor_h == GPUJPEG_DYNAMIC ) ? comp.sampling_factor.horizontal : s_samp_factor_h; + const unsigned int samp_factor_v = ( s_samp_factor_v == GPUJPEG_DYNAMIC ) ? 
comp.sampling_factor.vertical : s_samp_factor_v; + + if ( (position_x % samp_factor_h) || (position_y % samp_factor_v) ) + return; + + position_x = position_x / samp_factor_h; + position_y = position_y / samp_factor_v; + + const unsigned int data_position = position_y * comp.data_width + position_x; + comp.d_data[data_position] = value; +} + +template< + enum gpujpeg_pixel_format pixel_format, + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v +> +struct gpujpeg_preprocessor_raw_to_comp_store { + static void perform(uchar4 value, unsigned int position_x, unsigned int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_raw_to_comp_store_comp(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.z, position_x, position_y, data.comp[2]); + } +}; + +template< + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v +> +struct gpujpeg_preprocessor_raw_to_comp_store { + static void perform (uchar4 value, unsigned int position_x, unsigned int position_y, struct gpujpeg_preprocessor_data & data) { + gpujpeg_preprocessor_raw_to_comp_store_comp(value.x, position_x, position_y, data.comp[0]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.y, position_x, position_y, data.comp[1]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.z, position_x, position_y, data.comp[2]); + gpujpeg_preprocessor_raw_to_comp_store_comp(value.w, position_x, position_y, data.comp[3]); + } +}; + +template +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r); + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = 128; + r.z = 128; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = d_data_raw[image_width * image_height + image_position]; + r.z = d_data_raw[2 * image_width * image_height + image_position]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = d_data_raw[image_width * image_height + image_position / 2]; + r.z = d_data_raw[image_width * image_height + image_height * ((image_width + 1) / 2) + image_position / 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + r.x = d_data_raw[image_position]; + r.y = d_data_raw[image_width * image_height + y / 2 * ((image_width + 1) / 2) + x / 2]; + r.z = d_data_raw[image_width * image_height + ((image_height + 1) / 2 + y / 2) * ((image_width + 1) / 2) + x / 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 3; + r.x = 
d_data_raw[offset]; + r.y = d_data_raw[offset + 1]; + r.z = d_data_raw[offset + 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 4; + r.x = d_data_raw[offset]; + r.y = d_data_raw[offset + 1]; + r.z = d_data_raw[offset + 2]; + r.w = d_data_raw[offset + 3]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 4; + r.x = d_data_raw[offset]; + r.y = d_data_raw[offset + 1]; + r.z = d_data_raw[offset + 2]; +} + +template<> +inline void raw_to_comp_load(const uint8_t* d_data_raw, int &image_width, int &image_height, int &image_position, int &x, int &y, uchar4 &r) +{ + const unsigned int offset = image_position * 2; + r.x = d_data_raw[offset + 1]; + if ( image_position % 2 == 0 ) { + r.y = d_data_raw[offset]; + r.z = d_data_raw[offset + 2]; + } else { + r.y = d_data_raw[offset - 2]; + r.z = d_data_raw[offset]; + } +} + +/** + * Kernel - Copy raw image source data into three separated component buffers + */ +typedef void (*gpujpeg_preprocessor_encode_kernel)(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, const uint8_t* d_data_raw_end, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift); + +/** + * @note + * In previous versions, there was an optimalization with aligned preloading to shared memory. + * This was, however, removed because it didn't exhibit any performance improvement anymore + * (actually removing that yields slight performance gain). + */ +template< + enum gpujpeg_color_space color_space_internal, + enum gpujpeg_color_space color_space, + enum gpujpeg_pixel_format pixel_format, + uint8_t s_comp1_samp_factor_h, uint8_t s_comp1_samp_factor_v, + uint8_t s_comp2_samp_factor_h, uint8_t s_comp2_samp_factor_v, + uint8_t s_comp3_samp_factor_h, uint8_t s_comp3_samp_factor_v +> +void +gpujpeg_preprocessor_raw_to_comp_kernel(struct gpujpeg_preprocessor_data data, const uint8_t* d_data_raw, const uint8_t* d_data_raw_end, int image_width, int image_height, uint32_t width_div_mul, uint32_t width_div_shift) +{ + for (int gX = 0; gX < image_width * image_height; ++gX) { + int image_position = gX; + int image_position_y = gpujpeg_const_div_divide(image_position, width_div_mul, width_div_shift); + int image_position_x = image_position - (image_position_y * image_width); + + if ( image_position >= (image_width * image_height) ) { + return; + } + + // Load + uchar4 r; + raw_to_comp_load(d_data_raw, image_width, image_height, image_position, image_position_x, image_position_y, r); + + // Color transform + gpujpeg_color_transform::perform(r); + + // Store + gpujpeg_preprocessor_raw_to_comp_store::perform(r, image_position_x, image_position_y, data); + } +} + +/** + * Select preprocessor encode kernel + * + * @param encoder + * @return kernel + */ +template +gpujpeg_preprocessor_encode_kernel +gpujpeg_preprocessor_select_encode_kernel(struct gpujpeg_coder* coder) +{ + gpujpeg_preprocessor_sampling_factor_t sampling_factor = gpujpeg_preprocessor_make_sampling_factor( + coder->sampling_factor.horizontal / coder->component[0].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[0].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[1].sampling_factor.horizontal, + 
coder->sampling_factor.vertical / coder->component[1].sampling_factor.vertical, + coder->sampling_factor.horizontal / coder->component[2].sampling_factor.horizontal, + coder->sampling_factor.vertical / coder->component[2].sampling_factor.vertical, + 0, 0 // it's for comparison only, not kernel selection - use dummy value to accept any val for those components if comp_count < 4 + ); + + /// @todo allow also different susbsampling for 4rd channel than for first + assert(coder->param_image.comp_count != 4 || + (coder->component[0].sampling_factor.horizontal == coder->component[3].sampling_factor.horizontal && + coder->component[0].sampling_factor.vertical == coder->component[3].sampling_factor.vertical)); + +#define RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6) \ + switch ( PIXEL_FORMAT ) { \ + case GPUJPEG_444_U8_P012: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_444_U8_P012A: return coder->param_image.comp_count == 4 ? &gpujpeg_preprocessor_raw_to_comp_kernel : &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_444_U8_P012Z: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_422_U8_P1020: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_444_U8_P0P1P2: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_422_U8_P0P1P2: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_420_U8_P0P1P2: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_U8: return &gpujpeg_preprocessor_raw_to_comp_kernel; \ + case GPUJPEG_PIXFMT_NONE: GPUJPEG_ASSERT(0 && "Preprocess from GPUJPEG_PIXFMT_NONE not allowed" ); \ + } + +#define RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6) \ + if ( sampling_factor == gpujpeg_preprocessor_make_sampling_factor(P1, P2, P3, P4, P5, P6, 0, 0) ) { \ + int max_h = std::max(P1, std::max(P3, P5)); \ + int max_v = std::max(P2, std::max(P4, P6)); \ + if ( coder->param.verbose >= 1 ) { \ + printf("Using faster kernel for preprocessor (precompiled %dx%d, %dx%d, %dx%d).\n", max_h / P1, max_v / P2, max_h / P3, max_v / P4, max_h / P5, max_v / P6); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, P1, P2, P3, P4, P5, P6) \ + } + +#define RETURN_KERNEL(PIXEL_FORMAT, COLOR) \ + RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 1, 1, 1) /* 4:4:4 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 2, 2, 2) /* 4:2:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 1, 2, 1, 2) /* 4:4:0 */ \ + else RETURN_KERNEL_IF(PIXEL_FORMAT, COLOR, 1, 1, 2, 1, 2, 1) /* 4:2:2 */ \ + else { \ + if ( coder->param.verbose >= 0 ) { \ + printf("Using slower kernel for preprocessor (dynamic %dx%d, %dx%d, %dx%d).\n", coder->component[0].sampling_factor.horizontal, coder->component[0].sampling_factor.vertical, coder->component[1].sampling_factor.horizontal, coder->component[1].sampling_factor.vertical, coder->component[2].sampling_factor.horizontal, coder->component[2].sampling_factor.vertical); \ + } \ + RETURN_KERNEL_SWITCH(PIXEL_FORMAT, COLOR, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC, GPUJPEG_DYNAMIC) \ + } \ + + // None color space + if ( coder->param_image.color_space == GPUJPEG_NONE ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_NONE); + } + // RGB color space + else if ( coder->param_image.color_space == GPUJPEG_RGB ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_RGB); + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601 ) { + 
RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601); + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT601_256LVLS ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT601_256LVLS); + } + // YCbCr color space + else if ( coder->param_image.color_space == GPUJPEG_YCBCR_BT709 ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YCBCR_BT709); + } +#ifdef ENABLE_YUV + // YUV color space + else if ( coder->param_image.color_space == GPUJPEG_YUV ) { + RETURN_KERNEL(coder->param_image.pixel_format, GPUJPEG_YUV); + } +#endif + // Unknown color space + else { + assert(false); + } + +#undef RETURN_KERNEL_IF +#undef RETURN_KERNEL + + return NULL; +} + +static int gpujpeg_preprocessor_encode_no_transform(struct gpujpeg_coder * coder) +{ + if (gpujpeg_pixel_format_is_interleaved(coder->param_image.pixel_format)) { + return 0; + } + + if (coder->param_image.comp_count == 3 && coder->param_image.color_space != coder->param.color_space_internal) { + return 0; + } + + const int *sampling_factors = gpujpeg_pixel_format_get_sampling_factor(coder->param_image.pixel_format); + for (int i = 0; i < coder->param_image.comp_count; ++i) { + if (coder->component[i].sampling_factor.horizontal != sampling_factors[i * 2] + || coder->component[i].sampling_factor.vertical != sampling_factors[i * 2 + 1]) { + return 0; + } + } + return 1; +} + +/* Documented at declaration */ +int +gpujpeg_preprocessor_encoder_init(struct gpujpeg_coder* coder) +{ + coder->preprocessor = NULL; + + if ( coder->param_image.comp_count == 1 ) { + return 0; + } + + if ( gpujpeg_preprocessor_encode_no_transform(coder) ) { + if ( coder->param.verbose >= 2 ) { + printf("Matching format detected - not using preprocessor, using memcpy instead."); + } + return 0; + } + + assert(coder->param_image.comp_count == 3 || coder->param_image.comp_count == 4); + + if (coder->param.color_space_internal == GPUJPEG_NONE) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_RGB) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT601_256LVLS) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + else if (coder->param.color_space_internal == GPUJPEG_YCBCR_BT709) { + coder->preprocessor = (void*)gpujpeg_preprocessor_select_encode_kernel(coder); + } + + if ( coder->preprocessor == NULL ) { + return -1; + } + + return 0; +} + +int +gpujpeg_preprocessor_encode_interlaced(struct gpujpeg_encoder * encoder) +{ + struct gpujpeg_coder* coder = &encoder->coder; + + // Select kernel + gpujpeg_preprocessor_encode_kernel kernel = (gpujpeg_preprocessor_encode_kernel) coder->preprocessor; + assert(kernel != NULL); + + int image_width = coder->param_image.width; + int image_height = coder->param_image.height; + + // When loading 4:2:2 data of odd width, the data in fact has even width, so round it + // (at least imagemagick convert tool generates data stream in this way) + if (coder->param_image.pixel_format == GPUJPEG_422_U8_P1020) { + image_width = (coder->param_image.width + 1) & ~1; + } + + // Prepare unit size + /// @todo this stuff doesn't look correct - we multiply by unitSize and then divide by it + int unitSize = 
gpujpeg_pixel_format_get_unit_size(coder->param_image.pixel_format); + if (unitSize == 0) { + unitSize = 1; + } + + // Prepare kernel + // int alignedSize = gpujpeg_div_and_round_up(image_width * image_height, RGB_8BIT_THREADS) * RGB_8BIT_THREADS * unitSize; + // dim3 threads (RGB_8BIT_THREADS); + // dim3 grid (alignedSize / (RGB_8BIT_THREADS * unitSize)); + // assert(alignedSize % (RGB_8BIT_THREADS * unitSize) == 0); + // while ( grid.x > GPUJPEG_CUDA_MAXIMUM_GRID_SIZE ) { + // grid.y *= 2; + // grid.x = gpujpeg_div_and_round_up(grid.x, 2); + // } + + // Decompose input image width for faster division using multiply-high and right shift + uint32_t width_div_mul, width_div_shift; + gpujpeg_const_div_prepare(image_width, width_div_mul, width_div_shift); + + // Run kernel + struct gpujpeg_preprocessor_data data; + for ( int comp = 0; comp < coder->param_image.comp_count; comp++ ) { + assert(coder->sampling_factor.horizontal % coder->component[comp].sampling_factor.horizontal == 0); + assert(coder->sampling_factor.vertical % coder->component[comp].sampling_factor.vertical == 0); + data.comp[comp].d_data = coder->component[comp].d_data; + data.comp[comp].sampling_factor.horizontal = coder->sampling_factor.horizontal / coder->component[comp].sampling_factor.horizontal; + data.comp[comp].sampling_factor.vertical = coder->sampling_factor.vertical / coder->component[comp].sampling_factor.vertical; + data.comp[comp].data_width = coder->component[comp].data_width; + } + //kernel<<stream>>>( + kernel( + data, + coder->d_data_raw, + coder->d_data_raw + coder->data_raw_size, + image_width, + image_height, + width_div_mul, + width_div_shift + ); + + return 0; +} + +/** + * Copies raw data from source image to GPU memory without running + * any preprocessor kernel. + * + * This assumes that the JPEG has same color space as input raw image and + * currently also that the component subsampling correspond between raw and + * JPEG (although at least different horizontal subsampling can be quite + * easily done). 
+ *
+ * @invariant gpujpeg_preprocessor_encode_no_transform(coder) != 0
+ */
+static int
+gpujpeg_preprocessor_encoder_copy_planar_data(struct gpujpeg_encoder * encoder)
+{
+    struct gpujpeg_coder * coder = &encoder->coder;
+    assert(coder->param_image.comp_count == 1 ||
+            coder->param_image.comp_count == 3);
+
+    size_t data_raw_offset = 0;
+    bool needs_stride = false; // true if width is not divisible by MCU width
+    for (int i = 0; i < coder->param_image.comp_count; ++i) {
+        needs_stride = needs_stride || coder->component[i].width != coder->component[i].data_width;
+    }
+    if (!needs_stride) {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            size_t component_size = coder->component[i].width * coder->component[i].height;
+            memcpy(coder->component[i].d_data, coder->d_data_raw + data_raw_offset, component_size);
+            data_raw_offset += component_size;
+        }
+    } else {
+        for (int i = 0; i < coder->param_image.comp_count; ++i) {
+            int spitch = coder->component[i].width;
+            int dpitch = coder->component[i].data_width;
+            size_t component_size = spitch * coder->component[i].height;
+            // copy row by row into the padded (aligned) component buffer
+            for (size_t y = 0; y < (size_t) coder->component[i].height; ++y) {
+                memcpy((char*)coder->component[i].d_data + y * dpitch, (const char*)coder->d_data_raw + data_raw_offset + y * spitch, spitch);
+            }
+
+            data_raw_offset += component_size;
+        }
+    }
+
+    return 0;
+}
+
+/* Documented at declaration */
+int
+gpujpeg_preprocessor_encode(struct gpujpeg_encoder * encoder)
+{
+    struct gpujpeg_coder * coder = &encoder->coder;
+    if (coder->preprocessor) {
+        return gpujpeg_preprocessor_encode_interlaced(encoder);
+    } else {
+        return gpujpeg_preprocessor_encoder_copy_planar_data(encoder);
+    }
+}
+
+/* vi: set expandtab sw=4: */
diff --git a/src/gpujpeg_preprocessor_common.h b/src/gpujpeg_preprocessor_common.h
index 048f46b7..93e6cbab 100644
--- a/src/gpujpeg_preprocessor_common.h
+++ b/src/gpujpeg_preprocessor_common.h
@@ -102,10 +102,17 @@ gpujpeg_const_div_prepare(const uint32_t d, uint32_t & pre_div_mul, uint32_t & p
 /**
  * Divides unsigned numerator (up to 2^31) by precomputed constant denominator.
  */
+#ifdef GPUJPEG_USE_CUDA
 __device__ static uint32_t
 gpujpeg_const_div_divide(const uint32_t numerator, const uint32_t pre_div_mul, const uint32_t pre_div_shift) {
     return pre_div_mul ? __umulhi(numerator, pre_div_mul) >> pre_div_shift : numerator;
 }
+#else
+static uint32_t
+gpujpeg_const_div_divide(const uint32_t numerator, const uint32_t pre_div_mul, const uint32_t pre_div_shift) {
+    return pre_div_mul ? (uint32_t) (((uint64_t) numerator * pre_div_mul) >> 32) >> pre_div_shift : numerator;
+}
+#endif
 
 /**
  * Compose sampling factor for all components to single type
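
Note on the host-side gpujpeg_const_div_divide() fallback in the last hunk: CUDA's __umulhi() returns the upper 32 bits of the full 64-bit product, so a plain 32-bit multiply would silently drop exactly the bits that the precomputed-constant division relies on; the fallback therefore widens to 64 bits before shifting. Below is a minimal standalone C++ sketch (not part of the patch; the helper name umulhi_host is made up for illustration) showing the equivalence used there.

// umulhi_host(): host stand-in for CUDA's __umulhi() - upper 32 bits of the 64-bit product.
#include <cassert>
#include <cstdint>

static uint32_t umulhi_host(uint32_t a, uint32_t b)
{
    return (uint32_t) (((uint64_t) a * b) >> 32);
}

int main()
{
    assert(umulhi_host(0x80000000u, 4u) == 2u);                   // product 0x200000000 -> high word 2
    assert(umulhi_host(0xFFFFFFFFu, 0xFFFFFFFFu) == 0xFFFFFFFEu); // (2^32-1)^2 -> high word 0xFFFFFFFE
    assert(umulhi_host(123u, 456u) == 0u);                        // small products have an empty high word
    return 0;
}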