From e5f954ac54afa5aabd91cd405ab55c63125c9861 Mon Sep 17 00:00:00 2001 From: Chong Kai Xiong Date: Fri, 3 Feb 2023 02:40:06 +0800 Subject: [PATCH 1/4] Core (LV::Video): Clean up blit_overlay_alphasrc(). --- libvisual/libvisual/private/lv_video_blit.cpp | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/libvisual/libvisual/private/lv_video_blit.cpp b/libvisual/libvisual/private/lv_video_blit.cpp index 91494dbd0..a83d02cc0 100644 --- a/libvisual/libvisual/private/lv_video_blit.cpp +++ b/libvisual/libvisual/private/lv_video_blit.cpp @@ -61,30 +61,33 @@ namespace LV { } } - void VideoBlit::blit_overlay_alphasrc (Video* dest, Video* src) + void VideoBlit::blit_overlay_alphasrc (Video* dst, Video* src) { - auto destbuf = static_cast (dest->get_pixels ()); - auto srcbuf = static_cast (src->get_pixels ()); - if (visual_cpu_has_mmx ()) { - blit_overlay_alphasrc_mmx (dest, src); + blit_overlay_alphasrc_mmx (dst, src); return; } + auto dst_pixel_row_ptr = static_cast (dst->get_pixels ()); + auto src_pixel_row_ptr = static_cast (src->get_pixels ()); + for (int y = 0; y < src->m_impl->height; y++) { + auto dst_pixel = dst_pixel_row_ptr; + auto src_pixel = src_pixel_row_ptr; + for (int x = 0; x < src->m_impl->width; x++) { - uint8_t alpha = srcbuf[3]; + uint8_t const src_alpha = src_pixel[3]; - destbuf[0] = (alpha * (srcbuf[0] - destbuf[0]) >> 8) + destbuf[0]; - destbuf[1] = (alpha * (srcbuf[1] - destbuf[1]) >> 8) + destbuf[1]; - destbuf[2] = (alpha * (srcbuf[2] - destbuf[2]) >> 8) + destbuf[2]; + dst_pixel[0] = (src_alpha * (src_pixel[0] - dst_pixel[0]) >> 8) + dst_pixel[0]; + dst_pixel[1] = (src_alpha * (src_pixel[1] - dst_pixel[1]) >> 8) + dst_pixel[1]; + dst_pixel[2] = (src_alpha * (src_pixel[2] - dst_pixel[2]) >> 8) + dst_pixel[2]; - destbuf += dest->m_impl->bpp; - srcbuf += src->m_impl->bpp; + src_pixel += 4; + dst_pixel += 4; } - destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp); - srcbuf += src->m_impl->pitch 
- (src->m_impl->width * src->m_impl->bpp); + dst_pixel_row_ptr += dst->m_impl->pitch; + src_pixel_row_ptr += src->m_impl->pitch; } } From 3a7d77026acc046a7d5dda1fa5fcef2c0d72a6c7 Mon Sep 17 00:00:00 2001 From: Chong Kai Xiong Date: Fri, 3 Feb 2023 05:09:39 +0800 Subject: [PATCH 2/4] Core (LV::Video): Rewrite MMX alpha blending of 32-bit videos using intrinsics (#230). --- .../libvisual/private/lv_video_blit_simd.cpp | 81 +++++++++++-------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/libvisual/libvisual/private/lv_video_blit_simd.cpp b/libvisual/libvisual/private/lv_video_blit_simd.cpp index 7b26f8f59..e43afb476 100644 --- a/libvisual/libvisual/private/lv_video_blit_simd.cpp +++ b/libvisual/libvisual/private/lv_video_blit_simd.cpp @@ -26,49 +26,60 @@ #include "lv_video_blit.hpp" #include "lv_video_private.hpp" #include "lv_common.h" +#include namespace LV { - void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src) + void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src) { #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64) - auto destbuf = static_cast (dest->get_pixels ()); - auto srcbuf = static_cast (src->get_pixels ()); - - for (int i = 0; i < src->m_impl->height; i++) { - for (int j = 0; j < src->m_impl->width; j++) { - __asm __volatile - ("\n\t movd %[spix], %%mm0" - "\n\t movd %[dpix], %%mm1" - "\n\t movq %%mm0, %%mm2" - "\n\t movq %%mm0, %%mm3" - "\n\t psrlq $24, %%mm2" /* The alpha */ - "\n\t movq %%mm0, %%mm4" - "\n\t psrld $24, %%mm3" - "\n\t psrld $24, %%mm4" - "\n\t psllq $32, %%mm2" - "\n\t psllq $16, %%mm3" - "\n\t por %%mm4, %%mm2" - "\n\t punpcklbw %%mm6, %%mm0" /* interleaving dest */ - "\n\t por %%mm3, %%mm2" - "\n\t punpcklbw %%mm6, %%mm1" /* interleaving source */ - "\n\t psubsw %%mm1, %%mm0" /* (src - dest) part */ - "\n\t pmullw %%mm2, %%mm0" /* alpha * (src - dest) */ - "\n\t psrlw $8, %%mm0" /* / 256 */ - "\n\t paddb %%mm1, %%mm0" /* + dest */ - "\n\t packuswb %%mm0, %%mm0" - "\n\t movd 
%%mm0, %[dest]" - : [dest] "=m" (*destbuf) - : [dpix] "m" (*destbuf) - , [spix] "m" (*srcbuf)); - - destbuf += 4; - srcbuf += 4; + auto dst_pixel_row_ptr = static_cast (dst->get_pixels ()); + auto src_pixel_row_ptr = static_cast (src->get_pixels ()); + + for (int y = 0; y < src->m_impl->height; y++) { + auto dst_pixel = reinterpret_cast (dst_pixel_row_ptr); + auto src_pixel = reinterpret_cast (src_pixel_row_ptr); + + for (int x = 0; x < src->m_impl->width; x++) { + // We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers. + // See the pure C implementation in blit_overlay_alphasrc() for the calculation involved. + + // Load source alpha as a 16-bit int. + uint16_t const src_alpha = reinterpret_cast (src_pixel)[3]; + + // Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits. + auto src = _mm_cvtsi32_si64 (*src_pixel); + auto dst = _mm_cvtsi32_si64 (*dst_pixel); + src = _mm_unpacklo_pi8 (src, _mm_setzero_si64 ()); + dst = _mm_unpacklo_pi8 (dst, _mm_setzero_si64 ()); + + // Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2. + auto a1 = _mm_set1_pi16 (src_alpha); + auto a2 = _mm_set1_pi16 (static_cast (255) - src_alpha); + + // Interpolate between source and target. + auto result = _mm_add_pi16 (_mm_mullo_pi16 (src, a1), _mm_mullo_pi16 (dst, a2)); + result = _mm_srli_pi16 (result, 8); + + // Unpack result but keep the target pixel alpha. + // Is there a nicer way to do this?
+ uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result)); + int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00); + + *dst_pixel = int_result; + + dst_pixel++; + src_pixel++; } - destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp); - srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp); + dst_pixel_row_ptr += dst->m_impl->pitch; + src_pixel_row_ptr += src->m_impl->pitch; } + + // FIXME: Some sources said this is not needed for x64 as MMX registers are no longer + // overlaid on FP ones. + _mm_empty (); + #endif /* !VISUAL_ARCH_X86 */ } From 8cbfb5c003e76612ee1437ba0a5b4c9727448162 Mon Sep 17 00:00:00 2001 From: Chong Kai Xiong Date: Thu, 8 Feb 2024 21:04:42 +0800 Subject: [PATCH 3/4] Core (Tests): Add test for LV::VideoBlit::blit_overlay_alphasrc(). --- libvisual/cmake/LVBuildTest.cmake | 1 + libvisual/tests/CMakeLists.txt | 8 ++- libvisual/tests/random.cpp | 32 +++++++++ libvisual/tests/random.hpp | 12 ++++ libvisual/tests/video_test/CMakeLists.txt | 4 ++ .../tests/video_test/video_blit_test.cpp | 66 +++++++++++++++++++ 6 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 libvisual/tests/random.cpp create mode 100644 libvisual/tests/random.hpp create mode 100644 libvisual/tests/video_test/video_blit_test.cpp diff --git a/libvisual/cmake/LVBuildTest.cmake b/libvisual/cmake/LVBuildTest.cmake index 1820b9a45..3680b4aab 100644 --- a/libvisual/cmake/LVBuildTest.cmake +++ b/libvisual/cmake/LVBuildTest.cmake @@ -37,6 +37,7 @@ FUNCTION(LV_BUILD_TEST TEST_NAME) TARGET_LINK_LIBRARIES(${TEST_NAME} PRIVATE + test_common Libvisual::Libvisual Threads::Threads ${PARSED_ARGS_LINK_LIBS} diff --git a/libvisual/tests/CMakeLists.txt b/libvisual/tests/CMakeLists.txt index f7b2556ba..b4ee0f01c 100644 --- a/libvisual/tests/CMakeLists.txt +++ b/libvisual/tests/CMakeLists.txt @@ -1,9 +1,15 @@ INCLUDE(LVBuildTest) -INCLUDE_DIRECTORIES( +ADD_LIBRARY(test_common STATIC + random.cpp +) +
+TARGET_INCLUDE_DIRECTORIES(test_common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} ) +TARGET_LINK_LIBRARIES(test_common PUBLIC libvisual) + ADD_SUBDIRECTORY(audio_test) ADD_SUBDIRECTORY(mem_test) ADD_SUBDIRECTORY(video_test) diff --git a/libvisual/tests/random.cpp b/libvisual/tests/random.cpp new file mode 100644 index 000000000..47e58d064 --- /dev/null +++ b/libvisual/tests/random.cpp @@ -0,0 +1,32 @@ +#include "random.hpp" +#include + +namespace LV::Tests +{ + LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth) + { + std::random_device device {}; + std::uniform_int_distribution distrib {0, 255}; + + auto video {LV::Video::create (width, height, depth)}; + + auto bytes_per_pixel = video->get_bpp (); + auto pitch = video->get_pitch (); + + auto content_bytes_per_row = bytes_per_pixel * video->get_width (); + + auto pixel_row_ptr = static_cast(video->get_pixels ()); + + for (int y = 0; y < video->get_height (); y++) { + auto pixel = pixel_row_ptr; + for (int c = 0; c < content_bytes_per_row; c++) { + *pixel = distrib (device); + pixel++; + } + + pixel_row_ptr += pitch; + } + + return video; + } +} // LV::Tests namespace diff --git a/libvisual/tests/random.hpp b/libvisual/tests/random.hpp new file mode 100644 index 000000000..c9893b3e8 --- /dev/null +++ b/libvisual/tests/random.hpp @@ -0,0 +1,12 @@ +#ifndef _LV_TESTS_VIDEO_RANDOM_HPP +#define _LV_TESTS_VIDEO_RANDOM_HPP + +#include + +namespace LV::Tests +{ + LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth); + +} // LV::Tests namespace + +#endif // defined(_LV_TESTS_VIDEO_RANDOM_HPP) diff --git a/libvisual/tests/video_test/CMakeLists.txt b/libvisual/tests/video_test/CMakeLists.txt index 6f5244909..65cd4341a 100644 --- a/libvisual/tests/video_test/CMakeLists.txt +++ b/libvisual/tests/video_test/CMakeLists.txt @@ -2,6 +2,10 @@ LV_BUILD_TEST(video_check_test SOURCES video_check_test.cpp ) +LV_BUILD_TEST(video_blit_test + SOURCES video_blit_test.cpp +) + IF(HAVE_SDL)
LV_BUILD_TEST(video_scale_test SOURCES video_scale_test.cpp diff --git a/libvisual/tests/video_test/video_blit_test.cpp b/libvisual/tests/video_test/video_blit_test.cpp new file mode 100644 index 000000000..34320dd0b --- /dev/null +++ b/libvisual/tests/video_test/video_blit_test.cpp @@ -0,0 +1,66 @@ +#include "test.h" +#include "random.hpp" +#include +#include + +namespace +{ + LV::VideoPtr clone_video (LV::VideoPtr const& source) + { + auto clone {LV::Video::create (source->get_width (), source->get_height (), source->get_depth ())}; + + assert (clone->get_pitch () == source->get_pitch ()); + std::size_t buffer_size = static_cast (clone->get_pitch () * clone->get_width ()); + + visual_mem_copy (clone->get_pixels (), source->get_pixels (), buffer_size); + + return clone; + } + + void test_blit_overlay_alphasrc () + { + // Check that blit_overlay_alphasrc results are within +/- 1 of exact computation for each colour channel. The + // errors largely arise from the use of 256 instead of 255 as divisor for performance reasons. 
+ + int const test_width = 31; + int const test_height = 31; + + auto source = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT); + source->set_compose_type (VISUAL_VIDEO_COMPOSE_TYPE_SRC); + + auto target = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT); + + auto actual {clone_video (target)}; + actual->blit (source, 0, 0, true); + + for (int y = 0; y < test_height; y++) { + auto source_pixel = static_cast (source->get_pixel_ptr (0, y)); + auto target_pixel = static_cast (target->get_pixel_ptr (0, y)); + auto actual_pixel = static_cast (actual->get_pixel_ptr (0, y)); + + for (int x = 0; x < test_width; x++) { + LV_TEST_ASSERT (actual_pixel[3] == target_pixel[3]); + + float source_alpha = static_cast (source_pixel[3]) / 255.0f; + uint8_t b = source_alpha * source_pixel[0] + (1.0f - source_alpha) * target_pixel[0]; + uint8_t g = source_alpha * source_pixel[1] + (1.0f - source_alpha) * target_pixel[1]; + uint8_t r = source_alpha * source_pixel[2] + (1.0f - source_alpha) * target_pixel[2]; + + LV_TEST_ASSERT (std::abs (static_cast (actual_pixel[0]) - static_cast (b)) <= 1); + LV_TEST_ASSERT (std::abs (static_cast (actual_pixel[1]) - static_cast (g)) <= 1); + LV_TEST_ASSERT (std::abs (static_cast (actual_pixel[2]) - static_cast (r)) <= 1); + + source_pixel += 4; + target_pixel += 4; + actual_pixel += 4; + } + } + } +} // anonymous namespace + +int main(int argc, char *argv[]) +{ + LV::System::init (argc, argv); + test_blit_overlay_alphasrc (); + LV::System::destroy (); +} From 0e95069d8f72f7993e5bb9896f77d73eefef4598 Mon Sep 17 00:00:00 2001 From: Chong Kai Xiong Date: Thu, 26 Dec 2024 15:29:09 +0800 Subject: [PATCH 4/4] Core (LV::Video): Add explanatory note on calculation in blit_overlay_alphasrc() per Sebastian (hartwork)'s suggestion. 
--- libvisual/libvisual/private/lv_video_blit.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libvisual/libvisual/private/lv_video_blit.cpp b/libvisual/libvisual/private/lv_video_blit.cpp index a83d02cc0..b0955626e 100644 --- a/libvisual/libvisual/private/lv_video_blit.cpp +++ b/libvisual/libvisual/private/lv_video_blit.cpp @@ -78,6 +78,9 @@ namespace LV { for (int x = 0; x < src->m_impl->width; x++) { uint8_t const src_alpha = src_pixel[3]; + // NOTE: This is effectively + // "(src_alpha / 255) * src_pixel[i] + (1 - src_alpha / 255) * dst_pixel[i]" + // but with only a single multiplication, a single division by 256 rather than 255, for speed. dst_pixel[0] = (src_alpha * (src_pixel[0] - dst_pixel[0]) >> 8) + dst_pixel[0]; dst_pixel[1] = (src_alpha * (src_pixel[1] - dst_pixel[1]) >> 8) + dst_pixel[1]; dst_pixel[2] = (src_alpha * (src_pixel[2] - dst_pixel[2]) >> 8) + dst_pixel[2];