From e5f954ac54afa5aabd91cd405ab55c63125c9861 Mon Sep 17 00:00:00 2001
From: Chong Kai Xiong <kaixiong@codeleft.sg>
Date: Fri, 3 Feb 2023 02:40:06 +0800
Subject: [PATCH 1/4] Core (LV::Video): Clean up blit_overlay_alphasrc().

---
 libvisual/libvisual/private/lv_video_blit.cpp | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/libvisual/libvisual/private/lv_video_blit.cpp b/libvisual/libvisual/private/lv_video_blit.cpp
index 91494dbd0..a83d02cc0 100644
--- a/libvisual/libvisual/private/lv_video_blit.cpp
+++ b/libvisual/libvisual/private/lv_video_blit.cpp
@@ -61,30 +61,33 @@ namespace LV {
       }
   }
 
-  void VideoBlit::blit_overlay_alphasrc (Video* dest, Video* src)
+  void VideoBlit::blit_overlay_alphasrc (Video* dst, Video* src)
   {
-      auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
-      auto srcbuf  = static_cast<uint8_t const*> (src->get_pixels ());
-
       if (visual_cpu_has_mmx ()) {
-          blit_overlay_alphasrc_mmx (dest, src);
+          blit_overlay_alphasrc_mmx (dst, src);
           return;
       }
 
+      auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
+      auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
+      // Scalar fallback (no MMX): walk both images row by row, stepping each row pointer by its own pitch.
       for (int y = 0; y < src->m_impl->height; y++) {
+          auto dst_pixel = dst_pixel_row_ptr;
+          auto src_pixel = src_pixel_row_ptr;
+          // Pixels are stepped 4 bytes at a time; bytes 0-2 of dst are blended, byte 3 of src is the alpha.
           for (int x = 0; x < src->m_impl->width; x++) {
-              uint8_t alpha = srcbuf[3];
+              uint8_t const src_alpha = src_pixel[3];
 
-              destbuf[0] = (alpha * (srcbuf[0] - destbuf[0]) >> 8) + destbuf[0];
-              destbuf[1] = (alpha * (srcbuf[1] - destbuf[1]) >> 8) + destbuf[1];
-              destbuf[2] = (alpha * (srcbuf[2] - destbuf[2]) >> 8) + destbuf[2];
+              dst_pixel[0] = (src_alpha * (src_pixel[0] - dst_pixel[0]) >> 8) + dst_pixel[0];
+              dst_pixel[1] = (src_alpha * (src_pixel[1] - dst_pixel[1]) >> 8) + dst_pixel[1];
+              dst_pixel[2] = (src_alpha * (src_pixel[2] - dst_pixel[2]) >> 8) + dst_pixel[2];
 
-              destbuf += dest->m_impl->bpp;
-              srcbuf  += src->m_impl->bpp;
+              src_pixel += 4;
+              dst_pixel += 4;
           }
 
-          destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
-          srcbuf  += src->m_impl->pitch  - (src->m_impl->width  * src->m_impl->bpp);
+          dst_pixel_row_ptr += dst->m_impl->pitch;
+          src_pixel_row_ptr += src->m_impl->pitch;
       }
   }
 

From 3a7d77026acc046a7d5dda1fa5fcef2c0d72a6c7 Mon Sep 17 00:00:00 2001
From: Chong Kai Xiong <kaixiong@codeleft.sg>
Date: Fri, 3 Feb 2023 05:09:39 +0800
Subject: [PATCH 2/4] Core (LV::Video): Rewrite MMX alpha blending of 32-bit
 videos using intrinsics (#230).

---
 .../libvisual/private/lv_video_blit_simd.cpp  | 81 +++++++++++--------
 1 file changed, 46 insertions(+), 35 deletions(-)

diff --git a/libvisual/libvisual/private/lv_video_blit_simd.cpp b/libvisual/libvisual/private/lv_video_blit_simd.cpp
index 7b26f8f59..e43afb476 100644
--- a/libvisual/libvisual/private/lv_video_blit_simd.cpp
+++ b/libvisual/libvisual/private/lv_video_blit_simd.cpp
@@ -26,49 +26,63 @@
 #include "lv_video_blit.hpp"
 #include "lv_video_private.hpp"
 #include "lv_common.h"
+// MMX intrinsics exist only on x86 targets; other architectures must not pull in this header.
+#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
+#include <x86intrin.h>
+#endif
 
 namespace LV {
 
-  void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src)
+  // Alpha-blends each 32-bit pixel of src onto dst using MMX intrinsics. The three colour bytes are
+  // interpolated by the source alpha (byte 3); the alpha byte already stored in dst is left unchanged.
+  void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src)
   {
 #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-      auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
-      auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());
-
-      for (int i = 0; i < src->m_impl->height; i++) {
-          for (int j = 0; j < src->m_impl->width; j++) {
-              __asm __volatile
-                  ("\n\t movd %[spix], %%mm0"
-                   "\n\t movd %[dpix], %%mm1"
-                   "\n\t movq %%mm0, %%mm2"
-                   "\n\t movq %%mm0, %%mm3"
-                   "\n\t psrlq $24, %%mm2"  /* The alpha */
-                   "\n\t movq %%mm0, %%mm4"
-                   "\n\t psrld $24, %%mm3"
-                   "\n\t psrld $24, %%mm4"
-                   "\n\t psllq $32, %%mm2"
-                   "\n\t psllq $16, %%mm3"
-                   "\n\t por %%mm4, %%mm2"
-                   "\n\t punpcklbw %%mm6, %%mm0"    /* interleaving dest */
-                   "\n\t por %%mm3, %%mm2"
-                   "\n\t punpcklbw %%mm6, %%mm1"    /* interleaving source */
-                   "\n\t psubsw %%mm1, %%mm0"   /* (src - dest) part */
-                   "\n\t pmullw %%mm2, %%mm0"   /* alpha * (src - dest) */
-                   "\n\t psrlw $8, %%mm0"       /* / 256 */
-                   "\n\t paddb %%mm1, %%mm0"    /* + dest */
-                   "\n\t packuswb %%mm0, %%mm0"
-                   "\n\t movd %%mm0, %[dest]"
-                   : [dest] "=m" (*destbuf)
-                   : [dpix] "m" (*destbuf)
-                   , [spix] "m" (*srcbuf));
-
-              destbuf += 4;
-              srcbuf += 4;
+      auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
+      auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
+
+      for (int y = 0; y < src->m_impl->height; y++) {
+          auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr);
+          auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr);
+
+          for (int x = 0; x < src->m_impl->width; x++) {
+              // We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers.
+              // See the pure C implementation in blit_overlay_alphasrc() for the calculation involved.
+
+              // Load source alpha as a 16-bit int.
+              uint16_t const src_alpha = reinterpret_cast<uint8_t const*> (src_pixel)[3];
+
+              // Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits.
+              auto src_channels = _mm_cvtsi32_si64 (*src_pixel);
+              auto dst_channels = _mm_cvtsi32_si64 (*dst_pixel);
+              src_channels = _mm_unpacklo_pi8 (src_channels, _mm_setzero_si64 ());
+              dst_channels = _mm_unpacklo_pi8 (dst_channels, _mm_setzero_si64 ());
+
+              // Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2.
+              auto a1 = _mm_set1_pi16 (src_alpha);
+              auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha);
+
+              // Interpolate between source and target.
+              auto result = _mm_add_pi16 (_mm_mullo_pi16 (src_channels, a1), _mm_mullo_pi16 (dst_channels, a2));
+              result = _mm_srli_pi16 (result, 8);
+
+              // Unpack result but keep the target pixel alpha. Is there a nicer way to do this?
+              uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result));
+              int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00);
+              *dst_pixel = int_result;
+
+              dst_pixel++;
+              src_pixel++;
           }
 
-          destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
-          srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
+          dst_pixel_row_ptr += dst->m_impl->pitch;
+          src_pixel_row_ptr += src->m_impl->pitch;
       }
+
+      // FIXME: Some sources said this is not needed for x64 as MMX registers are no longer
+      // overlaid on FP ones.
+      _mm_empty ();
+
 #endif /* !VISUAL_ARCH_X86 */
   }
 

From 8cbfb5c003e76612ee1437ba0a5b4c9727448162 Mon Sep 17 00:00:00 2001
From: Chong Kai Xiong <kaixiong@codeleft.sg>
Date: Thu, 8 Feb 2024 21:04:42 +0800
Subject: [PATCH 3/4] Core (Tests): Add test for
 LV::VideoBlit::blit_overlay_alphasrc().

---
 libvisual/cmake/LVBuildTest.cmake             |  1 +
 libvisual/tests/CMakeLists.txt                |  8 ++-
 libvisual/tests/random.cpp                    | 32 +++++++++
 libvisual/tests/random.hpp                    | 12 ++++
 libvisual/tests/video_test/CMakeLists.txt     |  4 ++
 .../tests/video_test/video_blit_test.cpp      | 66 +++++++++++++++++++
 6 files changed, 122 insertions(+), 1 deletion(-)
 create mode 100644 libvisual/tests/random.cpp
 create mode 100644 libvisual/tests/random.hpp
 create mode 100644 libvisual/tests/video_test/video_blit_test.cpp

diff --git a/libvisual/cmake/LVBuildTest.cmake b/libvisual/cmake/LVBuildTest.cmake
index 1820b9a45..3680b4aab 100644
--- a/libvisual/cmake/LVBuildTest.cmake
+++ b/libvisual/cmake/LVBuildTest.cmake
@@ -37,6 +37,7 @@ FUNCTION(LV_BUILD_TEST TEST_NAME)
 
   TARGET_LINK_LIBRARIES(${TEST_NAME}
     PRIVATE
+    test_common
     Libvisual::Libvisual
     Threads::Threads
     ${PARSED_ARGS_LINK_LIBS}
diff --git a/libvisual/tests/CMakeLists.txt b/libvisual/tests/CMakeLists.txt
index f7b2556ba..b4ee0f01c 100644
--- a/libvisual/tests/CMakeLists.txt
+++ b/libvisual/tests/CMakeLists.txt
@@ -1,9 +1,15 @@
 INCLUDE(LVBuildTest)
 
-INCLUDE_DIRECTORIES(
+ADD_LIBRARY(test_common STATIC  # Helper code shared by the test executables.
+  random.cpp
+)
+
+TARGET_INCLUDE_DIRECTORIES(test_common PUBLIC  # PUBLIC also exposes this directory to targets linking test_common.
   ${CMAKE_CURRENT_SOURCE_DIR}
 )
 
+TARGET_LINK_LIBRARIES(test_common PUBLIC libvisual)  # random.cpp uses the LV::Video API.
+
 ADD_SUBDIRECTORY(audio_test)
 ADD_SUBDIRECTORY(mem_test)
 ADD_SUBDIRECTORY(video_test)
diff --git a/libvisual/tests/random.cpp b/libvisual/tests/random.cpp
new file mode 100644
index 000000000..47e58d064
--- /dev/null
+++ b/libvisual/tests/random.cpp
@@ -0,0 +1,32 @@
+#include "random.hpp"
+#include <random>
+
+namespace LV::Tests
+{
+  // Creates a video of the requested size and depth and fills every content byte of each row with a random
+  // value. Bytes between the end of a row's pixels and the start of the next row (pitch padding) are not written.
+  LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth)
+  {
+      std::random_device device {};
+      // uint8_t is not a permitted IntType for std::uniform_int_distribution, so draw ints and narrow on store.
+      std::uniform_int_distribution<int> distrib {0, 255};
+
+      auto video {LV::Video::create (width, height, depth)};
+      auto bytes_per_pixel = video->get_bpp ();
+      auto pitch = video->get_pitch ();
+      auto content_bytes_per_row = bytes_per_pixel * video->get_width ();
+      auto pixel_row_ptr = static_cast<uint8_t *>(video->get_pixels ());
+
+      for (int y = 0; y < video->get_height (); y++) {
+          auto pixel = pixel_row_ptr;
+          for (int c = 0; c < content_bytes_per_row; c++) {
+              *pixel = static_cast<uint8_t> (distrib (device));
+              pixel++;
+          }
+
+          pixel_row_ptr += pitch;
+      }
+
+      return video;
+  }
+} // LV::Tests namespace
diff --git a/libvisual/tests/random.hpp b/libvisual/tests/random.hpp
new file mode 100644
index 000000000..c9893b3e8
--- /dev/null
+++ b/libvisual/tests/random.hpp
@@ -0,0 +1,12 @@
+#ifndef _LV_TESTS_VIDEO_RANDOM_HPP
+#define _LV_TESTS_VIDEO_RANDOM_HPP
+
+#include <libvisual/libvisual.h>
+
+namespace LV::Tests
+{
+  // Returns a newly created width x height video of the given depth whose pixel rows hold random bytes.
+  LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth);
+} // LV::Tests namespace
+
+#endif // defined(_LV_TESTS_VIDEO_RANDOM_HPP)
diff --git a/libvisual/tests/video_test/CMakeLists.txt b/libvisual/tests/video_test/CMakeLists.txt
index 6f5244909..65cd4341a 100644
--- a/libvisual/tests/video_test/CMakeLists.txt
+++ b/libvisual/tests/video_test/CMakeLists.txt
@@ -2,6 +2,10 @@ LV_BUILD_TEST(video_check_test
   SOURCES video_check_test.cpp
 )
 
+LV_BUILD_TEST(video_blit_test
+  SOURCES video_blit_test.cpp
+)
+
 IF(HAVE_SDL)
   LV_BUILD_TEST(video_scale_test
     SOURCES   video_scale_test.cpp
diff --git a/libvisual/tests/video_test/video_blit_test.cpp b/libvisual/tests/video_test/video_blit_test.cpp
new file mode 100644
index 000000000..34320dd0b
--- /dev/null
+++ b/libvisual/tests/video_test/video_blit_test.cpp
@@ -0,0 +1,66 @@
+#include "test.h"
+#include "random.hpp"
+#include <libvisual/libvisual.h>
+#include <cassert>
+
+namespace
+{
+  // Returns a new video with the same dimensions and depth as source, holding a copy of source's pixel buffer.
+  LV::VideoPtr clone_video (LV::VideoPtr const& source)
+  {
+      auto clone {LV::Video::create (source->get_width (), source->get_height (), source->get_depth ())};
+      assert (clone->get_pitch () == source->get_pitch ());
+      // The buffer holds one pitch-sized row per image line, so its size is pitch * height (not pitch * width).
+      std::size_t buffer_size = static_cast<std::size_t> (clone->get_pitch ()) * clone->get_height ();
+      visual_mem_copy (clone->get_pixels (), source->get_pixels (), buffer_size);
+
+      return clone;
+  }
+
+  void test_blit_overlay_alphasrc ()
+  {
+      // Check that blit_overlay_alphasrc results are within +/- 1 of exact computation for each colour channel. The
+      // errors largely arise from the use of 256 instead of 255 as divisor for performance reasons.
+
+      int const test_width  = 31;
+      int const test_height = 31;
+
+      auto source = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT);
+      source->set_compose_type (VISUAL_VIDEO_COMPOSE_TYPE_SRC);
+
+      auto target = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT);
+      // Blit onto a copy so that the unmodified target stays available as the reference input.
+      auto actual {clone_video (target)};
+      actual->blit (source, 0, 0, true);
+
+      for (int y = 0; y < test_height; y++) {
+          auto source_pixel = static_cast<uint8_t const*> (source->get_pixel_ptr (0, y));
+          auto target_pixel = static_cast<uint8_t const*> (target->get_pixel_ptr (0, y));
+          auto actual_pixel = static_cast<uint8_t const*> (actual->get_pixel_ptr (0, y));
+          // Walk the three images in lockstep, 4 bytes per pixel; the blit must leave byte 3 (alpha) untouched.
+          for (int x = 0; x < test_width; x++) {
+              LV_TEST_ASSERT (actual_pixel[3] == target_pixel[3]);
+              // Reference blend of bytes 0-2 computed in floating point with the exact divisor of 255.
+              float source_alpha = static_cast<float> (source_pixel[3]) / 255.0f;
+              uint8_t b = source_alpha * source_pixel[0] + (1.0f - source_alpha) * target_pixel[0];
+              uint8_t g = source_alpha * source_pixel[1] + (1.0f - source_alpha) * target_pixel[1];
+              uint8_t r = source_alpha * source_pixel[2] + (1.0f - source_alpha) * target_pixel[2];
+
+              LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[0]) - static_cast<int16_t> (b)) <= 1);
+              LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[1]) - static_cast<int16_t> (g)) <= 1);
+              LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[2]) - static_cast<int16_t> (r)) <= 1);
+
+              source_pixel += 4;
+              target_pixel += 4;
+              actual_pixel += 4;
+          }
+      }
+  }
+} // anonymous namespace
+
+int main(int argc, char *argv[])
+{
+    LV::System::init (argc, argv);   // Initialise LV::System first; the test runs between init and destroy.
+    test_blit_overlay_alphasrc ();
+    LV::System::destroy ();          // Matching teardown for the init call above.
+}

From 0e95069d8f72f7993e5bb9896f77d73eefef4598 Mon Sep 17 00:00:00 2001
From: Chong Kai Xiong <kaixiong@codeleft.sg>
Date: Thu, 26 Dec 2024 15:29:09 +0800
Subject: [PATCH 4/4] Core (LV::Video): Add explanatory note on calculation in
 blit_overlay_alphasrc() per Sebastian (hartwork)'s suggestion.

---
 libvisual/libvisual/private/lv_video_blit.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libvisual/libvisual/private/lv_video_blit.cpp b/libvisual/libvisual/private/lv_video_blit.cpp
index a83d02cc0..b0955626e 100644
--- a/libvisual/libvisual/private/lv_video_blit.cpp
+++ b/libvisual/libvisual/private/lv_video_blit.cpp
@@ -78,6 +78,9 @@ namespace LV {
           for (int x = 0; x < src->m_impl->width; x++) {
               uint8_t const src_alpha = src_pixel[3];
 
+              // NOTE: This is effectively
+              //       "(src_alpha / 255) * src_pixel[i] + (1 - src_alpha / 255) * dst_pixel[i]"
+              //       but with only a single multiplication and a single division by 256 (rather than 255), for speed.
               dst_pixel[0] = (src_alpha * (src_pixel[0] - dst_pixel[0]) >> 8) + dst_pixel[0];
               dst_pixel[1] = (src_alpha * (src_pixel[1] - dst_pixel[1]) >> 8) + dst_pixel[1];
               dst_pixel[2] = (src_alpha * (src_pixel[2] - dst_pixel[2]) >> 8) + dst_pixel[2];