Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Core (LV::Video) Fix alpha blending of 32-bit videos (#230) #244

Merged
merged 4 commits into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions libvisual/cmake/LVBuildTest.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ FUNCTION(LV_BUILD_TEST TEST_NAME)

TARGET_LINK_LIBRARIES(${TEST_NAME}
PRIVATE
test_common
Libvisual::Libvisual
Threads::Threads
${PARSED_ARGS_LINK_LIBS}
Expand Down
32 changes: 19 additions & 13 deletions libvisual/libvisual/private/lv_video_blit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,30 +61,36 @@ namespace LV {
}
}

void VideoBlit::blit_overlay_alphasrc (Video* dest, Video* src)
void VideoBlit::blit_overlay_alphasrc (Video* dst, Video* src)
{
auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());

if (visual_cpu_has_mmx ()) {
blit_overlay_alphasrc_mmx (dest, src);
blit_overlay_alphasrc_mmx (dst, src);
return;
}

auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());

for (int y = 0; y < src->m_impl->height; y++) {
auto dst_pixel = dst_pixel_row_ptr;
auto src_pixel = src_pixel_row_ptr;

for (int x = 0; x < src->m_impl->width; x++) {
uint8_t alpha = srcbuf[3];
uint8_t const src_alpha = src_pixel[3];

destbuf[0] = (alpha * (srcbuf[0] - destbuf[0]) >> 8) + destbuf[0];
destbuf[1] = (alpha * (srcbuf[1] - destbuf[1]) >> 8) + destbuf[1];
destbuf[2] = (alpha * (srcbuf[2] - destbuf[2]) >> 8) + destbuf[2];
// NOTE: This is effectively
// "(src_alpha / 255) * src_pixel[i] + (1 - src_alpha / 255) * dst_pixel[i]",
// but computed with a single multiplication and a division by 256 (a right shift by 8) instead of 255, for speed.
dst_pixel[0] = (src_alpha * (src_pixel[0] - dst_pixel[0]) >> 8) + dst_pixel[0];
kaixiong marked this conversation as resolved.
Show resolved Hide resolved
dst_pixel[1] = (src_alpha * (src_pixel[1] - dst_pixel[1]) >> 8) + dst_pixel[1];
dst_pixel[2] = (src_alpha * (src_pixel[2] - dst_pixel[2]) >> 8) + dst_pixel[2];

destbuf += dest->m_impl->bpp;
srcbuf += src->m_impl->bpp;
src_pixel += 4;
dst_pixel += 4;
}

destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
dst_pixel_row_ptr += dst->m_impl->pitch;
src_pixel_row_ptr += src->m_impl->pitch;
}
}

Expand Down
81 changes: 46 additions & 35 deletions libvisual/libvisual/private/lv_video_blit_simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,49 +26,60 @@
#include "lv_video_blit.hpp"
#include "lv_video_private.hpp"
#include "lv_common.h"
#include <x86intrin.h>

namespace LV {

void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src)
void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src)
{
#if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
auto srcbuf = static_cast<uint8_t const*> (src->get_pixels ());

for (int i = 0; i < src->m_impl->height; i++) {
for (int j = 0; j < src->m_impl->width; j++) {
__asm __volatile
("\n\t movd %[spix], %%mm0"
"\n\t movd %[dpix], %%mm1"
"\n\t movq %%mm0, %%mm2"
"\n\t movq %%mm0, %%mm3"
"\n\t psrlq $24, %%mm2" /* The alpha */
"\n\t movq %%mm0, %%mm4"
"\n\t psrld $24, %%mm3"
"\n\t psrld $24, %%mm4"
"\n\t psllq $32, %%mm2"
"\n\t psllq $16, %%mm3"
"\n\t por %%mm4, %%mm2"
"\n\t punpcklbw %%mm6, %%mm0" /* interleaving dest */
"\n\t por %%mm3, %%mm2"
"\n\t punpcklbw %%mm6, %%mm1" /* interleaving source */
"\n\t psubsw %%mm1, %%mm0" /* (src - dest) part */
"\n\t pmullw %%mm2, %%mm0" /* alpha * (src - dest) */
"\n\t psrlw $8, %%mm0" /* / 256 */
"\n\t paddb %%mm1, %%mm0" /* + dest */
"\n\t packuswb %%mm0, %%mm0"
"\n\t movd %%mm0, %[dest]"
: [dest] "=m" (*destbuf)
: [dpix] "m" (*destbuf)
, [spix] "m" (*srcbuf));

destbuf += 4;
srcbuf += 4;
auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());

for (int y = 0; y < src->m_impl->height; y++) {
auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr);
auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr);

for (int x = 0; x < src->m_impl->width; x++) {
// We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers.
// See the pure C implementation in blit_overlay_alphasrc() for the calculation involved.

// Load source alpha as a 16-bit int.
uint16_t const src_alpha = reinterpret_cast<uint8_t const*> (src_pixel)[3];

// Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits.
auto src = _mm_cvtsi32_si64 (*src_pixel);
auto dst = _mm_cvtsi32_si64 (*dst_pixel);
src = _mm_unpacklo_pi8 (src, _mm_setzero_si64 ());
dst = _mm_unpacklo_pi8 (dst, _mm_setzero_si64 ());

// Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2.
auto a1 = _mm_set1_pi16 (src_alpha);
auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha);

// Interpolate between source and target.
auto result = _mm_add_pi16 (_mm_mullo_pi16 (src, a1), _mm_mullo_pi16 (dst, a2));
result = _mm_srli_pi16 (result, 8);

// Unpack result but keep the target pixel alpha.
// Is there a nicer way to do this?
uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result));
int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00);

*dst_pixel = int_result;

dst_pixel++;
src_pixel++;
}

destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
dst_pixel_row_ptr += dst->m_impl->pitch;
src_pixel_row_ptr += src->m_impl->pitch;
}

// FIXME: Some sources say this is not needed on x64, as MMX registers are no longer
// overlaid on the FP ones.
_mm_empty ();

#endif /* !VISUAL_ARCH_X86 */
}

Expand Down
8 changes: 7 additions & 1 deletion libvisual/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
INCLUDE(LVBuildTest)

INCLUDE_DIRECTORIES(
ADD_LIBRARY(test_common STATIC
random.cpp
)

TARGET_INCLUDE_DIRECTORIES(test_common PUBLIC
${CMAKE_CURRENT_SOURCE_DIR}
)

TARGET_LINK_LIBRARIES(test_common PUBLIC libvisual)

ADD_SUBDIRECTORY(audio_test)
ADD_SUBDIRECTORY(mem_test)
ADD_SUBDIRECTORY(video_test)
Expand Down
32 changes: 32 additions & 0 deletions libvisual/tests/random.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "random.hpp"
#include <random>

namespace LV::Tests
{
    /**
     * Creates a video of the requested size and depth and fills its pixel rows with random bytes.
     *
     * Only the first (bytes per pixel * width) bytes of each row are written; any remaining bytes up
     * to the row pitch are left exactly as LV::Video::create() produced them.
     *
     * @param width  width of the video, passed through to LV::Video::create()
     * @param height height of the video, passed through to LV::Video::create()
     * @param depth  colour depth of the video, passed through to LV::Video::create()
     *
     * @return the newly created video
     */
    LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth)
    {
        std::random_device device {};

        // std::uniform_int_distribution is only specified for short, int, long, long long and their
        // unsigned counterparts. Instantiating it with uint8_t is undefined behaviour (and is rejected
        // outright by some standard libraries), so draw unsigned ints in [0, 255] and narrow on store.
        std::uniform_int_distribution<unsigned int> distrib {0, 255};

        auto video {LV::Video::create (width, height, depth)};

        auto bytes_per_pixel = video->get_bpp ();
        auto pitch = video->get_pitch ();

        auto content_bytes_per_row = bytes_per_pixel * video->get_width ();

        auto pixel_row_ptr = static_cast<uint8_t *>(video->get_pixels ());

        for (int y = 0; y < video->get_height (); y++) {
            auto pixel = pixel_row_ptr;
            for (int c = 0; c < content_bytes_per_row; c++) {
                *pixel = static_cast<uint8_t> (distrib (device));
                pixel++;
            }

            // Advance by the pitch rather than the content width so row padding is skipped.
            pixel_row_ptr += pitch;
        }

        return video;
    }
} // LV::Tests namespace
12 changes: 12 additions & 0 deletions libvisual/tests/random.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#ifndef _LV_TESTS_VIDEO_RANDOM_HPP
#define _LV_TESTS_VIDEO_RANDOM_HPP

#include <libvisual/libvisual.h>

namespace LV::Tests
{
// Creates a video with the given width, height and depth whose pixel rows are filled with random
// byte values. Shared helper for the test programs.
LV::VideoPtr create_random_video (int width, int height, VisVideoDepth depth);

} // LV::Tests namespace

#endif // defined(_LV_TESTS_VIDEO_RANDOM_HPP)
4 changes: 4 additions & 0 deletions libvisual/tests/video_test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@ LV_BUILD_TEST(video_check_test
SOURCES video_check_test.cpp
)

LV_BUILD_TEST(video_blit_test
SOURCES video_blit_test.cpp
)

IF(HAVE_SDL)
LV_BUILD_TEST(video_scale_test
SOURCES video_scale_test.cpp
Expand Down
66 changes: 66 additions & 0 deletions libvisual/tests/video_test/video_blit_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
#include "test.h"
#include "random.hpp"
#include <libvisual/libvisual.h>
#include <cassert>

namespace
{
    // Returns a new video with the same width, height and depth as `source`, holding a byte-for-byte
    // copy of its pixel buffer (row padding included).
    LV::VideoPtr clone_video (LV::VideoPtr const& source)
    {
        auto clone {LV::Video::create (source->get_width (), source->get_height (), source->get_depth ())};

        // The raw buffer copy below is only valid if both videos use the same row layout.
        assert (clone->get_pitch () == source->get_pitch ());

        // The pixel buffer spans one pitch per row, i.e. pitch * height bytes (not pitch * width).
        // Widen before multiplying so the product is computed in std::size_t.
        std::size_t const buffer_size = static_cast<std::size_t> (clone->get_pitch ())
                                      * static_cast<std::size_t> (clone->get_height ());

        visual_mem_copy (clone->get_pixels (), source->get_pixels (), buffer_size);

        return clone;
    }

    // Blits a random 32-bit source onto a copy of a random 32-bit target with alpha enabled and checks
    // every pixel of the result against a floating-point reference blend.
    void test_blit_overlay_alphasrc ()
    {
        // Check that blit_overlay_alphasrc results are within +/- 1 of exact computation for each colour channel. The
        // errors largely arise from the use of 256 instead of 255 as divisor for performance reasons.

        int const test_width = 31;
        int const test_height = 31;

        auto source = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT);
        source->set_compose_type (VISUAL_VIDEO_COMPOSE_TYPE_SRC);

        auto target = LV::Tests::create_random_video (test_width, test_height, VISUAL_VIDEO_DEPTH_32BIT);

        // Blit onto a copy so the untouched target stays available as the reference input.
        auto actual {clone_video (target)};
        actual->blit (source, 0, 0, true);

        for (int y = 0; y < test_height; y++) {
            auto source_pixel = static_cast<uint8_t const*> (source->get_pixel_ptr (0, y));
            auto target_pixel = static_cast<uint8_t const*> (target->get_pixel_ptr (0, y));
            auto actual_pixel = static_cast<uint8_t const*> (actual->get_pixel_ptr (0, y));

            for (int x = 0; x < test_width; x++) {
                // Byte 3 is the alpha channel; the blit must leave the target's alpha untouched.
                LV_TEST_ASSERT (actual_pixel[3] == target_pixel[3]);

                float const source_alpha = static_cast<float> (source_pixel[3]) / 255.0f;

                // Bytes 0..2 are the colour channels; each must be within +/- 1 of the exact blend.
                for (int channel = 0; channel < 3; channel++) {
                    auto const expected = static_cast<uint8_t> (source_alpha * source_pixel[channel]
                                                                + (1.0f - source_alpha) * target_pixel[channel]);

                    LV_TEST_ASSERT (std::abs (static_cast<int16_t> (actual_pixel[channel]) - static_cast<int16_t> (expected)) <= 1);
                }

                source_pixel += 4;
                target_pixel += 4;
                actual_pixel += 4;
            }
        }
    }
} // anonymous namespace

// Test entry point: brings up libvisual, runs the alpha-source blit check, then shuts libvisual down.
int main(int argc, char *argv[])
{
    LV::System::init (argc, argv);

    test_blit_overlay_alphasrc ();

    LV::System::destroy ();

    return 0;
}