diff --git a/libvisual/libvisual/private/lv_video_blit_simd.cpp b/libvisual/libvisual/private/lv_video_blit_simd.cpp
index 7b26f8f5..1e0ee716 100644
--- a/libvisual/libvisual/private/lv_video_blit_simd.cpp
+++ b/libvisual/libvisual/private/lv_video_blit_simd.cpp
@@ -26,49 +26,69 @@
 #include "lv_video_blit.hpp"
 #include "lv_video_private.hpp"
 #include "lv_common.h"
+#include <mmintrin.h>
 
 namespace LV {
 
-    void VideoBlit::blit_overlay_alphasrc_mmx (Video* dest, Video* src)
+    /**
+     * Alpha-blends the pixels of src onto dst using MMX intrinsics.
+     *
+     * Each colour channel becomes (src * a + dst * (255 - a)) >> 8, where a is the
+     * source pixel's alpha (its top byte); the destination alpha is left untouched.
+     * The loops run over src's width and height. The body is compiled only when
+     * VISUAL_ARCH_X86 or VISUAL_ARCH_X86_64 is defined; otherwise this is a no-op.
+     */
+    void VideoBlit::blit_overlay_alphasrc_mmx (Video* dst, Video* src)
     {
 #if defined(VISUAL_ARCH_X86) || defined(VISUAL_ARCH_X86_64)
-        auto destbuf = static_cast<uint8_t*> (dest->get_pixels ());
-        auto srcbuf = static_cast<uint8_t*> (src->get_pixels ());
-
-        for (int i = 0; i < src->m_impl->height; i++) {
-            for (int j = 0; j < src->m_impl->width; j++) {
-                __asm __volatile
-                    ("\n\t movd %[spix], %%mm0"
-                     "\n\t movd %[dpix], %%mm1"
-                     "\n\t movq %%mm0, %%mm2"
-                     "\n\t movq %%mm0, %%mm3"
-                     "\n\t psrlq $24, %%mm2" /* The alpha */
-                     "\n\t movq %%mm0, %%mm4"
-                     "\n\t psrld $24, %%mm3"
-                     "\n\t psrld $24, %%mm4"
-                     "\n\t psllq $32, %%mm2"
-                     "\n\t psllq $16, %%mm3"
-                     "\n\t por %%mm4, %%mm2"
-                     "\n\t punpcklbw %%mm6, %%mm0" /* interleaving dest */
-                     "\n\t por %%mm3, %%mm2"
-                     "\n\t punpcklbw %%mm6, %%mm1" /* interleaving source */
-                     "\n\t psubsw %%mm1, %%mm0" /* (src - dest) part */
-                     "\n\t pmullw %%mm2, %%mm0" /* alpha * (src - dest) */
-                     "\n\t psrlw $8, %%mm0" /* / 256 */
-                     "\n\t paddb %%mm1, %%mm0" /* + dest */
-                     "\n\t packuswb %%mm0, %%mm0"
-                     "\n\t movd %%mm0, %[dest]"
-                     : [dest] "=m" (*destbuf)
-                     : [dpix] "m" (*destbuf)
-                     , [spix] "m" (*srcbuf));
-
-                destbuf += 4;
-                srcbuf += 4;
+        auto dst_pixel_row_ptr = static_cast<uint8_t*> (dst->get_pixels ());
+        auto src_pixel_row_ptr = static_cast<uint8_t const*> (src->get_pixels ());
+
+        for (int y = 0; y < src->m_impl->height; y++) {
+            auto dst_pixel = reinterpret_cast<uint32_t*> (dst_pixel_row_ptr);
+            auto src_pixel = reinterpret_cast<uint32_t const*> (src_pixel_row_ptr);
+
+            for (int x = 0; x < src->m_impl->width; x++) {
+                // We work with 32-bit pixel values packed as 4 x 16-bit ints in MMX registers.
+                // See the pure C implementation in blit_overlay_alphasrc() for the calculation involved.
+
+                // Load source alpha as a 16-bit int. Alpha is the top byte of the pixel, the
+                // same byte the 0xff'00'00'00 mask below preserves in the target.
+                uint16_t const src_alpha = static_cast<uint16_t> (*src_pixel >> 24);
+
+                // Load source and target pixel values into MMX registers, each channel zero-extended into 16 bits.
+                auto src_channels = _mm_cvtsi32_si64 (*src_pixel);
+                auto dst_channels = _mm_cvtsi32_si64 (*dst_pixel);
+                src_channels = _mm_unpacklo_pi8 (src_channels, _mm_setzero_si64 ());
+                dst_channels = _mm_unpacklo_pi8 (dst_channels, _mm_setzero_si64 ());
+
+                // Load src_alpha and (255 - src_alpha) and broadcast them into a1 and a2.
+                auto a1 = _mm_set1_pi16 (src_alpha);
+                auto a2 = _mm_set1_pi16 (static_cast<uint16_t> (255) - src_alpha);
+
+                // Interpolate between source and target.
+                auto result = _mm_add_pi16 (_mm_mullo_pi16 (src_channels, a1), _mm_mullo_pi16 (dst_channels, a2));
+                result = _mm_srli_pi16 (result, 8);
+
+                // Unpack result but keep the target pixel alpha.
+                // Is there a nicer way to do this?
+                uint32_t int_result = _mm_cvtsi64_si32 (_mm_packs_pu16 (result, result));
+                int_result = (int_result & 0x00'ff'ff'ff) | (*dst_pixel & 0xff'00'00'00);
+
+                *dst_pixel = int_result;
+
+                dst_pixel++;
+                src_pixel++;
             }
 
-            destbuf += dest->m_impl->pitch - (dest->m_impl->width * dest->m_impl->bpp);
-            srcbuf += src->m_impl->pitch - (src->m_impl->width * src->m_impl->bpp);
+            dst_pixel_row_ptr += dst->m_impl->pitch;
+            src_pixel_row_ptr += src->m_impl->pitch;
         }
+
+        // FIXME: Some sources said this is not needed for x64 as MMX registers are no longer
+        // overlaid on FP ones.
+        _mm_empty ();
+
 #endif /* !VISUAL_ARCH_X86 */
     }
 