From c5baae8a24447838268c29d5eb2611e7e3588d56 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 27 Oct 2023 16:02:15 +0100 Subject: [PATCH 01/10] Add SIMD versions of transform.invert() --- src_c/simd_transform.h | 4 ++ src_c/simd_transform_avx2.c | 96 ++++++++++++++++++++++++++++++++++++ src_c/simd_transform_sse2.c | 98 +++++++++++++++++++++++++++++++++++++ src_c/transform.c | 49 ++++++++++++++----- 4 files changed, 235 insertions(+), 12 deletions(-) diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h index d97a0d5b22..ed4da10cc0 100644 --- a/src_c/simd_transform.h +++ b/src_c/simd_transform.h @@ -9,6 +9,10 @@ // SSE2 functions #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +void +invert_sse2(SDL_Surface *src, SDL_Surface *newsurf); #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */ // AVX2 functions +void +invert_avx2(SDL_Surface *src, SDL_Surface *newsurf); diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index f1889f1bb7..61760f8846 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -42,3 +42,99 @@ pg_avx2_at_runtime_but_uncompiled() } return 0; } + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +void +invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + int s_row_skip = (src->pitch - src->w * 4) / 4; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + + int remaining_pixels = pixel_batch_length % 8; + int perfect_8_pixels = pixel_batch_length / 8; + + int perfect_8_pixels_batch_counter = perfect_8_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 amask = ~rgbmask; + + __m256i *srcp256 = (__m256i *)src->pixels; + __m256i *dstp256 = (__m256i *)newsurf->pixels; + + __m256i mm256_src, mm256_dst, mm256_two_five_fives, + mm256_alpha, mm256_rgb_mask, mm256_alpha_mask; + + mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); + mm256_rgb_mask = _mm256_set1_epi32(rgbmask); + mm256_alpha_mask = _mm256_set1_epi32(amask); + + __m256i _partial8_mask = + _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00, + (remaining_pixels > 5) ? 0x80000000 : 0x00, + (remaining_pixels > 4) ? 0x80000000 : 0x00, + (remaining_pixels > 3) ? 0x80000000 : 0x00, + (remaining_pixels > 2) ? 0x80000000 : 0x00, + (remaining_pixels > 1) ? 0x80000000 : 0x00, + (remaining_pixels > 0) ? 
0x80000000 : 0x00); + + while (num_batches--) { + perfect_8_pixels_batch_counter = perfect_8_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_8_pixels_batch_counter--) { + mm256_src = _mm256_loadu_si256(srcp256); + mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + + /* do the invert */ + mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); + + mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); + mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + + _mm256_storeu_si256(dstp256, mm256_dst); + + srcp256++; + dstp256++; + } + srcp = (Uint32 *)srcp256; + dstp = (Uint32 *)dstp256; + if (remaining_pixels_batch_counter > 0) { + mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); + mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + + /* do the invert */ + mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); + + mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); + mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + + _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); + + srcp += remaining_pixels_batch_counter; + dstp += remaining_pixels_batch_counter; + } + srcp += s_row_skip; + srcp256 = (__m256i *)srcp; + } +} +#else +void +invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + BAD_AVX2_FUNCTION_CALL; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ + diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 8f503b964f..f4a893da4f 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -34,3 +34,101 @@ pg_neon_at_runtime_but_uncompiled() } return 0; } + +#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) + +#if defined(ENV64BIT) +#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) +#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) +#else +#define LOAD_64_INTO_M128(num, reg) \ + *reg = _mm_loadl_epi64((const __m128i *)num) +#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) +#endif + +void +invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) +{ + int s_row_skip = (src->pitch - src->w * 4) / 4; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + int remaining_pixels = pixel_batch_length % 2; + int perfect_2_pixels = pixel_batch_length / 2; + + int perfect_2_pixels_batch_counter = perfect_2_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; + Uint64 amask64 = ~rgbmask64; + + Uint64 *srcp64 = (Uint64 *)src->pixels; + Uint64 *dstp64 = (Uint64 *)newsurf->pixels; + + __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, + mm_alpha_mask, mm_rgb_mask; + + LOAD_64_INTO_M128(&amask64, &mm_alpha_mask); + LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask); + mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF); + + while (num_batches--) { + perfect_2_pixels_batch_counter = perfect_2_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_2_pixels_batch_counter--) { + LOAD_64_INTO_M128(srcp64, &mm_src); + /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 
channels + empty for the rest of the calculation */ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ + + /*invert the colours*/ + mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); + + /*add the alpha channel back*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ + STORE_M128_INTO_64(&mm_dst, dstp64); + /*dstp = 0xAARRGGBB*/ + srcp64++; + dstp64++; + } + srcp = (Uint32 *)srcp64; + dstp = (Uint32 *)dstp64; + if (remaining_pixels_batch_counter > 0) { + mm_src = _mm_cvtsi32_si128(*srcp); + /*mm_src = 0x000000000000000000000000AARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 channels + empty for the rest of the calculation */ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x00000000000000000000000000RRGGBB*/ + + /*invert the colours*/ + mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); + + /*add the alpha channel back*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + *dstp = _mm_cvtsi128_si32(mm_dst); + /*dstp = 0xAARRGGBB*/ + srcp++; + dstp++; + } + srcp += s_row_skip; + srcp64 = (Uint64 *)srcp; + } +} +#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ diff --git a/src_c/transform.c b/src_c/transform.c index 5a5d10faa6..a867b5e563 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -3279,6 +3279,25 @@ surf_gaussian_blur(PyObject *self, PyObject *args, PyObject *kwargs) return (PyObject *)pgSurface_New(new_surf); } +void +invert_non_simd(SDL_Surface *src, SDL_Surface *newsurf) +{ + int x, y; + for (y = 0; y < src->h; y++) { + for (x = 0; x < src->w; x++) { + Uint32 pixel; + Uint8 *pix; + SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, + pix); + unsigned char r, g, b, a; + SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); + Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a); + SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, + newsurf->format, pix); + } + } +} + SDL_Surface * invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) { @@ -3306,19 +3325,25 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) "Source and destination surfaces need the same format.")); } - int x, y; - for (y = 0; y < src->h; y++) { - for (x = 0; x < src->w; x++) { - Uint32 pixel; - Uint8 *pix; - SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, - pix); - unsigned char r, g, b, a; - SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); - Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a); - SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, - newsurf->format, pix); + if (src->format->BytesPerPixel == 4 && + src->format->Rmask == newsurf->format->Rmask && + src->format->Gmask == newsurf->format->Gmask && + src->format->Bmask == newsurf->format->Bmask && + (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) { + if (pg_has_avx2()) { + invert_avx2(src, newsurf); + } +#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else if (pg_HasSSE_NEON()) { + invert_sse2(src, newsurf); } +#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else { + invert_non_simd(src, newsurf); + } + } + else { + invert_non_simd(src, newsurf); } SDL_UnlockSurface(newsurf); From 5e804cdb621829fa30b05ae2553a73d40adff1d1 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 27 Oct 2023 16:33:47 +0100 Subject: [PATCH 02/10] switch SSE2 to use four pixels 
at a time. --- src_c/simd_transform_sse2.c | 53 +++++++++++++++---------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index f4a893da4f..070643f66a 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -37,15 +37,6 @@ pg_neon_at_runtime_but_uncompiled() #if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) -#if defined(ENV64BIT) -#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) -#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) -#else -#define LOAD_64_INTO_M128(num, reg) \ - *reg = _mm_loadl_epi64((const __m128i *)num) -#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) -#endif - void invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) { @@ -58,10 +49,10 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) pixel_batch_length = src->w; num_batches = src->h; } - int remaining_pixels = pixel_batch_length % 2; - int perfect_2_pixels = pixel_batch_length / 2; + int remaining_pixels = pixel_batch_length % 4; + int perfect_4_pixels = pixel_batch_length / 4; - int perfect_2_pixels_batch_counter = perfect_2_pixels; + int perfect_4_pixels_batch_counter = perfect_4_pixels; int remaining_pixels_batch_counter = remaining_pixels; Uint32 *srcp = (Uint32 *)src->pixels; @@ -72,26 +63,26 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; - Uint64 *srcp64 = (Uint64 *)src->pixels; - Uint64 *dstp64 = (Uint64 *)newsurf->pixels; - __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask, mm_rgb_mask; - LOAD_64_INTO_M128(&amask64, &mm_alpha_mask); - LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask); + __m128i *srcp128 = (__m128i *)src->pixels; + __m128i *dstp128 = (__m128i *)newsurf->pixels; + + mm_rgb_mask = _mm_set1_epi64x(rgbmask64); + mm_alpha_mask = _mm_set1_epi64x(amask64); mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF); while (num_batches--) { - perfect_2_pixels_batch_counter = perfect_2_pixels; + perfect_4_pixels_batch_counter = perfect_4_pixels; remaining_pixels_batch_counter = remaining_pixels; - while (perfect_2_pixels_batch_counter--) { - LOAD_64_INTO_M128(srcp64, &mm_src); - /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ + while (perfect_4_pixels_batch_counter--) { + mm_src = _mm_loadu_si128(srcp128); + /*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ /* First we strip out the alpha so we have one of our 4 channels empty for the rest of the calculation */ mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ + /*mm_src = 0x00RRGGBB00RRGGBB00RRGGBB00RRGGBB*/ /*invert the colours*/ mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); @@ -99,14 +90,14 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) /*add the alpha channel back*/ mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ - STORE_M128_INTO_64(&mm_dst, dstp64); - /*dstp = 0xAARRGGBB*/ - srcp64++; - dstp64++; + /*mm_dst = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ + _mm_storeu_si128(dstp128, mm_dst); + /*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ + srcp128++; + dstp128++; } - srcp = (Uint32 *)srcp64; - dstp = (Uint32 *)dstp64; + srcp = (Uint32 *)srcp128; + dstp = (Uint32 *)dstp128; if (remaining_pixels_batch_counter > 0) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/ @@ -121,14 +112,14 @@ 
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) /*add the alpha channel back*/ mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + /*mm_dst = 0x000000000000000000000000AARRGGBB*/ *dstp = _mm_cvtsi128_si32(mm_dst); /*dstp = 0xAARRGGBB*/ srcp++; dstp++; } srcp += s_row_skip; - srcp64 = (Uint64 *)srcp; + srcp128 = (__m128i*)srcp; } } #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ From fb492df1e114109ffe0201ff793cd1b15c09add6 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 27 Oct 2023 16:40:34 +0100 Subject: [PATCH 03/10] clang format pass --- src_c/simd_transform_avx2.c | 5 ++--- src_c/simd_transform_sse2.c | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 61760f8846..05d6ebd102 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -74,8 +74,8 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; - __m256i mm256_src, mm256_dst, mm256_two_five_fives, - mm256_alpha, mm256_rgb_mask, mm256_alpha_mask; + __m256i mm256_src, mm256_dst, mm256_two_five_fives, mm256_alpha, + mm256_rgb_mask, mm256_alpha_mask; mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); mm256_rgb_mask = _mm256_set1_epi32(rgbmask); @@ -137,4 +137,3 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) } #endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ !defined(SDL_DISABLE_IMMINTRIN_H) */ - diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 070643f66a..e848253ec2 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -63,8 +63,8 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; - __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, - mm_alpha_mask, mm_rgb_mask; + __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask, + mm_rgb_mask; __m128i *srcp128 = (__m128i *)src->pixels; __m128i *dstp128 = (__m128i *)newsurf->pixels; @@ -106,7 +106,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); /*mm_src = 0x00000000000000000000000000RRGGBB*/ - /*invert the colours*/ + /*invert the colours*/ mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); /*add the alpha channel back*/ @@ -119,7 +119,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) dstp++; } srcp += s_row_skip; - srcp128 = (__m128i*)srcp; + srcp128 = (__m128i *)srcp; } } #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ From 6f8628c52d9ed45c6fa8a27a5551e6649bd8026f Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 30 Oct 2023 20:10:29 +0000 Subject: [PATCH 04/10] Replace AVX alpha subtraction with blendv slightly faster --- src_c/simd_transform_avx2.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 05d6ebd102..df504ccc4e 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -67,9 +67,8 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint32 amask = ~rgbmask; + Uint32 amask = + ~(src->format->Rmask | src->format->Gmask | src->format->Bmask); __m256i *srcp256 = (__m256i 
*)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; @@ -78,7 +77,6 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_rgb_mask, mm256_alpha_mask; mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); - mm256_rgb_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); __m256i _partial8_mask = @@ -95,13 +93,12 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) remaining_pixels_batch_counter = remaining_pixels; while (perfect_8_pixels_batch_counter--) { mm256_src = _mm256_loadu_si256(srcp256); - mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); /* do the invert */ mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - - mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); - mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + /* blend the original alpha in */ + mm256_dst = + _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); _mm256_storeu_si256(dstp256, mm256_dst); @@ -112,13 +109,12 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) dstp = (Uint32 *)dstp256; if (remaining_pixels_batch_counter > 0) { mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); - mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); /* do the invert */ mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - - mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); - mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + /* blend the original alpha in */ + mm256_dst = + _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); From 379e4da51c3de6ff12fa6af4532fb7c6339e4e20 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Tue, 31 Oct 2023 19:23:11 +0000 Subject: [PATCH 05/10] Switch to using bitwise logic instructions For better CPI and throughput --- src_c/simd_transform_avx2.c | 33 ++++++++++++++---------- src_c/simd_transform_sse2.c | 50 ++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index df504ccc4e..04f0afe53d 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -67,16 +67,17 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 amask = - ~(src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 amask = ~rgbmask; __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; - __m256i mm256_src, mm256_dst, mm256_two_five_fives, mm256_alpha, - mm256_rgb_mask, mm256_alpha_mask; + __m256i mm256_src, mm256_dst, mm256_rgb_invert_mask, mm256_alpha, + mm256_alpha_mask; - mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); + mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); __m256i _partial8_mask = @@ -94,11 +95,14 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) while (perfect_8_pixels_batch_counter--) { mm256_src = _mm256_loadu_si256(srcp256); + /* pull out the alpha */ + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + /* do the invert */ - mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - /* blend the original alpha in */ - mm256_dst = - _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); + mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask); + + /* put the alpha back in*/ + mm256_dst = 
_mm256_or_si256(mm256_dst, mm256_alpha); _mm256_storeu_si256(dstp256, mm256_dst); @@ -110,11 +114,14 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) if (remaining_pixels_batch_counter > 0) { mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); + /* pull out the alpha */ + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + /* do the invert */ - mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - /* blend the original alpha in */ - mm256_dst = - _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); + mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask); + + /* put the alpha back in*/ + mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index e848253ec2..1dba551a03 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -63,15 +63,13 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; - __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask, - mm_rgb_mask; + __m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask; __m128i *srcp128 = (__m128i *)src->pixels; __m128i *dstp128 = (__m128i *)newsurf->pixels; - mm_rgb_mask = _mm_set1_epi64x(rgbmask64); + mm_rgb_invert_mask = _mm_set1_epi64x(rgbmask64); mm_alpha_mask = _mm_set1_epi64x(amask64); - mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF); while (num_batches--) { perfect_4_pixels_batch_counter = perfect_4_pixels; @@ -79,18 +77,16 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) while (perfect_4_pixels_batch_counter--) { mm_src = _mm_loadu_si128(srcp128); /*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ - /* First we strip out the alpha so we have one of our 4 channels - empty for the rest of the calculation */ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x00RRGGBB00RRGGBB00RRGGBB00RRGGBB*/ - - /*invert the colours*/ - mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); - - /*add the alpha channel back*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ + + /* pull out the alpha */ + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); + + /* do the invert */ + mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask); + + /* put the alpha back in*/ + mm_dst = _mm_or_si128(mm_dst, mm_alpha); + _mm_storeu_si128(dstp128, mm_dst); /*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ srcp128++; @@ -101,18 +97,16 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) if (remaining_pixels_batch_counter > 0) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/ - /* First we strip out the alpha so we have one of our 4 channels - empty for the rest of the calculation */ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x00000000000000000000000000RRGGBB*/ - - /*invert the colours*/ - mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); - - /*add the alpha channel back*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0x000000000000000000000000AARRGGBB*/ + + /* pull out the alpha */ + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); + + /* do the invert */ + mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask); + + /* put the alpha back in*/ + mm_dst = _mm_or_si128(mm_dst, mm_alpha); + *dstp = _mm_cvtsi128_si32(mm_dst); /*dstp 
= 0xAARRGGBB*/ srcp++; From dd0b29f5dd9e076d28faf9556dd2f886f748d9f3 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 6 Nov 2023 21:33:38 +0000 Subject: [PATCH 06/10] Disable SIMD on Emscripten --- src_c/transform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src_c/transform.c b/src_c/transform.c index e199c128b0..ba271962dd 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -3347,7 +3347,9 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) PyExc_ValueError, "Source and destination surfaces need the same format.")); } - +#if defined(__EMSCRIPTEN__) + invert_non_simd(src, newsurf); +#else // !defined(__EMSCRIPTEN__) if (src->format->BytesPerPixel == 4 && src->format->Rmask == newsurf->format->Rmask && src->format->Gmask == newsurf->format->Gmask && @@ -3368,6 +3370,7 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) else { invert_non_simd(src, newsurf); } +#endif // !defined(__EMSCRIPTEN__) SDL_UnlockSurface(newsurf); From cc2de58057d2d9e32e02f0ae1c394fddbc3f18b1 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 11:22:38 +0000 Subject: [PATCH 07/10] Simplify rgbmask Co-authored-by: Alberto <103119829+itzpr3d4t0r@users.noreply.github.com> --- src_c/simd_transform_avx2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 04f0afe53d..2214c09a05 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -67,9 +67,8 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint32 amask = ~rgbmask; + Uint32 amask = src->format->Amask; + Uint32 rgbmask = ~amask; __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; From 402d415f50291ec7a61c8178b4571ee2d0df26dc Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 11:22:58 +0000 Subject: [PATCH 08/10] Simplify rgbmask Co-authored-by: Alberto <103119829+itzpr3d4t0r@users.noreply.github.com> --- src_c/simd_transform_sse2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index deae0d9f93..303c4bf9f1 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -434,8 +434,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 rgbmask = ~src->format->Amask; Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; From 547b70506f56c217087e1ec3868c18832535acda Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 11:25:17 +0000 Subject: [PATCH 09/10] Simplify sub-8 pixel mask --- src_c/simd_transform_avx2.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 2214c09a05..93a9e5d8ad 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -79,14 +79,11 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); - __m256i _partial8_mask = - _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00, - (remaining_pixels > 5) ? 
0x80000000 : 0x00, - (remaining_pixels > 4) ? 0x80000000 : 0x00, - (remaining_pixels > 3) ? 0x80000000 : 0x00, - (remaining_pixels > 2) ? 0x80000000 : 0x00, - (remaining_pixels > 1) ? 0x80000000 : 0x00, - (remaining_pixels > 0) ? 0x80000000 : 0x00); + __m256i _partial8_mask = _mm256_set_epi32( + 0x00, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0, + (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0, + (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0, + (remaining_pixels > 0) ? -1 : 0); while (num_batches--) { perfect_8_pixels_batch_counter = perfect_8_pixels; From 63c6467cc0af397e456e653aad43cd6fd811000e Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 11 Dec 2023 18:52:36 +0000 Subject: [PATCH 10/10] Fix missing pixels for SSE2 algorithm Also, simplify mask code. --- src_c/simd_transform_sse2.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index e7471e3cbe..b204994c1f 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -632,17 +632,13 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = ~src->format->Amask; - Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; - Uint64 amask64 = ~rgbmask64; - __m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask; __m128i *srcp128 = (__m128i *)src->pixels; __m128i *dstp128 = (__m128i *)newsurf->pixels; - mm_rgb_invert_mask = _mm_set1_epi64x(rgbmask64); - mm_alpha_mask = _mm_set1_epi64x(amask64); + mm_rgb_invert_mask = _mm_set1_epi32(~src->format->Amask); + mm_alpha_mask = _mm_set1_epi32(src->format->Amask); while (num_batches--) { perfect_4_pixels_batch_counter = perfect_4_pixels; @@ -667,7 +663,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) } srcp = (Uint32 *)srcp128; dstp = (Uint32 *)dstp128; - if (remaining_pixels_batch_counter > 0) { + while (remaining_pixels_batch_counter--) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/