From c5baae8a24447838268c29d5eb2611e7e3588d56 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 27 Oct 2023 16:02:15 +0100 Subject: [PATCH 01/10] Add SIMD versions of transform.invert() --- src_c/simd_transform.h | 4 ++ src_c/simd_transform_avx2.c | 96 ++++++++++++++++++++++++++++++++++++ src_c/simd_transform_sse2.c | 98 +++++++++++++++++++++++++++++++++++++ src_c/transform.c | 49 ++++++++++++++----- 4 files changed, 235 insertions(+), 12 deletions(-) diff --git a/src_c/simd_transform.h b/src_c/simd_transform.h index d97a0d5b22..ed4da10cc0 100644 --- a/src_c/simd_transform.h +++ b/src_c/simd_transform.h @@ -9,6 +9,10 @@ // SSE2 functions #if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) +void +invert_sse2(SDL_Surface *src, SDL_Surface *newsurf); #endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */ // AVX2 functions +void +invert_avx2(SDL_Surface *src, SDL_Surface *newsurf); diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index f1889f1bb7..61760f8846 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -42,3 +42,99 @@ pg_avx2_at_runtime_but_uncompiled() } return 0; } + +#if defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) +void +invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + int s_row_skip = (src->pitch - src->w * 4) / 4; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + + int remaining_pixels = pixel_batch_length % 8; + int perfect_8_pixels = pixel_batch_length / 8; + + int perfect_8_pixels_batch_counter = perfect_8_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 amask = ~rgbmask; + + __m256i *srcp256 = (__m256i *)src->pixels; + __m256i *dstp256 = (__m256i *)newsurf->pixels; + + __m256i mm256_src, mm256_dst, mm256_two_five_fives, + mm256_alpha, mm256_rgb_mask, mm256_alpha_mask; + + mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); + mm256_rgb_mask = _mm256_set1_epi32(rgbmask); + mm256_alpha_mask = _mm256_set1_epi32(amask); + + __m256i _partial8_mask = + _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00, + (remaining_pixels > 5) ? 0x80000000 : 0x00, + (remaining_pixels > 4) ? 0x80000000 : 0x00, + (remaining_pixels > 3) ? 0x80000000 : 0x00, + (remaining_pixels > 2) ? 0x80000000 : 0x00, + (remaining_pixels > 1) ? 0x80000000 : 0x00, + (remaining_pixels > 0) ? 
0x80000000 : 0x00); + + while (num_batches--) { + perfect_8_pixels_batch_counter = perfect_8_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_8_pixels_batch_counter--) { + mm256_src = _mm256_loadu_si256(srcp256); + mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + + /* do the invert */ + mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); + + mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); + mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + + _mm256_storeu_si256(dstp256, mm256_dst); + + srcp256++; + dstp256++; + } + srcp = (Uint32 *)srcp256; + dstp = (Uint32 *)dstp256; + if (remaining_pixels_batch_counter > 0) { + mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); + mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); + + /* do the invert */ + mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); + + mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); + mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + + _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); + + srcp += remaining_pixels_batch_counter; + dstp += remaining_pixels_batch_counter; + } + srcp += s_row_skip; + srcp256 = (__m256i *)srcp; + } +} +#else +void +invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) +{ + BAD_AVX2_FUNCTION_CALL; +} +#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ + !defined(SDL_DISABLE_IMMINTRIN_H) */ + diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 8f503b964f..f4a893da4f 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -34,3 +34,101 @@ pg_neon_at_runtime_but_uncompiled() } return 0; } + +#if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) + +#if defined(ENV64BIT) +#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) +#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) +#else +#define LOAD_64_INTO_M128(num, reg) \ + *reg = _mm_loadl_epi64((const __m128i *)num) +#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) +#endif + +void +invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) +{ + int s_row_skip = (src->pitch - src->w * 4) / 4; + + // generate number of batches of pixels we need to loop through + int pixel_batch_length = src->w * src->h; + int num_batches = 1; + if (s_row_skip > 0) { + pixel_batch_length = src->w; + num_batches = src->h; + } + int remaining_pixels = pixel_batch_length % 2; + int perfect_2_pixels = pixel_batch_length / 2; + + int perfect_2_pixels_batch_counter = perfect_2_pixels; + int remaining_pixels_batch_counter = remaining_pixels; + + Uint32 *srcp = (Uint32 *)src->pixels; + Uint32 *dstp = (Uint32 *)newsurf->pixels; + + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; + Uint64 amask64 = ~rgbmask64; + + Uint64 *srcp64 = (Uint64 *)src->pixels; + Uint64 *dstp64 = (Uint64 *)newsurf->pixels; + + __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, + mm_alpha_mask, mm_rgb_mask; + + LOAD_64_INTO_M128(&amask64, &mm_alpha_mask); + LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask); + mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF); + + while (num_batches--) { + perfect_2_pixels_batch_counter = perfect_2_pixels; + remaining_pixels_batch_counter = remaining_pixels; + while (perfect_2_pixels_batch_counter--) { + LOAD_64_INTO_M128(srcp64, &mm_src); + /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 
channels + empty for the rest of the calculation */ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ + + /*invert the colours*/ + mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); + + /*add the alpha channel back*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ + STORE_M128_INTO_64(&mm_dst, dstp64); + /*dstp = 0xAARRGGBB*/ + srcp64++; + dstp64++; + } + srcp = (Uint32 *)srcp64; + dstp = (Uint32 *)dstp64; + if (remaining_pixels_batch_counter > 0) { + mm_src = _mm_cvtsi32_si128(*srcp); + /*mm_src = 0x000000000000000000000000AARRGGBB*/ + /* First we strip out the alpha so we have one of our 4 channels + empty for the rest of the calculation */ + mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); + /*mm_src = 0x00000000000000000000000000RRGGBB*/ + + /*invert the colours*/ + mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); + + /*add the alpha channel back*/ + mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); + mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); + /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + *dstp = _mm_cvtsi128_si32(mm_dst); + /*dstp = 0xAARRGGBB*/ + srcp++; + dstp++; + } + srcp += s_row_skip; + srcp64 = (Uint64 *)srcp; + } +} +#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ diff --git a/src_c/transform.c b/src_c/transform.c index 5a5d10faa6..a867b5e563 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -3279,6 +3279,25 @@ surf_gaussian_blur(PyObject *self, PyObject *args, PyObject *kwargs) return (PyObject *)pgSurface_New(new_surf); } +void +invert_non_simd(SDL_Surface *src, SDL_Surface *newsurf) +{ + int x, y; + for (y = 0; y < src->h; y++) { + for (x = 0; x < src->w; x++) { + Uint32 pixel; + Uint8 *pix; + SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, + pix); + unsigned char r, g, b, a; + SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); + Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a); + SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, + newsurf->format, pix); + } + } +} + SDL_Surface * invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) { @@ -3306,19 +3325,25 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) "Source and destination surfaces need the same format.")); } - int x, y; - for (y = 0; y < src->h; y++) { - for (x = 0; x < src->w; x++) { - Uint32 pixel; - Uint8 *pix; - SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format, - pix); - unsigned char r, g, b, a; - SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a); - Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a); - SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels, - newsurf->format, pix); + if (src->format->BytesPerPixel == 4 && + src->format->Rmask == newsurf->format->Rmask && + src->format->Gmask == newsurf->format->Gmask && + src->format->Bmask == newsurf->format->Bmask && + (src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) { + if (pg_has_avx2()) { + invert_avx2(src, newsurf); + } +#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else if (pg_HasSSE_NEON()) { + invert_sse2(src, newsurf); } +#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON) + else { + invert_non_simd(src, newsurf); + } + } + else { + invert_non_simd(src, newsurf); } SDL_UnlockSurface(newsurf); From 5e804cdb621829fa30b05ae2553a73d40adff1d1 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 27 Oct 2023 16:33:47 +0100 Subject: [PATCH 02/10] switch SSE2 to use four pixels 
at a time. --- src_c/simd_transform_sse2.c | 53 +++++++++++++++---------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index f4a893da4f..070643f66a 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -37,15 +37,6 @@ pg_neon_at_runtime_but_uncompiled() #if (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) -#if defined(ENV64BIT) -#define LOAD_64_INTO_M128(num, reg) *reg = _mm_cvtsi64_si128(*num) -#define STORE_M128_INTO_64(reg, num) *num = _mm_cvtsi128_si64(*reg) -#else -#define LOAD_64_INTO_M128(num, reg) \ - *reg = _mm_loadl_epi64((const __m128i *)num) -#define STORE_M128_INTO_64(reg, num) _mm_storel_epi64((__m128i *)num, *reg) -#endif - void invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) { @@ -58,10 +49,10 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) pixel_batch_length = src->w; num_batches = src->h; } - int remaining_pixels = pixel_batch_length % 2; - int perfect_2_pixels = pixel_batch_length / 2; + int remaining_pixels = pixel_batch_length % 4; + int perfect_4_pixels = pixel_batch_length / 4; - int perfect_2_pixels_batch_counter = perfect_2_pixels; + int perfect_4_pixels_batch_counter = perfect_4_pixels; int remaining_pixels_batch_counter = remaining_pixels; Uint32 *srcp = (Uint32 *)src->pixels; @@ -72,26 +63,26 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; - Uint64 *srcp64 = (Uint64 *)src->pixels; - Uint64 *dstp64 = (Uint64 *)newsurf->pixels; - __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask, mm_rgb_mask; - LOAD_64_INTO_M128(&amask64, &mm_alpha_mask); - LOAD_64_INTO_M128(&rgbmask64, &mm_rgb_mask); + __m128i *srcp128 = (__m128i *)src->pixels; + __m128i *dstp128 = (__m128i *)newsurf->pixels; + + mm_rgb_mask = _mm_set1_epi64x(rgbmask64); + mm_alpha_mask = _mm_set1_epi64x(amask64); mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF); while (num_batches--) { - perfect_2_pixels_batch_counter = perfect_2_pixels; + perfect_4_pixels_batch_counter = perfect_4_pixels; remaining_pixels_batch_counter = remaining_pixels; - while (perfect_2_pixels_batch_counter--) { - LOAD_64_INTO_M128(srcp64, &mm_src); - /*mm_src = 0x0000000000000000AARRGGBBAARRGGBB*/ + while (perfect_4_pixels_batch_counter--) { + mm_src = _mm_loadu_si128(srcp128); + /*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ /* First we strip out the alpha so we have one of our 4 channels empty for the rest of the calculation */ mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x000000000000000000RRGGBB00RRGGBB*/ + /*mm_src = 0x00RRGGBB00RRGGBB00RRGGBB00RRGGBB*/ /*invert the colours*/ mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); @@ -99,14 +90,14 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) /*add the alpha channel back*/ mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0x0000000000000000AAGrGrGrGrGrGrAAGrGrGrGrGrGr*/ - STORE_M128_INTO_64(&mm_dst, dstp64); - /*dstp = 0xAARRGGBB*/ - srcp64++; - dstp64++; + /*mm_dst = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ + _mm_storeu_si128(dstp128, mm_dst); + /*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ + srcp128++; + dstp128++; } - srcp = (Uint32 *)srcp64; - dstp = (Uint32 *)dstp64; + srcp = (Uint32 *)srcp128; + dstp = (Uint32 *)dstp128; if (remaining_pixels_batch_counter > 0) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/ @@ -121,14 +112,14 @@ 
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) /*add the alpha channel back*/ mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0x000000000000000000000000AAGrGrGrGrGrGr*/ + /*mm_dst = 0x000000000000000000000000AARRGGBB*/ *dstp = _mm_cvtsi128_si32(mm_dst); /*dstp = 0xAARRGGBB*/ srcp++; dstp++; } srcp += s_row_skip; - srcp64 = (Uint64 *)srcp; + srcp128 = (__m128i*)srcp; } } #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ From fb492df1e114109ffe0201ff793cd1b15c09add6 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Fri, 27 Oct 2023 16:40:34 +0100 Subject: [PATCH 03/10] clang format pass --- src_c/simd_transform_avx2.c | 5 ++--- src_c/simd_transform_sse2.c | 8 ++++---- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 61760f8846..05d6ebd102 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -74,8 +74,8 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; - __m256i mm256_src, mm256_dst, mm256_two_five_fives, - mm256_alpha, mm256_rgb_mask, mm256_alpha_mask; + __m256i mm256_src, mm256_dst, mm256_two_five_fives, mm256_alpha, + mm256_rgb_mask, mm256_alpha_mask; mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); mm256_rgb_mask = _mm256_set1_epi32(rgbmask); @@ -137,4 +137,3 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) } #endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \ !defined(SDL_DISABLE_IMMINTRIN_H) */ - diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index 070643f66a..e848253ec2 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -63,8 +63,8 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; - __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, - mm_alpha_mask, mm_rgb_mask; + __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask, + mm_rgb_mask; __m128i *srcp128 = (__m128i *)src->pixels; __m128i *dstp128 = (__m128i *)newsurf->pixels; @@ -106,7 +106,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); /*mm_src = 0x00000000000000000000000000RRGGBB*/ - /*invert the colours*/ + /*invert the colours*/ mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); /*add the alpha channel back*/ @@ -119,7 +119,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) dstp++; } srcp += s_row_skip; - srcp128 = (__m128i*)srcp; + srcp128 = (__m128i *)srcp; } } #endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/ From 6f8628c52d9ed45c6fa8a27a5551e6649bd8026f Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 30 Oct 2023 20:10:29 +0000 Subject: [PATCH 04/10] Replace AVX alpha subtraction with blendv slightly faster --- src_c/simd_transform_avx2.c | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 05d6ebd102..df504ccc4e 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -67,9 +67,8 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint32 amask = ~rgbmask; + Uint32 amask = + ~(src->format->Rmask | src->format->Gmask | src->format->Bmask); __m256i *srcp256 = (__m256i 
*)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; @@ -78,7 +77,6 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_rgb_mask, mm256_alpha_mask; mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); - mm256_rgb_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); __m256i _partial8_mask = @@ -95,13 +93,12 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) remaining_pixels_batch_counter = remaining_pixels; while (perfect_8_pixels_batch_counter--) { mm256_src = _mm256_loadu_si256(srcp256); - mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); /* do the invert */ mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - - mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); - mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + /* blend the original alpha in */ + mm256_dst = + _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); _mm256_storeu_si256(dstp256, mm256_dst); @@ -112,13 +109,12 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) dstp = (Uint32 *)dstp256; if (remaining_pixels_batch_counter > 0) { mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); - mm256_alpha = _mm256_subs_epu8(mm256_src, mm256_rgb_mask); /* do the invert */ mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - - mm256_dst = _mm256_subs_epu8(mm256_dst, mm256_alpha_mask); - mm256_dst = _mm256_adds_epu8(mm256_dst, mm256_alpha); + /* blend the original alpha in */ + mm256_dst = + _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); From 379e4da51c3de6ff12fa6af4532fb7c6339e4e20 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Tue, 31 Oct 2023 19:23:11 +0000 Subject: [PATCH 05/10] Switch to using bitwise logic instructions For better CPI and throughput --- src_c/simd_transform_avx2.c | 33 ++++++++++++++---------- src_c/simd_transform_sse2.c | 50 ++++++++++++++++--------------------- 2 files changed, 42 insertions(+), 41 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index df504ccc4e..04f0afe53d 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -67,16 +67,17 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 amask = - ~(src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 rgbmask = + (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 amask = ~rgbmask; __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; - __m256i mm256_src, mm256_dst, mm256_two_five_fives, mm256_alpha, - mm256_rgb_mask, mm256_alpha_mask; + __m256i mm256_src, mm256_dst, mm256_rgb_invert_mask, mm256_alpha, + mm256_alpha_mask; - mm256_two_five_fives = _mm256_set1_epi16(0xFFFF); + mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); __m256i _partial8_mask = @@ -94,11 +95,14 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) while (perfect_8_pixels_batch_counter--) { mm256_src = _mm256_loadu_si256(srcp256); + /* pull out the alpha */ + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + /* do the invert */ - mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - /* blend the original alpha in */ - mm256_dst = - _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); + mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask); + + /* put the alpha back in*/ + mm256_dst = 
_mm256_or_si256(mm256_dst, mm256_alpha); _mm256_storeu_si256(dstp256, mm256_dst); @@ -110,11 +114,14 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) if (remaining_pixels_batch_counter > 0) { mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask); + /* pull out the alpha */ + mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask); + /* do the invert */ - mm256_dst = _mm256_subs_epu8(mm256_two_five_fives, mm256_src); - /* blend the original alpha in */ - mm256_dst = - _mm256_blendv_epi8(mm256_dst, mm256_src, mm256_alpha_mask); + mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask); + + /* put the alpha back in*/ + mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha); _mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst); diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index e848253ec2..1dba551a03 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -63,15 +63,13 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; - __m128i mm_src, mm_dst, mm_alpha, mm_two_five_fives, mm_alpha_mask, - mm_rgb_mask; + __m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask; __m128i *srcp128 = (__m128i *)src->pixels; __m128i *dstp128 = (__m128i *)newsurf->pixels; - mm_rgb_mask = _mm_set1_epi64x(rgbmask64); + mm_rgb_invert_mask = _mm_set1_epi64x(rgbmask64); mm_alpha_mask = _mm_set1_epi64x(amask64); - mm_two_five_fives = _mm_set1_epi64x(0xFFFFFFFFFFFFFFFF); while (num_batches--) { perfect_4_pixels_batch_counter = perfect_4_pixels; @@ -79,18 +77,16 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) while (perfect_4_pixels_batch_counter--) { mm_src = _mm_loadu_si128(srcp128); /*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ - /* First we strip out the alpha so we have one of our 4 channels - empty for the rest of the calculation */ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x00RRGGBB00RRGGBB00RRGGBB00RRGGBB*/ - - /*invert the colours*/ - mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); - - /*add the alpha channel back*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ + + /* pull out the alpha */ + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); + + /* do the invert */ + mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask); + + /* put the alpha back in*/ + mm_dst = _mm_or_si128(mm_dst, mm_alpha); + _mm_storeu_si128(dstp128, mm_dst); /*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/ srcp128++; @@ -101,18 +97,16 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) if (remaining_pixels_batch_counter > 0) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/ - /* First we strip out the alpha so we have one of our 4 channels - empty for the rest of the calculation */ - mm_alpha = _mm_subs_epu8(mm_src, mm_rgb_mask); - /*mm_src = 0x00000000000000000000000000RRGGBB*/ - - /*invert the colours*/ - mm_dst = _mm_subs_epu8(mm_two_five_fives, mm_src); - - /*add the alpha channel back*/ - mm_dst = _mm_subs_epu8(mm_dst, mm_alpha_mask); - mm_dst = _mm_adds_epu8(mm_dst, mm_alpha); - /*mm_dst = 0x000000000000000000000000AARRGGBB*/ + + /* pull out the alpha */ + mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask); + + /* do the invert */ + mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask); + + /* put the alpha back in*/ + mm_dst = _mm_or_si128(mm_dst, mm_alpha); + *dstp = _mm_cvtsi128_si32(mm_dst); /*dstp 
= 0xAARRGGBB*/ srcp++; From dd0b29f5dd9e076d28faf9556dd2f886f748d9f3 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 6 Nov 2023 21:33:38 +0000 Subject: [PATCH 06/10] Disable SIMD on Emscripten --- src_c/transform.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src_c/transform.c b/src_c/transform.c index e199c128b0..ba271962dd 100644 --- a/src_c/transform.c +++ b/src_c/transform.c @@ -3347,7 +3347,9 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) PyExc_ValueError, "Source and destination surfaces need the same format.")); } - +#if defined(__EMSCRIPTEN__) + invert_non_simd(src, newsurf); +#else // !defined(__EMSCRIPTEN__) if (src->format->BytesPerPixel == 4 && src->format->Rmask == newsurf->format->Rmask && src->format->Gmask == newsurf->format->Gmask && @@ -3368,6 +3370,7 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj) else { invert_non_simd(src, newsurf); } +#endif // !defined(__EMSCRIPTEN__) SDL_UnlockSurface(newsurf); From cc2de58057d2d9e32e02f0ae1c394fddbc3f18b1 Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 11:22:38 +0000 Subject: [PATCH 07/10] Simplify rgbmask Co-authored-by: Alberto <103119829+itzpr3d4t0r@users.noreply.github.com> --- src_c/simd_transform_avx2.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 04f0afe53d..2214c09a05 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -67,9 +67,8 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); - Uint32 amask = ~rgbmask; + Uint32 amask = src->format->Amask; + Uint32 rgbmask = ~amask; __m256i *srcp256 = (__m256i *)src->pixels; __m256i *dstp256 = (__m256i *)newsurf->pixels; From 402d415f50291ec7a61c8178b4571ee2d0df26dc Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 11:22:58 +0000 Subject: [PATCH 08/10] Simplify rgbmask Co-authored-by: Alberto <103119829+itzpr3d4t0r@users.noreply.github.com> --- src_c/simd_transform_sse2.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index deae0d9f93..303c4bf9f1 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -434,8 +434,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = - (src->format->Rmask | src->format->Gmask | src->format->Bmask); + Uint32 rgbmask = ~src->format->Amask; Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; Uint64 amask64 = ~rgbmask64; From 547b70506f56c217087e1ec3868c18832535acda Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Sun, 12 Nov 2023 11:25:17 +0000 Subject: [PATCH 09/10] Simplify sub-8 pixel mask --- src_c/simd_transform_avx2.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src_c/simd_transform_avx2.c b/src_c/simd_transform_avx2.c index 2214c09a05..93a9e5d8ad 100644 --- a/src_c/simd_transform_avx2.c +++ b/src_c/simd_transform_avx2.c @@ -79,14 +79,11 @@ invert_avx2(SDL_Surface *src, SDL_Surface *newsurf) mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask); mm256_alpha_mask = _mm256_set1_epi32(amask); - __m256i _partial8_mask = - _mm256_set_epi32(0x00, (remaining_pixels > 6) ? 0x80000000 : 0x00, - (remaining_pixels > 5) ? 
0x80000000 : 0x00, - (remaining_pixels > 4) ? 0x80000000 : 0x00, - (remaining_pixels > 3) ? 0x80000000 : 0x00, - (remaining_pixels > 2) ? 0x80000000 : 0x00, - (remaining_pixels > 1) ? 0x80000000 : 0x00, - (remaining_pixels > 0) ? 0x80000000 : 0x00); + __m256i _partial8_mask = _mm256_set_epi32( + 0x00, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0, + (remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0, + (remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0, + (remaining_pixels > 0) ? -1 : 0); while (num_batches--) { perfect_8_pixels_batch_counter = perfect_8_pixels; From 63c6467cc0af397e456e653aad43cd6fd811000e Mon Sep 17 00:00:00 2001 From: Dan Lawrence Date: Mon, 11 Dec 2023 18:52:36 +0000 Subject: [PATCH 10/10] Fix missing pixels for SSE2 algorithm Also, simplify mask code. --- src_c/simd_transform_sse2.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src_c/simd_transform_sse2.c b/src_c/simd_transform_sse2.c index e7471e3cbe..b204994c1f 100644 --- a/src_c/simd_transform_sse2.c +++ b/src_c/simd_transform_sse2.c @@ -632,17 +632,13 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) Uint32 *srcp = (Uint32 *)src->pixels; Uint32 *dstp = (Uint32 *)newsurf->pixels; - Uint32 rgbmask = ~src->format->Amask; - Uint64 rgbmask64 = ((Uint64)rgbmask << 32) | rgbmask; - Uint64 amask64 = ~rgbmask64; - __m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask; __m128i *srcp128 = (__m128i *)src->pixels; __m128i *dstp128 = (__m128i *)newsurf->pixels; - mm_rgb_invert_mask = _mm_set1_epi64x(rgbmask64); - mm_alpha_mask = _mm_set1_epi64x(amask64); + mm_rgb_invert_mask = _mm_set1_epi32(~src->format->Amask); + mm_alpha_mask = _mm_set1_epi32(src->format->Amask); while (num_batches--) { perfect_4_pixels_batch_counter = perfect_4_pixels; @@ -667,7 +663,7 @@ invert_sse2(SDL_Surface *src, SDL_Surface *newsurf) } srcp = (Uint32 *)srcp128; dstp = (Uint32 *)dstp128; - if (remaining_pixels_batch_counter > 0) { + while (remaining_pixels_batch_counter--) { mm_src = _mm_cvtsi32_si128(*srcp); /*mm_src = 0x000000000000000000000000AARRGGBB*/