Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SIMD versions of the invert transform #2534

Merged
merged 13 commits into from
Apr 12, 2024
4 changes: 4 additions & 0 deletions src_c/simd_transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,13 @@ filter_expand_X_SSE2(Uint8 *srcpix, Uint8 *dstpix, int height, int srcpitch,
void
filter_expand_Y_SSE2(Uint8 *srcpix, Uint8 *dstpix, int width, int srcpitch,
int dstpitch, int srcheight, int dstheight);
void
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf);

#endif /* (defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)) */

// AVX2 functions
void
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf);
void
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf);
89 changes: 89 additions & 0 deletions src_c/simd_transform_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -214,11 +214,100 @@ grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
srcp256 = (__m256i *)srcp;
}
}

void
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
int s_row_skip = (src->pitch - src->w * 4) / 4;

// generate number of batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}

int remaining_pixels = pixel_batch_length % 8;
int perfect_8_pixels = pixel_batch_length / 8;

int perfect_8_pixels_batch_counter = perfect_8_pixels;
int remaining_pixels_batch_counter = remaining_pixels;

Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;

Uint32 amask = src->format->Amask;
Uint32 rgbmask = ~amask;

__m256i *srcp256 = (__m256i *)src->pixels;
__m256i *dstp256 = (__m256i *)newsurf->pixels;

__m256i mm256_src, mm256_dst, mm256_rgb_invert_mask, mm256_alpha,
mm256_alpha_mask;

mm256_rgb_invert_mask = _mm256_set1_epi32(rgbmask);
mm256_alpha_mask = _mm256_set1_epi32(amask);

__m256i _partial8_mask = _mm256_set_epi32(
0x00, (remaining_pixels > 6) ? -1 : 0, (remaining_pixels > 5) ? -1 : 0,
(remaining_pixels > 4) ? -1 : 0, (remaining_pixels > 3) ? -1 : 0,
(remaining_pixels > 2) ? -1 : 0, (remaining_pixels > 1) ? -1 : 0,
(remaining_pixels > 0) ? -1 : 0);

while (num_batches--) {
perfect_8_pixels_batch_counter = perfect_8_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_8_pixels_batch_counter--) {
mm256_src = _mm256_loadu_si256(srcp256);

/* pull out the alpha */
mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);

/* do the invert */
mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask);

/* put the alpha back in*/
mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);

_mm256_storeu_si256(dstp256, mm256_dst);

srcp256++;
dstp256++;
}
srcp = (Uint32 *)srcp256;
dstp = (Uint32 *)dstp256;
if (remaining_pixels_batch_counter > 0) {
mm256_src = _mm256_maskload_epi32((int *)srcp, _partial8_mask);

/* pull out the alpha */
mm256_alpha = _mm256_and_si256(mm256_src, mm256_alpha_mask);

/* do the invert */
mm256_dst = _mm256_andnot_si256(mm256_src, mm256_rgb_invert_mask);

/* put the alpha back in*/
mm256_dst = _mm256_or_si256(mm256_dst, mm256_alpha);

_mm256_maskstore_epi32((int *)dstp, _partial8_mask, mm256_dst);

srcp += remaining_pixels_batch_counter;
dstp += remaining_pixels_batch_counter;
}
srcp += s_row_skip;
srcp256 = (__m256i *)srcp;
}
}
#else
void
grayscale_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
BAD_AVX2_FUNCTION_CALL;
}
void
invert_avx2(SDL_Surface *src, SDL_Surface *newsurf)
{
BAD_AVX2_FUNCTION_CALL;
}
#endif /* defined(__AVX2__) && defined(HAVE_IMMINTRIN_H) && \
!defined(SDL_DISABLE_IMMINTRIN_H) */
76 changes: 76 additions & 0 deletions src_c/simd_transform_sse2.c
Original file line number Diff line number Diff line change
Expand Up @@ -610,4 +610,80 @@ grayscale_sse2(SDL_Surface *src, SDL_Surface *newsurf)
srcp64 = (Uint64 *)srcp;
}
}

void
invert_sse2(SDL_Surface *src, SDL_Surface *newsurf)
{
int s_row_skip = (src->pitch - src->w * 4) / 4;

// generate number of batches of pixels we need to loop through
int pixel_batch_length = src->w * src->h;
int num_batches = 1;
if (s_row_skip > 0) {
pixel_batch_length = src->w;
num_batches = src->h;
}
int remaining_pixels = pixel_batch_length % 4;
int perfect_4_pixels = pixel_batch_length / 4;

int perfect_4_pixels_batch_counter = perfect_4_pixels;
int remaining_pixels_batch_counter = remaining_pixels;

Uint32 *srcp = (Uint32 *)src->pixels;
Uint32 *dstp = (Uint32 *)newsurf->pixels;

__m128i mm_src, mm_dst, mm_alpha, mm_rgb_invert_mask, mm_alpha_mask;

__m128i *srcp128 = (__m128i *)src->pixels;
__m128i *dstp128 = (__m128i *)newsurf->pixels;

mm_rgb_invert_mask = _mm_set1_epi32(~src->format->Amask);
mm_alpha_mask = _mm_set1_epi32(src->format->Amask);

while (num_batches--) {
perfect_4_pixels_batch_counter = perfect_4_pixels;
remaining_pixels_batch_counter = remaining_pixels;
while (perfect_4_pixels_batch_counter--) {
mm_src = _mm_loadu_si128(srcp128);
/*mm_src = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/

/* pull out the alpha */
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);

/* do the invert */
mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);

/* put the alpha back in*/
mm_dst = _mm_or_si128(mm_dst, mm_alpha);

_mm_storeu_si128(dstp128, mm_dst);
/*dstp = 0xAARRGGBBAARRGGBBAARRGGBBAARRGGBB*/
srcp128++;
dstp128++;
}
srcp = (Uint32 *)srcp128;
dstp = (Uint32 *)dstp128;
while (remaining_pixels_batch_counter--) {
mm_src = _mm_cvtsi32_si128(*srcp);
/*mm_src = 0x000000000000000000000000AARRGGBB*/

/* pull out the alpha */
mm_alpha = _mm_and_si128(mm_src, mm_alpha_mask);

/* do the invert */
mm_dst = _mm_andnot_si128(mm_src, mm_rgb_invert_mask);

/* put the alpha back in*/
mm_dst = _mm_or_si128(mm_dst, mm_alpha);

*dstp = _mm_cvtsi128_si32(mm_dst);
/*dstp = 0xAARRGGBB*/
srcp++;
dstp++;
}
srcp += s_row_skip;
srcp128 = (__m128i *)srcp;
}
}

#endif /* __SSE2__ || PG_ENABLE_ARM_NEON*/
54 changes: 41 additions & 13 deletions src_c/transform.c
Original file line number Diff line number Diff line change
Expand Up @@ -3373,6 +3373,25 @@ surf_gaussian_blur(PyObject *self, PyObject *args, PyObject *kwargs)
return (PyObject *)pgSurface_New(new_surf);
}

void
invert_non_simd(SDL_Surface *src, SDL_Surface *newsurf)
{
int x, y;
for (y = 0; y < src->h; y++) {
for (x = 0; x < src->w; x++) {
Uint32 pixel;
Uint8 *pix;
SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
pix);
unsigned char r, g, b, a;
SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
newsurf->format, pix);
}
}
}

SDL_Surface *
invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
{
Expand All @@ -3399,21 +3418,30 @@ invert(pgSurfaceObject *srcobj, pgSurfaceObject *dstobj)
PyExc_ValueError,
"Source and destination surfaces need the same format."));
}

int x, y;
for (y = 0; y < src->h; y++) {
for (x = 0; x < src->w; x++) {
Uint32 pixel;
Uint8 *pix;
SURF_GET_AT(pixel, src, x, y, (Uint8 *)src->pixels, src->format,
pix);
unsigned char r, g, b, a;
SDL_GetRGBA(pixel, src->format, &r, &g, &b, &a);
Uint32 new_pixel = SDL_MapRGBA(newsurf->format, ~r, ~g, ~b, a);
SURF_SET_AT(new_pixel, newsurf, x, y, (Uint8 *)newsurf->pixels,
newsurf->format, pix);
#if defined(__EMSCRIPTEN__)
invert_non_simd(src, newsurf);
#else // !defined(__EMSCRIPTEN__)
if (src->format->BytesPerPixel == 4 &&
MyreMylar marked this conversation as resolved.
Show resolved Hide resolved
src->format->Rmask == newsurf->format->Rmask &&
src->format->Gmask == newsurf->format->Gmask &&
src->format->Bmask == newsurf->format->Bmask &&
(src->pitch % 4 == 0) && (newsurf->pitch == (newsurf->w * 4))) {
if (pg_has_avx2()) {
invert_avx2(src, newsurf);
}
#if defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
else if (pg_HasSSE_NEON()) {
invert_sse2(src, newsurf);
}
#endif // defined(__SSE2__) || defined(PG_ENABLE_ARM_NEON)
else {
invert_non_simd(src, newsurf);
}
}
else {
invert_non_simd(src, newsurf);
}
#endif // !defined(__EMSCRIPTEN__)

SDL_UnlockSurface(newsurf);

Expand Down
Loading