From ef498f2baffba8a44227526100f0e90af3e0e869 Mon Sep 17 00:00:00 2001 From: Gleb Mazovetskiy Date: Tue, 20 Aug 2024 01:20:49 +0100 Subject: [PATCH] Speed up `world_draw_black_tile` Previously, there were 2 loops to render all the lines of a diamond. This changes it to call `RenderLeftTriangle` + `RenderRightTriangle`. That's 4 loops + some extra clipping calculations. Yet, this version is ~28% faster. I don't understand why it's faster at all. Perhaps it's only faster on some hardware? Modern computers are magic. Anyway, here are the (very surprising) results on my Ryzen 3950x: ``` BASELINE=master BENCHMARK=dun_render_benchmark git checkout "$BASELINE" tools/build_and_run_benchmark.py -B "build-reld-${BASELINE}" --no-run "$BENCHMARK" git checkout - tools/build_and_run_benchmark.py --no-run "$BENCHMARK" tools/linux_reduced_cpu_variance_run.sh ~/google-benchmark/tools/compare.py -a benchmarks \ "build-reld-${BASELINE}/${BENCHMARK}" "build-reld/${BENCHMARK}" \ --benchmark_filter='.*BlackTile.*' --benchmark_repetitions=10 ``` ``` Comparing build-reld-master/dun_render_benchmark to build-reld/dun_render_benchmark Benchmark Time CPU Time Old Time New CPU Old CPU New ------------------------------------------------------------------------------------------------------------------------ BM_RenderBlackTile_pvalue 0.0002 0.0002 U Test, Repetitions: 10 vs 10 BM_RenderBlackTile_mean -0.2786 -0.2786 156 113 156 113 BM_RenderBlackTile_median -0.2808 -0.2808 157 113 157 113 BM_RenderBlackTile_stddev -0.3148 -0.3138 1 1 1 1 BM_RenderBlackTile_cv -0.0503 -0.0488 0 0 0 0 OVERALL_GEOMEAN -0.2785 -0.2786 0 0 0 0 ``` --- Source/engine/render/dun_render.cpp | 124 ++++------------------------ 1 file changed, 17 insertions(+), 107 deletions(-) diff --git a/Source/engine/render/dun_render.cpp b/Source/engine/render/dun_render.cpp index acac9ce7631..c1f4d152b90 100644 --- a/Source/engine/render/dun_render.cpp +++ b/Source/engine/render/dun_render.cpp @@ -1050,96 +1050,6 @@ DVL_ALWAYS_INLINE 
DVL_ATTRIBUTE_HOT void RenderTileDispatch(TileType tile, uint8 } } -// Blit with left and vertical clipping. -void RenderBlackTileClipLeftAndVertical(uint8_t *DVL_RESTRICT dst, uint16_t dstPitch, int sx, DiamondClipY clipY) -{ - dst += XStep * (LowerHeight - clipY.lowerBottom - 1); - // Lower triangle (drawn bottom to top): - const auto lowerMax = LowerHeight - clipY.lowerTop; - for (auto i = clipY.lowerBottom + 1; i <= lowerMax; ++i, dst -= dstPitch + XStep) { - const auto w = 2 * XStep * i; - const auto curX = sx + TILE_WIDTH / 2 - XStep * i; - if (curX >= 0) { - memset(dst, 0, w); - } else if (-curX <= w) { - memset(dst - curX, 0, w + curX); - } - } - dst += 2 * XStep + XStep * clipY.upperBottom; - // Upper triangle (drawn bottom to top): - const auto upperMax = TriangleUpperHeight - clipY.upperTop; - for (auto i = clipY.upperBottom; i < upperMax; ++i, dst -= dstPitch - XStep) { - const auto w = 2 * XStep * (TriangleUpperHeight - i); - const auto curX = sx + TILE_WIDTH / 2 - XStep * (TriangleUpperHeight - i); - if (curX >= 0) { - memset(dst, 0, w); - } else if (-curX <= w) { - memset(dst - curX, 0, w + curX); - } else { - break; - } - } -} - -// Blit with right and vertical clipping. -void RenderBlackTileClipRightAndVertical(uint8_t *DVL_RESTRICT dst, uint16_t dstPitch, int_fast16_t maxWidth, DiamondClipY clipY) -{ - dst += XStep * (LowerHeight - clipY.lowerBottom - 1); - // Lower triangle (drawn bottom to top): - const auto lowerMax = LowerHeight - clipY.lowerTop; - for (auto i = clipY.lowerBottom + 1; i <= lowerMax; ++i, dst -= dstPitch + XStep) { - const auto width = 2 * XStep * i; - const auto endX = TILE_WIDTH / 2 + XStep * i; - const auto skip = endX > maxWidth ? 
endX - maxWidth : 0; - if (width > skip) - memset(dst, 0, width - skip); - } - dst += 2 * XStep + XStep * clipY.upperBottom; - // Upper triangle (drawn bottom to top): - const auto upperMax = TriangleUpperHeight - clipY.upperTop; - for (auto i = 1 + clipY.upperBottom; i <= upperMax; ++i, dst -= dstPitch - XStep) { - const auto width = TILE_WIDTH - 2 * XStep * i; - const auto endX = TILE_WIDTH / 2 + XStep * (TriangleUpperHeight - i + 1); - const auto skip = endX > maxWidth ? endX - maxWidth : 0; - if (width <= skip) - break; - memset(dst, 0, width - skip); - } -} - -// Blit with vertical clipping only. -void RenderBlackTileClipY(uint8_t *DVL_RESTRICT dst, uint16_t dstPitch, DiamondClipY clipY) -{ - dst += XStep * (LowerHeight - clipY.lowerBottom - 1); - // Lower triangle (drawn bottom to top): - const auto lowerMax = LowerHeight - clipY.lowerTop; - for (auto i = 1 + clipY.lowerBottom; i <= lowerMax; ++i, dst -= dstPitch + XStep) { - memset(dst, 0, 2 * XStep * i); - } - dst += 2 * XStep + XStep * clipY.upperBottom; - // Upper triangle (drawn bottom to top): - const auto upperMax = TriangleUpperHeight - clipY.upperTop; - for (auto i = 1 + clipY.upperBottom; i <= upperMax; ++i, dst -= dstPitch - XStep) { - memset(dst, 0, TILE_WIDTH - 2 * XStep * i); - } -} - -// Blit a black tile without clipping (must be fully in bounds). -void RenderBlackTileFull(uint8_t *DVL_RESTRICT dst, uint16_t dstPitch) -{ - dst += XStep * (LowerHeight - 1); - // Tile is fully in bounds, can use constant loop boundaries. 
- // Lower triangle (drawn bottom to top): - for (unsigned i = 1; i <= LowerHeight; ++i, dst -= dstPitch + XStep) { - memset(dst, 0, 2 * XStep * i); - } - dst += 2 * XStep; - // Upper triangle (drawn bottom to to top): - for (unsigned i = 1; i <= TriangleUpperHeight; ++i, dst -= dstPitch - XStep) { - memset(dst, 0, TILE_WIDTH - 2 * XStep * i); - } -} - } // namespace #ifdef DUN_RENDER_STATS @@ -1239,24 +1149,24 @@ void world_draw_black_tile(const Surface &out, int sx, int sy) #ifdef DEBUG_RENDER_OFFSET_Y sy += DEBUG_RENDER_OFFSET_Y; #endif - auto clip = CalculateClip(sx, sy, TILE_WIDTH, TriangleHeight, out); - if (clip.width <= 0 || clip.height <= 0) - return; + const Clip clipLeft = CalculateClip(sx, sy, Width, TriangleHeight, out); + if (clipLeft.height <= 0) return; + Clip clipRight; + clipRight.top = clipLeft.top; + clipRight.bottom = clipLeft.bottom; + clipRight.left = (sx + Width) < 0 ? -(sx + Width) : 0; + clipRight.right = sx + Width + Width > out.w() ? sx + Width + Width - out.w() : 0; + clipRight.width = Width - clipRight.left - clipRight.right; + clipRight.height = clipLeft.height; - auto clipY = CalculateDiamondClipY(clip); - uint8_t *dst = out.at(sx, static_cast(sy - clip.bottom)); - if (clip.width == TILE_WIDTH) { - if (clip.height == TriangleHeight) { - RenderBlackTileFull(dst, out.pitch()); - } else { - RenderBlackTileClipY(dst, out.pitch(), clipY); - } - } else { - if (clip.right == 0) { - RenderBlackTileClipLeftAndVertical(dst, out.pitch(), sx, clipY); - } else { - RenderBlackTileClipRightAndVertical(dst, out.pitch(), clip.width, clipY); - } + const uint16_t dstPitch = out.pitch(); + if (clipLeft.width > 0) { + uint8_t *dst = out.at(static_cast(sx + clipLeft.left), static_cast(sy - clipLeft.bottom)); + RenderLeftTriangle(dst, dstPitch, nullptr, nullptr, clipLeft); + } + if (clipRight.width > 0) { + uint8_t *dst = out.at(static_cast(sx + Width + clipRight.left), static_cast(sy - clipRight.bottom)); + RenderRightTriangle(dst + Width, dstPitch, 
nullptr, nullptr, clipRight); } }