Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speed improvements for discussion #4138

Open
wants to merge 38 commits into
base: 0_15
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
c3f472f
some improvements to consider
DedeHai Sep 11, 2024
9341768
more improvements to color_scale() now even faster.
DedeHai Sep 12, 2024
feac45f
improvement in color_add
DedeHai Sep 12, 2024
992d11b
Improvements in get/set PixelColor()
DedeHai Sep 12, 2024
b07658b
improved Segment::setPixelColorXY a tiny bit
DedeHai Sep 12, 2024
09428dc
inlined getMappedPixelIndex, improved color_add, bugfix in colorFromP…
DedeHai Sep 12, 2024
ec938f2
removed old code
DedeHai Sep 12, 2024
d45b4ad
fixes and consistency
DedeHai Sep 13, 2024
2afff05
minor tweak (break instead of continue in setPixelColorXY)
DedeHai Sep 14, 2024
6a37f25
memory improvement: dropped static gamma table
DedeHai Sep 14, 2024
0e5bd4e
remove test printout
DedeHai Sep 14, 2024
f3137eb
updated Segment::color_from_palette
DedeHai Sep 14, 2024
686866c
Merge remote-tracking branch 'upstream/0_15' into 0_15__speed_improve…
DedeHai Sep 18, 2024
6962905
cleanup and improved color_add()
DedeHai Sep 18, 2024
a88436c
revert removal of adding with saturation, renamed 'fast' to 'saturate'
DedeHai Sep 19, 2024
17d59d3
adding initialization to vStrip, added comment on padding bytes
DedeHai Sep 22, 2024
0a54002
removed IRAM_ATTR from inlined function
DedeHai Sep 22, 2024
33cf82a
Indentations and a few optimisations
blazoncek Sep 23, 2024
906f8fc
Fix C3 compiler issue.
blazoncek Sep 25, 2024
bef1ac2
Added HSV2RGB and RGB2HSV functions for higher accuracy conversions
DedeHai Sep 25, 2024
c44b9f8
Merge remote-tracking branch 'upstream/0_15' into 0_15__speed_improve…
DedeHai Sep 26, 2024
b404458
fixed one forgotten replacement of rgb2hsv_approximate
DedeHai Sep 26, 2024
a76a895
bugfix
DedeHai Sep 27, 2024
7c0fe12
updated setPixelColor() and getPixelColor() functions
DedeHai Sep 28, 2024
202901b
bugfix, ESP32 compiler requires the color order to be identical
DedeHai Sep 28, 2024
c842994
Pre-calculate virtual
blazoncek Sep 28, 2024
9114867
Fix compiler error
blazoncek Sep 28, 2024
ffbc8c5
Reverting addition of `bool unScale`, added new improvements and fixes
DedeHai Sep 29, 2024
336da25
Private global _colorScaled
blazoncek Sep 29, 2024
8e78fb4
Merge branch '0_15' into 0_15__speed_improvements
blazoncek Sep 29, 2024
0ae7329
Update comment
blazoncek Sep 29, 2024
ee380c5
Replace uint16_t with unsigned for segment data
blazoncek Sep 30, 2024
ba3a61f
Reduced code size by:
blazoncek Oct 2, 2024
a15c391
Improvement to `setPixelColorXY` and some flash optimisations
DedeHai Oct 3, 2024
ca06214
removed todo.
DedeHai Oct 3, 2024
eb5ad23
Minor tweaks and whitespace
blazoncek Oct 5, 2024
be64930
Indentation and shadowed variable.
blazoncek Oct 7, 2024
210191b
Fix for realtime drawing on main segment
blazoncek Oct 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 15 additions & 18 deletions wled00/FX_2Dfcn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,47 +173,44 @@ void IRAM_ATTR_YN Segment::setPixelColorXY(int x, int y, uint32_t col)
if (!isActive()) return; // not active
if (x >= virtualWidth() || y >= virtualHeight() || x<0 || y<0) return; // if pixel would fall out of virtual segment just exit

uint8_t _bri_t = currentBri();
uint8_t _bri_t = currentBri();
if (_bri_t < 255) {
col = color_fade(col, _bri_t);
}

if (reverse ) x = virtualWidth() - x - 1;
if (reverse_y) y = virtualHeight() - y - 1;
if (transpose) { std::swap(x,y); } // swap X & Y if segment transposed

x *= groupLength(); // expand to physical pixels
y *= groupLength(); // expand to physical pixels

int W = width();
int H = height();
if (x >= W || y >= H) return; // if pixel would fall out of segment just exit

uint32_t tmpCol = col;

int yY = y;
for (int j = 0; j < grouping; j++) { // groupping vertically
if(yY >= H) continue;
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
int xX = x;
for (int g = 0; g < grouping; g++) { // groupping horizontally
int xX = (x+g), yY = (y+j);
if (xX >= W || yY >= H) continue; // we have reached one dimension's end

if (xX >= W) continue; // we have reached one dimension's end
#ifndef WLED_DISABLE_MODE_BLEND
// if blending modes, blend with underlying pixel
if (_modeBlend) tmpCol = color_blend(strip.getPixelColorXY(start + xX, startY + yY), col, 0xFFFFU - progress(), true);
if (_modeBlend) col = color_blend(strip.getPixelColorXY(start + xX, startY + yY), col, 0xFFFFU - progress(), true);
#endif

strip.setPixelColorXY(start + xX, startY + yY, tmpCol);

strip.setPixelColorXY(start + xX, startY + yY, col);
if (mirror) { //set the corresponding horizontally mirrored pixel
if (transpose) strip.setPixelColorXY(start + xX, startY + height() - yY - 1, tmpCol);
else strip.setPixelColorXY(start + width() - xX - 1, startY + yY, tmpCol);
if (transpose) strip.setPixelColorXY(start + xX, startY + height() - yY - 1, col);
else strip.setPixelColorXY(start + width() - xX - 1, startY + yY, col);
}
if (mirror_y) { //set the corresponding vertically mirrored pixel
if (transpose) strip.setPixelColorXY(start + width() - xX - 1, startY + yY, tmpCol);
else strip.setPixelColorXY(start + xX, startY + height() - yY - 1, tmpCol);
if (transpose) strip.setPixelColorXY(start + width() - xX - 1, startY + yY, col);
else strip.setPixelColorXY(start + xX, startY + height() - yY - 1, col);
}
if (mirror_y && mirror) { //set the corresponding vertically AND horizontally mirrored pixel
strip.setPixelColorXY(start + width() - xX - 1, startY + height() - yY - 1, tmpCol);
strip.setPixelColorXY(start + width() - xX - 1, startY + height() - yY - 1, col);
}
xX++;
}
yY++;
}
}

Expand Down
20 changes: 12 additions & 8 deletions wled00/FX_fcn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -705,11 +705,16 @@ void IRAM_ATTR_YN Segment::setPixelColor(int i, uint32_t col)
{
if (!isActive()) return; // not active
#ifndef WLED_DISABLE_2D
int vStrip = i>>16; // hack to allow running on virtual strips (2D segment columns/rows)
int vStrip;
#endif
i &= 0xFFFF;

if (i >= virtualLength() || i<0) return; // if pixel would fall out of segment just exit
if (i >= virtualLength() || i<0) // pixel would fall out of segment, check if this is a virtual strip NOTE: this is almost always false if not virtual strip, saves the calculation on 'standard' call
{
#ifndef WLED_DISABLE_2D
vStrip = i>>16; // hack to allow running on virtual strips (2D segment columns/rows)
#endif
i &= 0xFFFF; //truncate vstrip index
if (i >= virtualLength() || i<0) return; // if pixel would still fall out of segment just exit
}

#ifndef WLED_DISABLE_2D
if (is2D()) {
Expand Down Expand Up @@ -900,8 +905,7 @@ uint32_t IRAM_ATTR_YN Segment::getPixelColor(int i) const
if (!isActive()) return 0; // not active
#ifndef WLED_DISABLE_2D
int vStrip = i>>16;
#endif
i &= 0xFFFF;
#endif

#ifndef WLED_DISABLE_2D
if (is2D()) {
Expand All @@ -912,7 +916,7 @@ uint32_t IRAM_ATTR_YN Segment::getPixelColor(int i) const
return getPixelColorXY(i % vW, i / vW);
break;
case M12_pBar:
if (vStrip>0) return getPixelColorXY(vStrip - 1, vH - i -1);
if (vStrip>0) { i &= 0xFFFF; return getPixelColorXY(vStrip - 1, vH - i -1); }
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
else return getPixelColorXY(0, vH - i -1);
break;
case M12_pArc:
Expand Down Expand Up @@ -1816,7 +1820,7 @@ bool WS2812FX::deserializeMap(uint8_t n) {
return (customMappingSize > 0);
}

uint16_t IRAM_ATTR WS2812FX::getMappedPixelIndex(uint16_t index) const {
__attribute__ ((always_inline)) inline uint16_t IRAM_ATTR WS2812FX::getMappedPixelIndex(uint16_t index) const {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Non obligatory: I would prefer __attribute__ at the end but [[...]] in front.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I honestly have no Idea what the precompiler instructions mean in detail, I copied this from what they use in fastled...
the idea behind this is to get rid of the 'function entry' instructions that are added when a function is called. When I added the inline flash size increased by a few bytes, telling me that it is actually inlined. Since this short function is only called from two places and is called A LOT this may be faster. I have no way to check (would need a proper debugger that shows assembly instructions being executed line by line).

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As I've recently learned these are compiler attributes.

Copy link
Collaborator

@softhack007 softhack007 Sep 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🤔 not sure if always_inline plays well with IRAM_ATTR .... the first tells the compiler to always inline the function, the latter says "put the function into IRAM" which means that a real function is needed.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that is what I was wondering too, this is just a suggestion, i.e. to inline this for speed but how to tell the compiler to inline it to the functions that are in ram... not sure how it will do it.

Copy link
Collaborator

@softhack007 softhack007 Sep 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To my understanding:

  • inline is a hint/suggestion to the compiler. So it might get inlined, or not.
  • __attribute__((always_inline)) is a directive. So the compiler must inline this function, no matter if its efficient or not.

If you want to optimize function calls, its sometime useful to add __attribute__((pure)) or __attribute__((const)) to the function declaration. But only do this after double-checking that the code is actually "pure" (no side-effects) or "const" (solely depends on arguments). I did this in the MoonModules fork, but honestly it does not give you more than 1 or 2 fps even if you apply it to lots of functions.

See MoonModules@7f9da30

// convert logical address to physical
if (index < customMappingSize
&& (realtimeMode == REALTIME_MODE_INACTIVE || realtimeRespectLedMaps)) index = customMappingTable[index];
Expand Down
84 changes: 56 additions & 28 deletions wled00/colors.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,21 +39,20 @@ uint32_t color_add(uint32_t c1, uint32_t c2, bool fast)
{
if (c1 == BLACK) return c2;
if (c2 == BLACK) return c1;
if (fast) {
uint8_t r = R(c1);
uint8_t g = G(c1);
uint8_t b = B(c1);
uint8_t w = W(c1);
r = qadd8(r, R(c2));
g = qadd8(g, G(c2));
b = qadd8(b, B(c2));
w = qadd8(w, W(c2));
uint32_t rb = (c1 & 0x00FF00FF) + (c2 & 0x00FF00FF);
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
uint32_t r = rb >> 16;
uint32_t b = rb & 0xFFFF;
uint32_t wg = ((c1>>8) & 0x00FF00FF) + ((c2>>8) & 0x00FF00FF);
uint32_t w = wg >> 16;
uint32_t g = wg & 0xFFFF;

if (fast) {
r = r > 255 ? 255 : r;
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
g = g > 255 ? 255 : g;
b = b > 255 ? 255 : b;
w = w > 255 ? 255 : w;
return RGBW32(r,g,b,w);
} else {
uint32_t r = R(c1) + R(c2);
uint32_t g = G(c1) + G(c2);
uint32_t b = B(c1) + B(c2);
uint32_t w = W(c1) + W(c2);
unsigned max = r;
if (g > max) max = g;
if (b > max) max = b;
Expand All @@ -70,27 +69,56 @@ uint32_t color_add(uint32_t c1, uint32_t c2, bool fast)

uint32_t color_fade(uint32_t c1, uint8_t amount, bool video)
{
if (c1 == BLACK || amount + video == 0) return BLACK;
if (c1 == BLACK || amount == 0) return BLACK;
else if (amount == 255) return c1;
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
uint32_t scaledcolor; // color order is: W R G B from MSB to LSB
uint32_t r = R(c1);
uint32_t g = G(c1);
uint32_t b = B(c1);
uint32_t w = W(c1);
uint32_t scale = amount; // 32bit for faster calculation
if (video) {
scaledcolor = (((r * scale) >> 8) + ((r && scale) ? 1 : 0)) << 16;
scaledcolor |= (((g * scale) >> 8) + ((g && scale) ? 1 : 0)) << 8;
scaledcolor |= ((b * scale) >> 8) + ((b && scale) ? 1 : 0);
scaledcolor |= (((w * scale) >> 8) + ((w && scale) ? 1 : 0)) << 24;
} else {
scaledcolor = ((r * scale) >> 8) << 16;
scaledcolor |= ((g * scale) >> 8) << 8;
scaledcolor |= (b * scale) >> 8;
scaledcolor |= ((w * scale) >> 8) << 24;
uint32_t addRemains = 0;
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
if (!video) amount++; // add one for correct scaling using bitshifts
else { // video scaling: make sure colors do not dim to zero if they started non-zero
addRemains = R(c1) ? 0x00010000 : 0;
addRemains |= G(c1) ? 0x00000100 : 0;
addRemains |= B(c1) ? 0x00000001 : 0;
addRemains |= W(c1) ? 0x01000000 : 0;
}
uint32_t rb = (((c1 & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF; // scale red and blue
uint32_t wg = (((c1 & 0xFF00FF00) >> 8) * scale) & 0xFF00FF00; // scale white and green
scaledcolor = (rb | wg) + addRemains;
return scaledcolor;
}

// 1:1 replacement of fastled function optimized for ESP, slightly faster, more accurate and uses less flash (~ -200bytes)
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
CRGB ColorFromPaletteWLED(const CRGBPalette16& pal, unsigned index, uint8_t brightness, TBlendType blendType)
{
if ( blendType == LINEARBLEND_NOWRAP) {
//index = map8(index, 0, 239);
index = (index*240) >> 8; // Blend range is affected by lo4 blend of values, remap to avoid wrapping
}
unsigned hi4 = byte(index) >> 4;
// We then add that to a base array pointer.
const CRGB* entry = (CRGB*)( (uint8_t*)(&(pal[0])) + (hi4 * sizeof(CRGB)));
unsigned red1 = entry->r;
unsigned green1 = entry->g;
unsigned blue1 = entry->b;
if(blendType != NOBLEND) {
if(hi4 == 15) entry = &(pal[0]);
else ++entry;
// unsigned red2 = entry->red;
unsigned f2 = ((index & 0x0F) << 4) + 1; // +1 so we scale by 256 as a max value, then result can just be shifted by 8
unsigned f1 = (257 - f2); // f2 is 1 minimum, so this is 256 max
red1 = (red1 * f1 + (unsigned)entry->r * f2) >> 8;
green1 = (green1 * f1 + (unsigned)entry->g * f2) >> 8;
blue1 = (blue1 * f1 + (unsigned)entry->b * f2) >> 8;
}
if( brightness < 255) { // note: zero checking could be done to return black but that is hardly ever used so it is omitted
uint32_t scale = brightness + 1; // adjust for rounding (bitshift)
red1 = (red1 * scale) >> 8;
green1 = (green1 * scale) >> 8;
blue1 = (blue1 * scale) >> 8;
}
return CRGB((uint8_t)red1, (uint8_t)green1, (uint8_t)blue1);
}

void setRandomColor(byte* rgb)
{
lastRandomIndex = get_random_wheel_index(lastRandomIndex);
Expand Down
2 changes: 2 additions & 0 deletions wled00/fcn_declare.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ typedef struct WiFiConfig {
} wifi_config;

//colors.cpp
#define ColorFromPalette ColorFromPaletteWLED // override fastled version
// similar to NeoPixelBus NeoGammaTableMethod but allows dynamic changes (superseded by NPB::NeoGammaDynamicTableMethod)
class NeoGammaWLEDMethod {
public:
Expand All @@ -81,6 +82,7 @@ class NeoGammaWLEDMethod {
[[gnu::hot]] uint32_t color_blend(uint32_t,uint32_t,uint16_t,bool b16=false);
[[gnu::hot]] uint32_t color_add(uint32_t,uint32_t, bool fast=false);
[[gnu::hot]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video=false);
CRGB ColorFromPaletteWLED(const CRGBPalette16 &pal, unsigned index, uint8_t brightness = (uint8_t)255U, TBlendType blendType = LINEARBLEND);
blazoncek marked this conversation as resolved.
Show resolved Hide resolved
CRGBPalette16 generateHarmonicRandomPalette(CRGBPalette16 &basepalette);
CRGBPalette16 generateRandomPalette();
inline uint32_t colorFromRgbw(byte* rgbw) { return uint32_t((byte(rgbw[3]) << 24) | (byte(rgbw[0]) << 16) | (byte(rgbw[1]) << 8) | (byte(rgbw[2]))); }
Expand Down