From 3641db289799649c7550b2760fb5f7067678ad35 Mon Sep 17 00:00:00 2001 From: Damian Schneider Date: Sun, 31 Aug 2025 16:14:07 +0200 Subject: [PATCH 1/3] speed optimizations, fix for restoreColorLossy, code cleanup --- wled00/FX_2Dfcn.cpp | 14 +++++++------- wled00/FX_fcn.cpp | 12 ++++++------ wled00/FXparticleSystem.cpp | 19 +++++-------------- wled00/bus_manager.cpp | 5 ++++- wled00/bus_manager.h | 7 ++++--- wled00/colors.cpp | 24 ++++++++++++------------ wled00/colors.h | 9 ++++++++- wled00/const.h | 2 ++ wled00/data/settings_leds.htm | 4 ++-- 9 files changed, 50 insertions(+), 46 deletions(-) diff --git a/wled00/FX_2Dfcn.cpp b/wled00/FX_2Dfcn.cpp index 9a3c6fbe81..1fa50f1ee9 100644 --- a/wled00/FX_2Dfcn.cpp +++ b/wled00/FX_2Dfcn.cpp @@ -146,7 +146,7 @@ void WS2812FX::setUpMatrix() { #ifndef WLED_DISABLE_2D // pixel is clipped if it falls outside clipping range // if clipping start > stop the clipping range is inverted -bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const { +bool Segment::isPixelXYClipped(int x, int y) const { if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) { const bool invertX = _clipStart > _clipStop; const bool invertY = _clipStartY > _clipStopY; @@ -186,7 +186,7 @@ bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const { void IRAM_ATTR_YN Segment::setPixelColorXY(int x, int y, uint32_t col) const { if (!isActive()) return; // not active - if (x >= (int)vWidth() || y >= (int)vHeight() || x < 0 || y < 0) return; // if pixel would fall out of virtual segment just exit + if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return; // if pixel would fall out of virtual segment just exit setPixelColorXYRaw(x, y, col); } @@ -236,7 +236,7 @@ void Segment::setPixelColorXY(float x, float y, uint32_t col, bool aa) const // returns RGBW values of pixel uint32_t IRAM_ATTR_YN Segment::getPixelColorXY(int x, int y) const { if (!isActive()) return 0; // not active - if (x >= (int)vWidth() || y >= 
(int)vHeight() || x<0 || y<0) return 0; // if pixel would fall out of virtual segment just exit + if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return 0; // if pixel would fall out of virtual segment just exit return getPixelColorXYRaw(x,y); } @@ -256,8 +256,8 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const { uint32_t curnew = BLACK; for (unsigned x = 0; x < cols; x++) { uint32_t cur = getPixelColorRaw(XY(x, row)); - uint32_t part = color_fade(cur, seepx); - curnew = color_fade(cur, keepx); + uint32_t part = fast_color_scale(cur, seepx); + curnew = fast_color_scale(cur, keepx); if (x > 0) { if (carryover) curnew = color_add(curnew, carryover); uint32_t prev = color_add(lastnew, part); @@ -279,8 +279,8 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const { uint32_t curnew = BLACK; for (unsigned y = 0; y < rows; y++) { uint32_t cur = getPixelColorRaw(XY(col, y)); - uint32_t part = color_fade(cur, seepy); - curnew = color_fade(cur, keepy); + uint32_t part = fast_color_scale(cur, seepy); + curnew = fast_color_scale(cur, keepy); if (y > 0) { if (carryover) curnew = color_add(curnew, carryover); uint32_t prev = color_add(lastnew, part); diff --git a/wled00/FX_fcn.cpp b/wled00/FX_fcn.cpp index 2f8d5515fd..677056770c 100644 --- a/wled00/FX_fcn.cpp +++ b/wled00/FX_fcn.cpp @@ -673,7 +673,7 @@ uint16_t Segment::maxMappingLength() const { #endif // pixel is clipped if it falls outside clipping range // if clipping start > stop the clipping range is inverted -bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const { +bool Segment::isPixelClipped(int i) const { if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) { bool invert = _clipStart > _clipStop; // ineverted start & stop int start = invert ? 
_clipStop : _clipStart; @@ -691,7 +691,7 @@ bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const { return false; } -void IRAM_ATTR_YN Segment::setPixelColor(int i, uint32_t col) const +void WLED_O2_ATTR Segment::setPixelColor(int i, uint32_t col) const { if (!isActive() || i < 0) return; // not active or invalid index #ifndef WLED_DISABLE_2D @@ -904,7 +904,7 @@ void Segment::setPixelColor(float i, uint32_t col, bool aa) const } #endif -uint32_t IRAM_ATTR_YN Segment::getPixelColor(int i) const +uint32_t WLED_O2_ATTR Segment::getPixelColor(int i) const { if (!isActive() || i < 0) return 0; // not active or invalid index @@ -1043,7 +1043,7 @@ void Segment::fadeToSecondaryBy(uint8_t fadeBy) const { void Segment::fadeToBlackBy(uint8_t fadeBy) const { if (!isActive() || fadeBy == 0) return; // optimization - no scaling to apply const size_t rlength = rawLength(); // calculate only once - for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, color_fade(getPixelColorRaw(i), 255-fadeBy)); + for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, fast_color_scale(getPixelColorRaw(i), 255-fadeBy)); } /* @@ -1069,8 +1069,8 @@ void Segment::blur(uint8_t blur_amount, bool smear) const { uint32_t curnew = BLACK; for (unsigned i = 0; i < vlength; i++) { uint32_t cur = getPixelColorRaw(i); - uint32_t part = color_fade(cur, seep); - curnew = color_fade(cur, keep); + uint32_t part = fast_color_scale(cur, seep); + curnew = fast_color_scale(cur, keep); if (i > 0) { if (carryover) curnew = color_add(curnew, carryover); uint32_t prev = color_add(lastnew, part); diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp index 8b684a5f69..4446be8d81 100644 --- a/wled00/FXparticleSystem.cpp +++ b/wled00/FXparticleSystem.cpp @@ -18,7 +18,6 @@ static int32_t calcForce_dv(const int8_t force, uint8_t &counter); static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than 
particleradius static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding) -static uint32_t fast_color_scale(CRGBW c, const uint8_t scale); // fast scaling function using 32bit variable and pointer. note: keep 'scale' within 0-255 #endif #ifndef WLED_DISABLE_PARTICLESYSTEM2D @@ -625,7 +624,7 @@ void ParticleSystem2D::render() { } // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer -__attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) { +void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) { uint32_t size = particlesize; if (advPartProps && advPartProps[particleindex].size > 0) // use advanced size properties (0 means use global size including single pixel rendering) size = advPartProps[particleindex].size; @@ -857,7 +856,7 @@ void ParticleSystem2D::handleCollisions() { // handle a collision if close proximity is detected, i.e. 
dx and/or dy smaller than 2*PS_P_RADIUS // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard) -__attribute__((optimize("O2"))) void ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) { +void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) { int32_t distanceSquared = dx * dx + dy * dy; // Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx; @@ -1485,7 +1484,7 @@ void ParticleSystem1D::render() { } // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer -__attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) { +void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) { uint32_t size = particlesize; if (advPartProps) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?) size = advPartProps[particleindex].size; @@ -1648,7 +1647,7 @@ void ParticleSystem1D::handleCollisions() { } // handle a collision if close proximity is detected, i.e. 
dx and/or dy smaller than 2*PS_P_RADIUS // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard) -__attribute__((optimize("O2"))) void ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) { +void WLED_O2_ATTR ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) { int32_t dv = particle2.vx - particle1.vx; int32_t dotProduct = (dx * dv); // is always negative if moving towards each other @@ -1891,7 +1890,7 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32 // this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color // note: function is mainly used to add scaled colors, so checking if one color is black is slower // note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer - __attribute__((optimize("O2"))) static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) { +static uint32_t WLED_O2_ATTR fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) { uint32_t r, g, b; r = c1.r + ((c2.r * scale) >> 8); g = c1.g + ((c2.g * scale) >> 8); @@ -1912,12 +1911,4 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32 return c1.color32; } -// fast CRGBW color scaling ignoring white channel (PS does not handle white) - __attribute__((optimize("O2"))) static uint32_t fast_color_scale(CRGBW c, const uint8_t scale) { - c.r = ((c.r * scale) >> 8); - c.g = ((c.g * scale) >> 8); - c.b = ((c.b * scale) >> 8); - return 
c.color32; -} - #endif // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D)) diff --git a/wled00/bus_manager.cpp b/wled00/bus_manager.cpp index 99523bba9f..d220149404 100644 --- a/wled00/bus_manager.cpp +++ b/wled00/bus_manager.cpp @@ -209,6 +209,7 @@ void BusDigital::estimateCurrent() { void BusDigital::applyBriLimit(uint8_t newBri) { // a newBri of 0 means calculate per-bus brightness limit + _NPBbri = 255; // reset, intermediate value is set below, final value is calculated in bus::show() if (newBri == 0) { if (_milliAmpsLimit == 0 || _milliAmpsTotal == 0) return; // ABL not used for this bus newBri = 255; @@ -226,6 +227,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) { } if (newBri < 255) { + _NPBbri = newBri; // store value so it can be updated in show() (must be updated even if ABL is not used) uint8_t cctWW = 0, cctCW = 0; unsigned hwLen = _len; if (_type == TYPE_WS2812_1CH_X3) hwLen = NUM_ICS_WS2812_1CH_3X(_len); // only needs a third of "RGB" LEDs for NeoPixelBus @@ -243,6 +245,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) { void BusDigital::show() { if (!_valid) return; + _NPBbri = (_NPBbri * _bri) / 255; // total applied brightness for use in restoreColorLossy (see applyBriLimit()) PolyBus::show(_busPtr, _iType, _skip); // faster if buffer consistency is not important (no skipped LEDs) } @@ -305,7 +308,7 @@ uint32_t IRAM_ATTR BusDigital::getPixelColor(unsigned pix) const { if (_reversed) pix = _len - pix -1; pix += _skip; const uint8_t co = _colorOrderMap.getPixelColorOrder(pix+_start, _colorOrder); - uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_bri); + uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_NPBbri); if (_type == TYPE_WS2812_1CH_X3) { // map to correct IC, each controls 3 LEDs uint8_t r = R(c); uint8_t g = _reversed ? 
B(c) : G(c); // should G and B be switched if _reversed? diff --git a/wled00/bus_manager.h b/wled00/bus_manager.h index fe70a05170..87f2a595bd 100644 --- a/wled00/bus_manager.h +++ b/wled00/bus_manager.h @@ -105,6 +105,7 @@ class Bus { Bus(uint8_t type, uint16_t start, uint8_t aw, uint16_t len = 1, bool reversed = false, bool refresh = false) : _type(type) , _bri(255) + , _NPBbri(255) , _start(start) , _len(std::max(len,(uint16_t)1)) , _reversed(reversed) @@ -202,7 +203,9 @@ class Bus { protected: uint8_t _type; - uint8_t _bri; + uint8_t _bri; // bus brightness + uint8_t _NPBbri; // total brightness applied to colors in NPB buffer (_bri + ABL) + uint8_t _autoWhiteMode; // global Auto White Calculation override uint16_t _start; uint16_t _len; //struct { //using bitfield struct adds abour 250 bytes to binary size @@ -213,8 +216,6 @@ class Bus { bool _hasWhite;// : 1; bool _hasCCT;// : 1; //} __attribute__ ((packed)); - uint8_t _autoWhiteMode; - // global Auto White Calculation override static uint8_t _gAWM; // _cct has the following meanings (see calculateCCT() & BusManager::setSegmentCCT()): // -1 means to extract approximate CCT value in K from RGB (in calcualteCCT()) diff --git a/wled00/colors.cpp b/wled00/colors.cpp index bf2b69d73a..0b95f88994 100644 --- a/wled00/colors.cpp +++ b/wled00/colors.cpp @@ -8,7 +8,7 @@ * color blend function, based on FastLED blend function * the calculation for each color is: result = (A*(amountOfA) + A + B*(amountOfB) + B) / 256 with amountOfA = 255 - amountOfB */ -uint32_t IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) { +uint32_t WLED_O2_ATTR IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) { // min / max blend checking is omitted: calls with 0 or 255 are rare, checking lowers overall performance const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF; // mask for R and B channels or W and G if negated (poorman's SIMD; https://github.com/wled/WLED/pull/4568#discussion_r1986587221) uint32_t rb1 = 
color1 & TWO_CHANNEL_MASK; // extract R & B channels from color1 @@ -92,15 +92,15 @@ uint32_t IRAM_ATTR color_fade(uint32_t c1, uint8_t amount, bool video) { note: inputs are 32bit to speed up the function, useful input value ranges are 0-255 */ uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten) { - if (rgb == 0 | hueShift + lighten + brighten == 0) return rgb; // black or no change - CHSV32 hsv; - rgb2hsv(rgb, hsv); //convert to HSV - hsv.h += (hueShift << 8); // shift hue (hue is 16 bits) - hsv.s = max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate - hsv.v = min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness - uint32_t rgb_adjusted; - hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion - return rgb_adjusted; + if (rgb == 0 || hueShift + lighten + brighten == 0) return rgb; // black or no change + CHSV32 hsv; + rgb2hsv(rgb, hsv); //convert to HSV + hsv.h += (hueShift << 8); // shift hue (hue is 16 bits) + hsv.s = max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate + hsv.v = min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness + uint32_t rgb_adjusted; + hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion + return rgb_adjusted; } // 1:1 replacement of fastled function optimized for ESP, slightly faster, more accurate and uses less flash (~ -200bytes) @@ -597,13 +597,13 @@ void NeoGammaWLEDMethod::calcGammaTable(float gamma) gammaT_inv[0] = 0; } -uint8_t IRAM_ATTR_YN NeoGammaWLEDMethod::Correct(uint8_t value) +uint8_t NeoGammaWLEDMethod::Correct(uint8_t value) { if (!gammaCorrectCol) return value; return gammaT[value]; } -uint32_t IRAM_ATTR_YN NeoGammaWLEDMethod::inverseGamma32(uint32_t color) +uint32_t NeoGammaWLEDMethod::inverseGamma32(uint32_t color) { if (!gammaCorrectCol) return color; uint8_t w = W(color); diff --git a/wled00/colors.h b/wled00/colors.h index 376959fd65..af7dd5cb69 
100644 --- a/wled00/colors.h +++ b/wled00/colors.h @@ -117,6 +117,7 @@ class NeoGammaWLEDMethod { [[gnu::hot, gnu::pure]] uint32_t color_blend(uint32_t c1, uint32_t c2 , uint8_t blend); inline uint32_t color_blend16(uint32_t c1, uint32_t c2, uint16_t b) { return color_blend(c1, c2, b >> 8); }; [[gnu::hot, gnu::pure]] uint32_t color_add(uint32_t, uint32_t, bool preserveCR = false); +[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false); [[gnu::hot, gnu::pure]] uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten); [[gnu::hot, gnu::pure]] uint32_t ColorFromPaletteWLED(const CRGBPalette16 &pal, unsigned index, uint8_t brightness = (uint8_t)255U, TBlendType blendType = LINEARBLEND); CRGBPalette16 generateHarmonicRandomPalette(const CRGBPalette16 &basepalette); @@ -139,6 +140,12 @@ uint32_t colorBalanceFromKelvin(uint16_t kelvin, uint32_t rgb); uint16_t approximateKelvinFromRGB(uint32_t rgb); void setRandomColor(byte* rgb); -[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false); +// fast scaling function for colors, performs color*scale/256 for all four channels, speed over accuracy +// note: inlining uses less code than actual function calls +static inline uint32_t fast_color_scale(uint32_t c, const uint8_t scale) { + uint32_t rb = (((c & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF; + uint32_t wg = (((c>>8) & 0x00FF00FF) * scale) & ~0x00FF00FF; + return rb | wg; +} #endif diff --git a/wled00/const.h b/wled00/const.h index 1abf245396..b5fdc4ccf7 100644 --- a/wled00/const.h +++ b/wled00/const.h @@ -655,4 +655,6 @@ static_assert(WLED_MAX_BUSSES <= 32, "WLED_MAX_BUSSES exceeds hard limit"); #define IRAM_ATTR_YN IRAM_ATTR #endif +#define WLED_O2_ATTR __attribute__((optimize("O2"))) + #endif diff --git a/wled00/data/settings_leds.htm b/wled00/data/settings_leds.htm index 928da11753..a971e93143 100644 --- a/wled00/data/settings_leds.htm +++ 
b/wled00/data/settings_leds.htm @@ -43,13 +43,13 @@ } function bLimits(b,v,p,m,l,o=5,d=2,a=6) { maxB = b; // maxB - max physical (analog + digital) buses: 32 - ESP32, 14 - S3/S2, 6 - C3, 4 - 8266 - maxD = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266 - maxA = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266 maxV = v; // maxV - min virtual buses: 6 - ESP32/S3, 4 - S2/C3, 3 - ESP8266 (only used to distinguish S2/S3) maxPB = p; // maxPB - max LEDs per bus maxM = m; // maxM - max LED memory maxL = l; // maxL - max LEDs (will serve to determine ESP >1664 == ESP32) maxCO = o; // maxCO - max Color Order mappings + maxD = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266 + maxA = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266 } function is8266() { return maxA == 5 && maxD == 3; } // NOTE: see const.h function is32() { return maxA == 16 && maxD == 16; } // NOTE: see const.h From fa0673284dca20fff4d3fae93a3875dfe43a9267 Mon Sep 17 00:00:00 2001 From: Damian Schneider Date: Sat, 13 Sep 2025 23:27:33 +0200 Subject: [PATCH 2/3] speed optimization in color_add, PS fast_color_add and blur functions applying more bit and shift manipulation tricks to squeeze out just a bit more speed on color manipulation functions. - Optimization on blur is based on work by @blazoncek - Renamed PS fast_color_add() to fast_color_scaleAdd() In my arbitrary speed tests these changes resulted in 2-3% higher FPS.
--- wled00/FX_2Dfcn.cpp | 50 ++++++++------------ wled00/FX_fcn.cpp | 26 ++++------- wled00/FXparticleSystem.cpp | 91 +++++++++++++++++++------------------ wled00/colors.cpp | 34 ++++++-------- wled00/colors.h | 2 +- 5 files changed, 93 insertions(+), 110 deletions(-) diff --git a/wled00/FX_2Dfcn.cpp b/wled00/FX_2Dfcn.cpp index 1fa50f1ee9..72b7f3d0bb 100644 --- a/wled00/FX_2Dfcn.cpp +++ b/wled00/FX_2Dfcn.cpp @@ -246,52 +246,42 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const { const unsigned cols = vWidth(); const unsigned rows = vHeight(); const auto XY = [&](unsigned x, unsigned y){ return x + y*cols; }; - uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration - uint32_t last; if (blur_x) { const uint8_t keepx = smear ? 255 : 255 - blur_x; const uint8_t seepx = blur_x >> 1; for (unsigned row = 0; row < rows; row++) { // blur rows (x direction) - uint32_t carryover = BLACK; - uint32_t curnew = BLACK; - for (unsigned x = 0; x < cols; x++) { - uint32_t cur = getPixelColorRaw(XY(x, row)); + // handle first pixel in row to avoid conditional in loop (faster) + uint32_t cur = getPixelColorRaw(XY(0, row)); + uint32_t carryover = fast_color_scale(cur, seepx); + setPixelColorRaw(XY(0, row), fast_color_scale(cur, keepx)); + for (unsigned x = 1; x < cols; x++) { + cur = getPixelColorRaw(XY(x, row)); uint32_t part = fast_color_scale(cur, seepx); - curnew = fast_color_scale(cur, keepx); - if (x > 0) { - if (carryover) curnew = color_add(curnew, carryover); - uint32_t prev = color_add(lastnew, part); - // optimization: only set pixel if color has changed - if (last != prev) setPixelColorRaw(XY(x - 1, row), prev); - } else setPixelColorRaw(XY(x, row), curnew); // first pixel - lastnew = curnew; - last = cur; // save original value for comparison on next iteration + cur = fast_color_scale(cur, keepx); + cur = color_add(cur, carryover); + setPixelColorRaw(XY(x - 1, row), 
color_add(getPixelColorRaw(XY(x-1, row)), part)); // previous pixel + setPixelColorRaw(XY(x, row), cur); // current pixel carryover = part; } - setPixelColorRaw(XY(cols-1, row), curnew); // set last pixel } } if (blur_y) { const uint8_t keepy = smear ? 255 : 255 - blur_y; const uint8_t seepy = blur_y >> 1; for (unsigned col = 0; col < cols; col++) { - uint32_t carryover = BLACK; - uint32_t curnew = BLACK; - for (unsigned y = 0; y < rows; y++) { - uint32_t cur = getPixelColorRaw(XY(col, y)); + // handle first pixel in column + uint32_t cur = getPixelColorRaw(XY(col, 0)); + uint32_t carryover = fast_color_scale(cur, seepy); + setPixelColorRaw(XY(col, 0), fast_color_scale(cur, keepy)); + for (unsigned y = 1; y < rows; y++) { + cur = getPixelColorRaw(XY(col, y)); uint32_t part = fast_color_scale(cur, seepy); - curnew = fast_color_scale(cur, keepy); - if (y > 0) { - if (carryover) curnew = color_add(curnew, carryover); - uint32_t prev = color_add(lastnew, part); - // optimization: only set pixel if color has changed - if (last != prev) setPixelColorRaw(XY(col, y - 1), prev); - } else setPixelColorRaw(XY(col, y), curnew); // first pixel - lastnew = curnew; - last = cur; //save original value for comparison on next iteration + cur = fast_color_scale(cur, keepy); + cur = color_add(cur, carryover); + setPixelColorRaw(XY(col, y - 1), color_add(getPixelColorRaw(XY(col, y-1)), part)); // previous pixel + setPixelColorRaw(XY(col, y), cur); // current pixel carryover = part; } - setPixelColorRaw(XY(col, rows - 1), curnew); } } } diff --git a/wled00/FX_fcn.cpp b/wled00/FX_fcn.cpp index 677056770c..d3b44f1e3a 100644 --- a/wled00/FX_fcn.cpp +++ b/wled00/FX_fcn.cpp @@ -1063,25 +1063,19 @@ void Segment::blur(uint8_t blur_amount, bool smear) const { uint8_t keep = smear ? 
255 : 255 - blur_amount; uint8_t seep = blur_amount >> 1; unsigned vlength = vLength(); - uint32_t carryover = BLACK; - uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration - uint32_t last; - uint32_t curnew = BLACK; - for (unsigned i = 0; i < vlength; i++) { - uint32_t cur = getPixelColorRaw(i); + // handle first pixel to avoid conditional in loop (faster) + uint32_t cur = getPixelColorRaw(0); + uint32_t carryover = fast_color_scale(cur, seep); + setPixelColorRaw(0, fast_color_scale(cur, keep)); + for (unsigned i = 1; i < vlength; i++) { + cur = getPixelColorRaw(i); uint32_t part = fast_color_scale(cur, seep); - curnew = fast_color_scale(cur, keep); - if (i > 0) { - if (carryover) curnew = color_add(curnew, carryover); - uint32_t prev = color_add(lastnew, part); - // optimization: only set pixel if color has changed - if (last != prev) setPixelColorRaw(i - 1, prev); - } else setPixelColorRaw(i, curnew); // first pixel - lastnew = curnew; - last = cur; // save original value for comparison on next iteration + cur = fast_color_scale(cur, keep); + cur = color_add(cur, carryover); + setPixelColorRaw(i - 1, color_add(getPixelColorRaw(i - 1), part)); // previous pixel + setPixelColorRaw(i, cur); // current pixel carryover = part; } - setPixelColorRaw(vlength - 1, curnew); } /* diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp index 4446be8d81..1a1ed08850 100644 --- a/wled00/FXparticleSystem.cpp +++ b/wled00/FXparticleSystem.cpp @@ -17,7 +17,7 @@ // local shared functions (used both in 1D and 2D system) static int32_t calcForce_dv(const int8_t force, uint8_t &counter); static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than particleradius -static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 
before adding) +static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding) #endif #ifndef WLED_DISABLE_PARTICLESYSTEM2D @@ -634,7 +634,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, uint32_t y = particles[particleindex].y >> PS_P_RADIUS_SHIFT; if (x <= (uint32_t)maxXpixel && y <= (uint32_t)maxYpixel) { uint32_t index = x + (maxYpixel - y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer) - framebuffer[index] = fast_color_add(framebuffer[index], color, brightness); + framebuffer[index] = fast_color_scaleAdd(framebuffer[index], color, brightness); } return; } @@ -686,10 +686,10 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer //particle size to pixels: < 64 is 4x4, < 128 is 6x6, < 192 is 8x8, bigger is 10x10 //first, render the pixel to the center of the renderbuffer, then apply 2D blurring - renderbuffer[4 + (4 * 10)] = fast_color_add(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left - renderbuffer[5 + (4 * 10)] = fast_color_add(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]); - renderbuffer[5 + (5 * 10)] = fast_color_add(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]); - renderbuffer[4 + (5 * 10)] = fast_color_add(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]); + renderbuffer[4 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left + renderbuffer[5 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]); + renderbuffer[5 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]); + renderbuffer[4 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (5 * 
10)], color, pxlbrightness[3]); uint32_t rendersize = 2; // initialize render size, minimum is 4x4 pixels, it is incremented int he loop below to start with 4 uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below) uint32_t maxsize = advPartProps[particleindex].size; @@ -747,7 +747,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, continue; } uint32_t idx = xfb + (maxYpixel - yfb) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer) - framebuffer[idx] = fast_color_add(framebuffer[idx], renderbuffer[xrb + yrb * 10]); + framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], renderbuffer[xrb + yrb * 10]); } } } else { // standard rendering (2x2 pixels) @@ -784,7 +784,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, for (uint32_t i = 0; i < 4; i++) { if (pixelvalid[i]) { uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer) - framebuffer[idx] = fast_color_add(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left + framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left } } } @@ -1027,9 +1027,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu for (uint32_t x = xstart; x < xstart + xsize; x++) { seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours if (x > 0) { - colorbuffer[indexXY - 1] = fast_color_add(colorbuffer[indexXY - 1], seeppart); - if (carryover.color32) // note: check adds overhead but is faster on average - colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover); + colorbuffer[indexXY - 1] = fast_color_scaleAdd(colorbuffer[indexXY - 1], 
seeppart); + colorbuffer[indexXY] = fast_color_scaleAdd(colorbuffer[indexXY], carryover); } carryover = seeppart; indexXY++; // next pixel in x direction @@ -1048,9 +1047,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu for (uint32_t y = ystart; y < ystart + ysize; y++) { seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours if (y > 0) { - colorbuffer[indexXY - width] = fast_color_add(colorbuffer[indexXY - width], seeppart); - if (carryover.color32) // note: check adds overhead but is faster on average - colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover); + colorbuffer[indexXY - width] = fast_color_scaleAdd(colorbuffer[indexXY - width], seeppart); + colorbuffer[indexXY] = fast_color_scaleAdd(colorbuffer[indexXY], carryover); } carryover = seeppart; indexXY += width; // next pixel in y direction @@ -1469,7 +1467,7 @@ void ParticleSystem1D::render() { CRGBW bg_color = SEGCOLOR(1); if (bg_color > 0) { //if not black for (int32_t i = 0; i <= maxXpixel; i++) { - framebuffer[i] = fast_color_add(framebuffer[i], bg_color); + framebuffer[i] = fast_color_scaleAdd(framebuffer[i], bg_color); } } #ifndef WLED_DISABLE_2D @@ -1492,7 +1490,7 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code) uint32_t x = particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D; if (x <= (uint32_t)maxXpixel) { //by making x unsigned there is no need to check < 0 as it will overflow - framebuffer[x] = fast_color_add(framebuffer[x], color, brightness); + framebuffer[x] = fast_color_scaleAdd(framebuffer[x], color, brightness); } return; } @@ -1529,8 +1527,8 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, //render particle to a bigger size //particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 
192 is 8 pixels, bigger is 10 pixels //first, render the pixel to the center of the renderbuffer, then apply 1D blurring - renderbuffer[4] = fast_color_add(renderbuffer[4], color, pxlbrightness[0]); - renderbuffer[5] = fast_color_add(renderbuffer[5], color, pxlbrightness[1]); + renderbuffer[4] = fast_color_scaleAdd(renderbuffer[4], color, pxlbrightness[0]); + renderbuffer[5] = fast_color_scaleAdd(renderbuffer[5], color, pxlbrightness[1]); uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented int he loop below to start with 4 uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below) uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max @@ -1564,7 +1562,7 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, #ifdef ESP8266 // no local buffer on ESP8266 SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true); #else - framebuffer[xfb] = fast_color_add(framebuffer[xfb], renderbuffer[xrb]); + framebuffer[xfb] = fast_color_scaleAdd(framebuffer[xfb], renderbuffer[xrb]); #endif } } @@ -1584,7 +1582,7 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, } for (uint32_t i = 0; i < 2; i++) { if (pxlisinframe[i]) { - framebuffer[pixco[i]] = fast_color_add(framebuffer[pixco[i]], color, pxlbrightness[i]); + framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]); } } } @@ -1836,9 +1834,8 @@ void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start) for (uint32_t x = start; x < start + size; x++) { seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours if (x > 0) { - colorbuffer[x-1] = fast_color_add(colorbuffer[x-1], seeppart); - if (carryover.color32) // note: check adds overhead but is faster on average - colorbuffer[x] = fast_color_add(colorbuffer[x], carryover); // is black 
on first pass + colorbuffer[x-1] = fast_color_scaleAdd(colorbuffer[x-1], seeppart); + colorbuffer[x] = fast_color_scaleAdd(colorbuffer[x], carryover); // is black on first pass } carryover = seeppart; } @@ -1887,28 +1884,34 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32 return true; // particle is in bounds } -// this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color +// this is a fast version for RGB color adding ignoring white channel (PS does not handle white) including scaling of second color // note: function is mainly used to add scaled colors, so checking if one color is black is slower -// note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer -static uint32_t WLED_O2_ATTR fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) { - uint32_t r, g, b; - r = c1.r + ((c2.r * scale) >> 8); - g = c1.g + ((c2.g * scale) >> 8); - b = c1.b + ((c2.b * scale) >> 8); - - // note: this chained comparison is the fastest method for max of 3 values (faster than std:max() or using xor) - uint32_t max = (r > g) ? ((r > b) ? r : b) : ((g > b) ? 
g : b); - if (max <= 255) { - c1.r = r; // save result to c1 - c1.g = g; - c1.b = b; - } else { - uint32_t newscale = (255U << 16) / max; - c1.r = (r * newscale) >> 16; - c1.g = (g * newscale) >> 16; - c1.b = (b * newscale) >> 16; - } - return c1.color32; +static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, const uint8_t scale) { + constexpr uint32_t MASK_RB = 0x00FF00FF; // red and blue mask + constexpr uint32_t MASK_G = 0x0000FF00; // green mask + + uint32_t rb = c2 & MASK_RB; // 0x00RR00BB + uint32_t g = c2 & MASK_G; // 0x0000GG00 + // scale second color + rb = ((rb * scale) >> 8) & MASK_RB; + g = ((g * scale) >> 8) & MASK_G; + // add colors + rb = (c1 & MASK_RB) + rb; + g = ((c1 & MASK_G) + g); + + // check for overflow by looking at the 9th bit of each channel + if ((rb | (g >> 8)) & 0x01000100) { + // find max among the three 16-bit values + g = g >> 8; // shift to get 0x000000GG + uint32_t max_val = (rb >> 16); // red + max_val = ((rb & 0xFFFF) > max_val) ? rb & 0xFFFF : max_val; // blue + max_val = (g > max_val) ? 
g : max_val; // green + // scale down to avoid saturation + uint32_t scale_factor = (255 << 8) / max_val; + rb = ((rb * scale_factor) >> 8) & MASK_RB; + g = (g * scale_factor) & MASK_G; + } + return rb | g; } #endif // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D)) diff --git a/wled00/colors.cpp b/wled00/colors.cpp index 0b95f88994..b17c0b6d61 100644 --- a/wled00/colors.cpp +++ b/wled00/colors.cpp @@ -25,39 +25,35 @@ uint32_t WLED_O2_ATTR IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, ui * original idea: https://github.com/wled-dev/WLED/pull/2465 by https://github.com/Proto-molecule * speed optimisations by @dedehai */ -uint32_t color_add(uint32_t c1, uint32_t c2, bool preserveCR) +uint32_t WLED_O2_ATTR color_add(uint32_t c1, uint32_t c2, bool preserveCR) //1212558 | 1212598 | 1212576 | 1212530 { if (c1 == BLACK) return c2; if (c2 == BLACK) return c1; const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF; // mask for R and B channels or W and G if negated uint32_t rb = ( c1 & TWO_CHANNEL_MASK) + ( c2 & TWO_CHANNEL_MASK); // mask and add two colors at once uint32_t wg = ((c1>>8) & TWO_CHANNEL_MASK) + ((c2>>8) & TWO_CHANNEL_MASK); - uint32_t r = rb >> 16; // extract single color values - uint32_t b = rb & 0xFFFF; - uint32_t w = wg >> 16; - uint32_t g = wg & 0xFFFF; if (preserveCR) { // preserve color ratios - uint32_t max = std::max(r,g); // check for overflow note - max = std::max(max,b); - max = std::max(max,w); - //unsigned max = r; // check for overflow note - //max = g > max ? g : max; - //max = b > max ? b : max; - //max = w > max ? 
w : max; - if (max > 255) { + uint32_t overflow = (rb | wg) & 0x01000100; // detect overflow by checking 9th bit + if (overflow) { + uint32_t r = rb >> 16; // extract single color values + uint32_t b = rb & 0xFFFF; + uint32_t w = wg >> 16; + uint32_t g = wg & 0xFFFF; + uint32_t max = std::max(r,g); + max = std::max(max,b); + max = std::max(max,w); const uint32_t scale = (uint32_t(255)<<8) / max; // division of two 8bit (shifted) values does not work -> use bit shifts and multiplaction instead rb = ((rb * scale) >> 8) & TWO_CHANNEL_MASK; wg = (wg * scale) & ~TWO_CHANNEL_MASK; } else wg <<= 8; //shift white and green back to correct position - return rb | wg; } else { - r = r > 255 ? 255 : r; - g = g > 255 ? 255 : g; - b = b > 255 ? 255 : b; - w = w > 255 ? 255 : w; - return RGBW32(r,g,b,w); + // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF) + rb |= ((rb & 0x01000100) - ((rb >> 8) & 0x00010001)) & 0x00FF00FF; + wg |= ((wg & 0x01000100) - ((wg >> 8) & 0x00010001)) & 0x00FF00FF; + wg <<= 8; // restore WG position } + return rb | wg; } /* diff --git a/wled00/colors.h b/wled00/colors.h index af7dd5cb69..b5a7befe8e 100644 --- a/wled00/colors.h +++ b/wled00/colors.h @@ -142,7 +142,7 @@ void setRandomColor(byte* rgb); // fast scaling function for colors, performs color*scale/256 for all four channels, speed over accuracy // note: inlining uses less code than actual function calls -static inline uint32_t fast_color_scale(uint32_t c, const uint8_t scale) { +static inline uint32_t fast_color_scale(const uint32_t c, const uint8_t scale) { uint32_t rb = (((c & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF; uint32_t wg = (((c>>8) & 0x00FF00FF) * scale) & ~0x00FF00FF; return rb | wg; From 2f5f76bb287bc3d5851f313ca3aff3fc610d2547 Mon Sep 17 00:00:00 2001 From: Damian Schneider Date: Tue, 23 Sep 2025 18:43:45 +0200 Subject: [PATCH 3/3] add branchless saturation example comments --- wled00/colors.cpp | 4 +++- 1 file changed, 3 
insertions(+), 1 deletion(-) diff --git a/wled00/colors.cpp b/wled00/colors.cpp index b17c0b6d61..1aa82fbd87 100644 --- a/wled00/colors.cpp +++ b/wled00/colors.cpp @@ -48,7 +48,9 @@ uint32_t WLED_O2_ATTR color_add(uint32_t c1, uint32_t c2, bool preserveCR) //121 wg = (wg * scale) & ~TWO_CHANNEL_MASK; } else wg <<= 8; //shift white and green back to correct position } else { - // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF) + // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF, input is 0xFF+0xFF=0x1FE max) + // example with overflow: input: 0x01EF01EF -> (0x01000100 - 0x00010001) = 0x00FF00FF -> input|0x00FF00FF = 0x01FF01FF (low byte of each channel saturated to 0xFF, 9th-bit carries stay set) + // example without overflow: input: 0x007F007F -> (0x00000000 - 0x00000000) = 0x00000000 -> input|0x00000000 = input (no change) rb |= ((rb & 0x01000100) - ((rb >> 8) & 0x00010001)) & 0x00FF00FF; wg |= ((wg & 0x01000100) - ((wg >> 8) & 0x00010001)) & 0x00FF00FF; wg <<= 8; // restore WG position