From 3641db289799649c7550b2760fb5f7067678ad35 Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Sun, 31 Aug 2025 16:14:07 +0200
Subject: [PATCH 1/3] speed optimizations, fix for restoreColorLossy, code
 cleanup

---
 wled00/FX_2Dfcn.cpp           | 14 +++++++-------
 wled00/FX_fcn.cpp             | 12 ++++++------
 wled00/FXparticleSystem.cpp   | 19 +++++--------------
 wled00/bus_manager.cpp        |  5 ++++-
 wled00/bus_manager.h          |  7 ++++---
 wled00/colors.cpp             | 24 ++++++++++++------------
 wled00/colors.h               |  9 ++++++++-
 wled00/const.h                |  2 ++
 wled00/data/settings_leds.htm |  4 ++--
 9 files changed, 50 insertions(+), 46 deletions(-)

diff --git a/wled00/FX_2Dfcn.cpp b/wled00/FX_2Dfcn.cpp
index 9a3c6fbe81..1fa50f1ee9 100644
--- a/wled00/FX_2Dfcn.cpp
+++ b/wled00/FX_2Dfcn.cpp
@@ -146,7 +146,7 @@ void WS2812FX::setUpMatrix() {
 #ifndef WLED_DISABLE_2D
 // pixel is clipped if it falls outside clipping range
 // if clipping start > stop the clipping range is inverted
-bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const {
+bool Segment::isPixelXYClipped(int x, int y) const {
   if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) {
     const bool invertX = _clipStart  > _clipStop;
     const bool invertY = _clipStartY > _clipStopY;
@@ -186,7 +186,7 @@ bool IRAM_ATTR_YN Segment::isPixelXYClipped(int x, int y) const {
 void IRAM_ATTR_YN Segment::setPixelColorXY(int x, int y, uint32_t col) const
 {
   if (!isActive()) return; // not active
-  if (x >= (int)vWidth() || y >= (int)vHeight() || x < 0 || y < 0) return;  // if pixel would fall out of virtual segment just exit
+  if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return;  // if pixel would fall out of virtual segment just exit
   setPixelColorXYRaw(x, y, col);
 }
 
@@ -236,7 +236,7 @@ void Segment::setPixelColorXY(float x, float y, uint32_t col, bool aa) const
 // returns RGBW values of pixel
 uint32_t IRAM_ATTR_YN Segment::getPixelColorXY(int x, int y) const {
   if (!isActive()) return 0; // not active
-  if (x >= (int)vWidth() || y >= (int)vHeight() || x<0 || y<0) return 0;  // if pixel would fall out of virtual segment just exit
+  if ((unsigned)x >= vWidth() || (unsigned)y >= vHeight()) return 0;  // if pixel would fall out of virtual segment just exit
   return getPixelColorXYRaw(x,y);
 }
 
@@ -256,8 +256,8 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const {
       uint32_t curnew = BLACK;
       for (unsigned x = 0; x < cols; x++) {
         uint32_t cur = getPixelColorRaw(XY(x, row));
-        uint32_t part = color_fade(cur, seepx);
-        curnew = color_fade(cur, keepx);
+        uint32_t part = fast_color_scale(cur, seepx);
+        curnew = fast_color_scale(cur, keepx);
         if (x > 0) {
           if (carryover) curnew = color_add(curnew, carryover);
           uint32_t prev = color_add(lastnew, part);
@@ -279,8 +279,8 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const {
       uint32_t curnew = BLACK;
       for (unsigned y = 0; y < rows; y++) {
         uint32_t cur = getPixelColorRaw(XY(col, y));
-        uint32_t part = color_fade(cur, seepy);
-        curnew = color_fade(cur, keepy);
+        uint32_t part = fast_color_scale(cur, seepy);
+        curnew = fast_color_scale(cur, keepy);
         if (y > 0) {
           if (carryover) curnew = color_add(curnew, carryover);
           uint32_t prev = color_add(lastnew, part);
diff --git a/wled00/FX_fcn.cpp b/wled00/FX_fcn.cpp
index 2f8d5515fd..677056770c 100644
--- a/wled00/FX_fcn.cpp
+++ b/wled00/FX_fcn.cpp
@@ -673,7 +673,7 @@ uint16_t Segment::maxMappingLength() const {
 #endif
 // pixel is clipped if it falls outside clipping range
 // if clipping start > stop the clipping range is inverted
-bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const {
+bool Segment::isPixelClipped(int i) const {
   if (blendingStyle != BLEND_STYLE_FADE && isInTransition() && _clipStart != _clipStop) {
     bool invert = _clipStart > _clipStop;  // ineverted start & stop
     int start = invert ? _clipStop : _clipStart;
@@ -691,7 +691,7 @@ bool IRAM_ATTR_YN Segment::isPixelClipped(int i) const {
   return false;
 }
 
-void IRAM_ATTR_YN Segment::setPixelColor(int i, uint32_t col) const
+void WLED_O2_ATTR Segment::setPixelColor(int i, uint32_t col) const
 {
   if (!isActive() || i < 0) return; // not active or invalid index
 #ifndef WLED_DISABLE_2D
@@ -904,7 +904,7 @@ void Segment::setPixelColor(float i, uint32_t col, bool aa) const
 }
 #endif
 
-uint32_t IRAM_ATTR_YN Segment::getPixelColor(int i) const
+uint32_t WLED_O2_ATTR Segment::getPixelColor(int i) const
 {
   if (!isActive() || i < 0) return 0; // not active or invalid index
 
@@ -1043,7 +1043,7 @@ void Segment::fadeToSecondaryBy(uint8_t fadeBy) const {
 void Segment::fadeToBlackBy(uint8_t fadeBy) const {
   if (!isActive() || fadeBy == 0) return;   // optimization - no scaling to apply
   const size_t rlength = rawLength();  // calculate only once
-  for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, color_fade(getPixelColorRaw(i), 255-fadeBy));
+  for (unsigned i = 0; i < rlength; i++) setPixelColorRaw(i, fast_color_scale(getPixelColorRaw(i), 255-fadeBy));
 }
 
 /*
@@ -1069,8 +1069,8 @@ void Segment::blur(uint8_t blur_amount, bool smear) const {
   uint32_t curnew = BLACK;
   for (unsigned i = 0; i < vlength; i++) {
     uint32_t cur = getPixelColorRaw(i);
-    uint32_t part = color_fade(cur, seep);
-    curnew = color_fade(cur, keep);
+    uint32_t part = fast_color_scale(cur, seep);
+    curnew = fast_color_scale(cur, keep);
     if (i > 0) {
       if (carryover) curnew = color_add(curnew, carryover);
       uint32_t prev = color_add(lastnew, part);
diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp
index 8b684a5f69..4446be8d81 100644
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -18,7 +18,6 @@
 static int32_t calcForce_dv(const int8_t force, uint8_t &counter);
 static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than particleradius
 static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
-static uint32_t fast_color_scale(CRGBW c, const uint8_t scale); // fast scaling function using 32bit variable and pointer. note: keep 'scale' within 0-255
 #endif
 
 #ifndef WLED_DISABLE_PARTICLESYSTEM2D
@@ -625,7 +624,7 @@ void ParticleSystem2D::render() {
 }
 
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
-__attribute__((optimize("O2"))) void ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
+void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW& color, const bool wrapX, const bool wrapY) {
   uint32_t size = particlesize;
   if (advPartProps && advPartProps[particleindex].size > 0) // use advanced size properties (0 means use global size including single pixel rendering)
     size = advPartProps[particleindex].size;
@@ -857,7 +856,7 @@ void ParticleSystem2D::handleCollisions() {
 
 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-__attribute__((optimize("O2"))) void ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
+void WLED_O2_ATTR ParticleSystem2D::collideParticles(PSparticle &particle1, PSparticle &particle2, int32_t dx, int32_t dy, const uint32_t collDistSq) {
   int32_t distanceSquared = dx * dx + dy * dy;
   // Calculate relative velocity note: could zero check but that does not improve overall speed but deminish it as that is rarely the case and pushing is still required
   int32_t relativeVx = (int32_t)particle2.vx - (int32_t)particle1.vx;
@@ -1485,7 +1484,7 @@ void ParticleSystem1D::render() {
 }
 
 // calculate pixel positions and brightness distribution and render the particle to local buffer or global buffer
-__attribute__((optimize("O2"))) void ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
+void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex, const uint8_t brightness, const CRGBW &color, const bool wrap) {
   uint32_t size = particlesize;
   if (advPartProps) // use advanced size properties (1D system has no large size global rendering TODO: add large global rendering?)
     size = advPartProps[particleindex].size;
@@ -1648,7 +1647,7 @@ void ParticleSystem1D::handleCollisions() {
 }
 // handle a collision if close proximity is detected, i.e. dx and/or dy smaller than 2*PS_P_RADIUS
 // takes two pointers to the particles to collide and the particle hardness (softer means more energy lost in collision, 255 means full hard)
-__attribute__((optimize("O2"))) void ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
+void WLED_O2_ATTR ParticleSystem1D::collideParticles(PSparticle1D &particle1, const PSparticleFlags1D &particle1flags, PSparticle1D &particle2, const PSparticleFlags1D &particle2flags, const int32_t dx, const uint32_t dx_abs, const uint32_t collisiondistance) {
   int32_t dv = particle2.vx - particle1.vx;
   int32_t dotProduct = (dx * dv); // is always negative if moving towards each other
 
@@ -1891,7 +1890,7 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
 // this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color
 // note: function is mainly used to add scaled colors, so checking if one color is black is slower
 // note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer
- __attribute__((optimize("O2"))) static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) {
+static uint32_t WLED_O2_ATTR fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) {
   uint32_t r, g, b;
   r = c1.r + ((c2.r * scale) >> 8);
   g = c1.g + ((c2.g * scale) >> 8);
@@ -1912,12 +1911,4 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
   return c1.color32;
 }
 
-// fast CRGBW color scaling ignoring white channel (PS does not handle white)
- __attribute__((optimize("O2"))) static uint32_t fast_color_scale(CRGBW c, const uint8_t scale) {
-  c.r = ((c.r * scale) >> 8);
-  c.g = ((c.g * scale) >> 8);
-  c.b = ((c.b * scale) >> 8);
-  return c.color32;
-}
-
 #endif  // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D))
diff --git a/wled00/bus_manager.cpp b/wled00/bus_manager.cpp
index 99523bba9f..d220149404 100644
--- a/wled00/bus_manager.cpp
+++ b/wled00/bus_manager.cpp
@@ -209,6 +209,7 @@ void BusDigital::estimateCurrent() {
 
 void BusDigital::applyBriLimit(uint8_t newBri) {
   // a newBri of 0 means calculate per-bus brightness limit
+  _NPBbri = 255; // reset, intermediate value is set below, final value is calculated in bus::show()
   if (newBri == 0) {
     if (_milliAmpsLimit == 0 || _milliAmpsTotal == 0) return; // ABL not used for this bus
     newBri = 255;
@@ -226,6 +227,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) {
   }
 
   if (newBri < 255) {
+    _NPBbri = newBri; // store value so it can be updated in show() (must be updated even if ABL is not used)
     uint8_t cctWW = 0, cctCW = 0;
     unsigned hwLen = _len;
     if (_type == TYPE_WS2812_1CH_X3) hwLen = NUM_ICS_WS2812_1CH_3X(_len); // only needs a third of "RGB" LEDs for NeoPixelBus
@@ -243,6 +245,7 @@ void BusDigital::applyBriLimit(uint8_t newBri) {
 
 void BusDigital::show() {
   if (!_valid) return;
+  _NPBbri = (_NPBbri * _bri) / 255;      // total applied brightness for use in restoreColorLossy (see applyBriLimit())
   PolyBus::show(_busPtr, _iType, _skip); // faster if buffer consistency is not important (no skipped LEDs)
 }
 
@@ -305,7 +308,7 @@ uint32_t IRAM_ATTR BusDigital::getPixelColor(unsigned pix) const {
   if (_reversed) pix = _len - pix -1;
   pix += _skip;
   const uint8_t co = _colorOrderMap.getPixelColorOrder(pix+_start, _colorOrder);
-  uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_bri);
+  uint32_t c = restoreColorLossy(PolyBus::getPixelColor(_busPtr, _iType, (_type==TYPE_WS2812_1CH_X3) ? IC_INDEX_WS2812_1CH_3X(pix) : pix, co),_NPBbri);
   if (_type == TYPE_WS2812_1CH_X3) { // map to correct IC, each controls 3 LEDs
     uint8_t r = R(c);
     uint8_t g = _reversed ? B(c) : G(c); // should G and B be switched if _reversed?
diff --git a/wled00/bus_manager.h b/wled00/bus_manager.h
index fe70a05170..87f2a595bd 100644
--- a/wled00/bus_manager.h
+++ b/wled00/bus_manager.h
@@ -105,6 +105,7 @@ class Bus {
     Bus(uint8_t type, uint16_t start, uint8_t aw, uint16_t len = 1, bool reversed = false, bool refresh = false)
     : _type(type)
     , _bri(255)
+    , _NPBbri(255)
     , _start(start)
     , _len(std::max(len,(uint16_t)1))
     , _reversed(reversed)
@@ -202,7 +203,9 @@ class Bus {
 
   protected:
     uint8_t  _type;
-    uint8_t  _bri;
+    uint8_t  _bri;    // bus brightness
+    uint8_t  _NPBbri; // total brightness applied to colors in NPB buffer (_bri + ABL)
+    uint8_t  _autoWhiteMode; // global Auto White Calculation override
     uint16_t _start;
     uint16_t _len;
     //struct { //using bitfield struct adds abour 250 bytes to binary size
@@ -213,8 +216,6 @@ class Bus {
       bool _hasWhite;//     : 1;
       bool _hasCCT;//       : 1;
     //} __attribute__ ((packed));
-    uint8_t  _autoWhiteMode;
-    // global Auto White Calculation override
     static uint8_t _gAWM;
     // _cct has the following meanings (see calculateCCT() & BusManager::setSegmentCCT()):
     //    -1 means to extract approximate CCT value in K from RGB (in calcualteCCT())
diff --git a/wled00/colors.cpp b/wled00/colors.cpp
index bf2b69d73a..0b95f88994 100644
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -8,7 +8,7 @@
  * color blend function, based on FastLED blend function
  * the calculation for each color is: result = (A*(amountOfA) + A + B*(amountOfB) + B) / 256 with amountOfA = 255 - amountOfB
  */
-uint32_t IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) {
+uint32_t WLED_O2_ATTR IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, uint8_t blend) {
   // min / max blend checking is omitted: calls with 0 or 255 are rare, checking lowers overall performance
   const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF;     // mask for R and B channels or W and G if negated (poorman's SIMD; https://github.com/wled/WLED/pull/4568#discussion_r1986587221)
   uint32_t rb1 =  color1       & TWO_CHANNEL_MASK;  // extract R & B channels from color1
@@ -92,15 +92,15 @@ uint32_t IRAM_ATTR color_fade(uint32_t c1, uint8_t amount, bool video) {
    note: inputs are 32bit to speed up the function, useful input value ranges are 0-255
  */
 uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten) {
-    if (rgb == 0 | hueShift + lighten + brighten == 0) return rgb; // black or no change
-    CHSV32 hsv;
-    rgb2hsv(rgb, hsv); //convert to HSV
-    hsv.h += (hueShift << 8); // shift hue (hue is 16 bits)
-    hsv.s =  max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate
-    hsv.v =  min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness
-    uint32_t rgb_adjusted;
-    hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion
-    return rgb_adjusted;
+  if (rgb == 0 || hueShift + lighten + brighten == 0) return rgb; // black or no change
+  CHSV32 hsv;
+  rgb2hsv(rgb, hsv); //convert to HSV
+  hsv.h += (hueShift << 8); // shift hue (hue is 16 bits)
+  hsv.s =  max((int32_t)0, (int32_t)hsv.s - (int32_t)lighten); // desaturate
+  hsv.v =  min((uint32_t)255, (uint32_t)hsv.v + brighten); // increase brightness
+  uint32_t rgb_adjusted;
+  hsv2rgb(hsv, rgb_adjusted); // convert back to RGB TODO: make this into 16 bit conversion
+  return rgb_adjusted;
 }
 
 // 1:1 replacement of fastled function optimized for ESP, slightly faster, more accurate and uses less flash (~ -200bytes)
@@ -597,13 +597,13 @@ void NeoGammaWLEDMethod::calcGammaTable(float gamma)
   gammaT_inv[0] = 0;
 }
 
-uint8_t IRAM_ATTR_YN NeoGammaWLEDMethod::Correct(uint8_t value)
+uint8_t NeoGammaWLEDMethod::Correct(uint8_t value)
 {
   if (!gammaCorrectCol) return value;
   return gammaT[value];
 }
 
-uint32_t IRAM_ATTR_YN NeoGammaWLEDMethod::inverseGamma32(uint32_t color)
+uint32_t NeoGammaWLEDMethod::inverseGamma32(uint32_t color)
 {
   if (!gammaCorrectCol) return color;
   uint8_t w = W(color);
diff --git a/wled00/colors.h b/wled00/colors.h
index 376959fd65..af7dd5cb69 100644
--- a/wled00/colors.h
+++ b/wled00/colors.h
@@ -117,6 +117,7 @@ class NeoGammaWLEDMethod {
 [[gnu::hot, gnu::pure]] uint32_t color_blend(uint32_t c1, uint32_t c2 , uint8_t blend);
 inline uint32_t color_blend16(uint32_t c1, uint32_t c2, uint16_t b) { return color_blend(c1, c2, b >> 8); };
 [[gnu::hot, gnu::pure]] uint32_t color_add(uint32_t, uint32_t, bool preserveCR = false);
+[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false);
 [[gnu::hot, gnu::pure]] uint32_t adjust_color(uint32_t rgb, uint32_t hueShift, uint32_t lighten, uint32_t brighten);
 [[gnu::hot, gnu::pure]] uint32_t ColorFromPaletteWLED(const CRGBPalette16 &pal, unsigned index, uint8_t brightness = (uint8_t)255U, TBlendType blendType = LINEARBLEND);
 CRGBPalette16 generateHarmonicRandomPalette(const CRGBPalette16 &basepalette);
@@ -139,6 +140,12 @@ uint32_t colorBalanceFromKelvin(uint16_t kelvin, uint32_t rgb);
 uint16_t approximateKelvinFromRGB(uint32_t rgb);
 void setRandomColor(byte* rgb);
 
-[[gnu::hot, gnu::pure]] uint32_t color_fade(uint32_t c1, uint8_t amount, bool video = false);
+// fast scaling function for colors, performs color*scale/256 for all four channels, speed over accuracy
+// note: inlining uses less code than actual function calls
+static inline uint32_t fast_color_scale(uint32_t c, const uint8_t scale) {
+  uint32_t rb = (((c     & 0x00FF00FF) * scale) >> 8) &  0x00FF00FF;
+  uint32_t wg = (((c>>8) & 0x00FF00FF) * scale)       & ~0x00FF00FF;
+  return rb | wg;
+}
 
 #endif
diff --git a/wled00/const.h b/wled00/const.h
index 1abf245396..b5fdc4ccf7 100644
--- a/wled00/const.h
+++ b/wled00/const.h
@@ -655,4 +655,6 @@ static_assert(WLED_MAX_BUSSES <= 32, "WLED_MAX_BUSSES exceeds hard limit");
   #define IRAM_ATTR_YN IRAM_ATTR
 #endif
 
+#define WLED_O2_ATTR __attribute__((optimize("O2")))
+
 #endif
diff --git a/wled00/data/settings_leds.htm b/wled00/data/settings_leds.htm
index 928da11753..a971e93143 100644
--- a/wled00/data/settings_leds.htm
+++ b/wled00/data/settings_leds.htm
@@ -43,13 +43,13 @@
 		}
 		function bLimits(b,v,p,m,l,o=5,d=2,a=6) {
 			maxB  = b; // maxB - max physical (analog + digital) buses: 32 - ESP32, 14 - S3/S2, 6 - C3, 4 - 8266
-			maxD  = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266
-			maxA  = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266
 			maxV  = v; // maxV - min virtual buses: 6 - ESP32/S3, 4 - S2/C3, 3 - ESP8266 (only used to distinguish S2/S3)
 			maxPB = p; // maxPB - max LEDs per bus
 			maxM  = m; // maxM - max LED memory
 			maxL  = l; // maxL - max LEDs (will serve to determine ESP >1664 == ESP32)
 			maxCO = o; // maxCO - max Color Order mappings
+			maxD  = d; // maxD - max digital channels (can be changed if using ESP32 parallel I2S): 16 - ESP32, 12 - S3/S2, 2 - C3, 3 - 8266
+			maxA  = a; // maxA - max analog channels: 16 - ESP32, 8 - S3/S2, 6 - C3, 5 - 8266
 		}
 		function is8266() { return maxA ==  5 && maxD ==  3; } // NOTE: see const.h
 		function is32()   { return maxA == 16 && maxD == 16; } // NOTE: see const.h

From fa0673284dca20fff4d3fae93a3875dfe43a9267 Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Sat, 13 Sep 2025 23:27:33 +0200
Subject: [PATCH 2/3] speed optimization in color_add, PS fast_color_add and
 blur functions

Apply more bit-manipulation and shift tricks to squeeze a bit more speed out of the color manipulation functions.
- Optimization on blur is based on work by @blazoncek
- Renamed PS fast_color_add() to fast_color_scaleAdd()
In my arbitrary speed tests, these changes resulted in 2-3% higher FPS.
---
 wled00/FX_2Dfcn.cpp         | 50 ++++++++------------
 wled00/FX_fcn.cpp           | 26 ++++-------
 wled00/FXparticleSystem.cpp | 91 +++++++++++++++++++------------------
 wled00/colors.cpp           | 34 ++++++--------
 wled00/colors.h             |  2 +-
 5 files changed, 93 insertions(+), 110 deletions(-)

diff --git a/wled00/FX_2Dfcn.cpp b/wled00/FX_2Dfcn.cpp
index 1fa50f1ee9..72b7f3d0bb 100644
--- a/wled00/FX_2Dfcn.cpp
+++ b/wled00/FX_2Dfcn.cpp
@@ -246,52 +246,42 @@ void Segment::blur2D(uint8_t blur_x, uint8_t blur_y, bool smear) const {
   const unsigned cols = vWidth();
   const unsigned rows = vHeight();
   const auto XY = [&](unsigned x, unsigned y){ return x + y*cols; };
-  uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration
-  uint32_t last;
   if (blur_x) {
     const uint8_t keepx = smear ? 255 : 255 - blur_x;
     const uint8_t seepx = blur_x >> 1;
     for (unsigned row = 0; row < rows; row++) { // blur rows (x direction)
-      uint32_t carryover = BLACK;
-      uint32_t curnew = BLACK;
-      for (unsigned x = 0; x < cols; x++) {
-        uint32_t cur = getPixelColorRaw(XY(x, row));
+      // handle first pixel in row to avoid conditional in loop (faster)
+      uint32_t cur = getPixelColorRaw(XY(0, row));
+      uint32_t carryover = fast_color_scale(cur, seepx);
+      setPixelColorRaw(XY(0, row), fast_color_scale(cur, keepx));
+      for (unsigned x = 1; x < cols; x++) {
+         cur = getPixelColorRaw(XY(x, row));
         uint32_t part = fast_color_scale(cur, seepx);
-        curnew = fast_color_scale(cur, keepx);
-        if (x > 0) {
-          if (carryover) curnew = color_add(curnew, carryover);
-          uint32_t prev = color_add(lastnew, part);
-          // optimization: only set pixel if color has changed
-          if (last != prev) setPixelColorRaw(XY(x - 1, row), prev);
-        } else setPixelColorRaw(XY(x, row), curnew); // first pixel
-        lastnew = curnew;
-        last = cur; // save original value for comparison on next iteration
+        cur = fast_color_scale(cur, keepx);
+        cur = color_add(cur, carryover);
+        setPixelColorRaw(XY(x - 1, row), color_add(getPixelColorRaw(XY(x-1, row)), part)); // previous pixel
+        setPixelColorRaw(XY(x, row), cur); // current pixel
         carryover = part;
       }
-      setPixelColorRaw(XY(cols-1, row), curnew); // set last pixel
     }
   }
   if (blur_y) {
     const uint8_t keepy = smear ? 255 : 255 - blur_y;
     const uint8_t seepy = blur_y >> 1;
     for (unsigned col = 0; col < cols; col++) {
-      uint32_t carryover = BLACK;
-      uint32_t curnew = BLACK;
-      for (unsigned y = 0; y < rows; y++) {
-        uint32_t cur = getPixelColorRaw(XY(col, y));
+      // handle first pixel in column
+      uint32_t cur = getPixelColorRaw(XY(col, 0));
+      uint32_t carryover = fast_color_scale(cur, seepy);
+      setPixelColorRaw(XY(col, 0), fast_color_scale(cur, keepy));
+      for (unsigned y = 1; y < rows; y++) {
+        cur = getPixelColorRaw(XY(col, y));
         uint32_t part = fast_color_scale(cur, seepy);
-        curnew = fast_color_scale(cur, keepy);
-        if (y > 0) {
-          if (carryover) curnew = color_add(curnew, carryover);
-          uint32_t prev = color_add(lastnew, part);
-          // optimization: only set pixel if color has changed
-          if (last != prev) setPixelColorRaw(XY(col, y - 1), prev);
-        } else setPixelColorRaw(XY(col, y), curnew); // first pixel
-        lastnew = curnew;
-        last = cur; //save original value for comparison on next iteration
+        cur = fast_color_scale(cur, keepy);
+        cur = color_add(cur, carryover);
+        setPixelColorRaw(XY(col, y - 1), color_add(getPixelColorRaw(XY(col, y-1)), part)); // previous pixel
+        setPixelColorRaw(XY(col, y), cur); // current pixel
         carryover = part;
       }
-      setPixelColorRaw(XY(col, rows - 1), curnew);
     }
   }
 }
diff --git a/wled00/FX_fcn.cpp b/wled00/FX_fcn.cpp
index 677056770c..d3b44f1e3a 100644
--- a/wled00/FX_fcn.cpp
+++ b/wled00/FX_fcn.cpp
@@ -1063,25 +1063,19 @@ void Segment::blur(uint8_t blur_amount, bool smear) const {
   uint8_t keep = smear ? 255 : 255 - blur_amount;
   uint8_t seep = blur_amount >> 1;
   unsigned vlength = vLength();
-  uint32_t carryover = BLACK;
-  uint32_t lastnew; // not necessary to initialize lastnew and last, as both will be initialized by the first loop iteration
-  uint32_t last;
-  uint32_t curnew = BLACK;
-  for (unsigned i = 0; i < vlength; i++) {
-    uint32_t cur = getPixelColorRaw(i);
+  // handle first pixel to avoid conditional in loop (faster)
+  uint32_t cur = getPixelColorRaw(0);
+  uint32_t carryover = fast_color_scale(cur, seep);
+  setPixelColorRaw(0, fast_color_scale(cur, keep));
+  for (unsigned i = 1; i < vlength; i++) {
+    cur = getPixelColorRaw(i);
     uint32_t part = fast_color_scale(cur, seep);
-    curnew = fast_color_scale(cur, keep);
-    if (i > 0) {
-      if (carryover) curnew = color_add(curnew, carryover);
-      uint32_t prev = color_add(lastnew, part);
-      // optimization: only set pixel if color has changed
-      if (last != prev) setPixelColorRaw(i - 1, prev);
-    } else setPixelColorRaw(i, curnew); // first pixel
-    lastnew = curnew;
-    last = cur; // save original value for comparison on next iteration
+    cur = fast_color_scale(cur, keep);
+    cur = color_add(cur, carryover);
+    setPixelColorRaw(i - 1, color_add(getPixelColorRaw(i - 1), part)); // previous pixel
+    setPixelColorRaw(i, cur); // current pixel
     carryover = part;
   }
-  setPixelColorRaw(vlength - 1, curnew);
 }
 
 /*
diff --git a/wled00/FXparticleSystem.cpp b/wled00/FXparticleSystem.cpp
index 4446be8d81..1a1ed08850 100644
--- a/wled00/FXparticleSystem.cpp
+++ b/wled00/FXparticleSystem.cpp
@@ -17,7 +17,7 @@
 // local shared functions (used both in 1D and 2D system)
 static int32_t calcForce_dv(const int8_t force, uint8_t &counter);
 static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32_t particleradius, const bool wrap); // returns false if out of bounds by more than particleradius
-static uint32_t fast_color_add(CRGBW c1, const CRGBW c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
+static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, uint8_t scale = 255); // fast and accurate color adding with scaling (scales c2 before adding)
 #endif
 
 #ifndef WLED_DISABLE_PARTICLESYSTEM2D
@@ -634,7 +634,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
     uint32_t y = particles[particleindex].y >> PS_P_RADIUS_SHIFT;
     if (x <= (uint32_t)maxXpixel && y <= (uint32_t)maxYpixel) {
       uint32_t index = x + (maxYpixel - y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-      framebuffer[index] = fast_color_add(framebuffer[index], color, brightness);
+      framebuffer[index] = fast_color_scaleAdd(framebuffer[index], color, brightness);
     }
     return;
   }
@@ -686,10 +686,10 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
     memset(renderbuffer, 0, sizeof(renderbuffer)); // clear buffer
     //particle size to pixels: < 64 is 4x4, < 128 is 6x6, < 192 is 8x8, bigger is 10x10
     //first, render the pixel to the center of the renderbuffer, then apply 2D blurring
-    renderbuffer[4 + (4 * 10)] = fast_color_add(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
-    renderbuffer[5 + (4 * 10)] = fast_color_add(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
-    renderbuffer[5 + (5 * 10)] = fast_color_add(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
-    renderbuffer[4 + (5 * 10)] = fast_color_add(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
+    renderbuffer[4 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (4 * 10)], color, pxlbrightness[0]); // order is: bottom left, bottom right, top right, top left
+    renderbuffer[5 + (4 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (4 * 10)], color, pxlbrightness[1]);
+    renderbuffer[5 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[5 + (5 * 10)], color, pxlbrightness[2]);
+    renderbuffer[4 + (5 * 10)] = fast_color_scaleAdd(renderbuffer[4 + (5 * 10)], color, pxlbrightness[3]);
     uint32_t rendersize = 2; // initialize render size, minimum is 4x4 pixels, it is incremented int he loop below to start with 4
     uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
     uint32_t maxsize = advPartProps[particleindex].size;
@@ -747,7 +747,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
           continue;
         }
         uint32_t idx = xfb + (maxYpixel - yfb) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-        framebuffer[idx] = fast_color_add(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
+        framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], renderbuffer[xrb + yrb * 10]);
       }
     }
     } else { // standard rendering (2x2 pixels)
@@ -784,7 +784,7 @@ void WLED_O2_ATTR ParticleSystem2D::renderParticle(const uint32_t particleindex,
     for (uint32_t i = 0; i < 4; i++) {
       if (pixelvalid[i]) {
         uint32_t idx = pixco[i].x + (maxYpixel - pixco[i].y) * (maxXpixel + 1); // flip y coordinate (0,0 is bottom left in PS but top left in framebuffer)
-        framebuffer[idx] = fast_color_add(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
+        framebuffer[idx] = fast_color_scaleAdd(framebuffer[idx], color, pxlbrightness[i]); // order is: bottom left, bottom right, top right, top left
       }
     }
   }
@@ -1027,9 +1027,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu
     for (uint32_t x = xstart; x < xstart + xsize; x++) {
       seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
       if (x > 0) {
-        colorbuffer[indexXY - 1] = fast_color_add(colorbuffer[indexXY - 1], seeppart);
-        if (carryover.color32) // note: check adds overhead but is faster on average
-          colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
+        colorbuffer[indexXY - 1] = fast_color_scaleAdd(colorbuffer[indexXY - 1], seeppart);
+        colorbuffer[indexXY]     = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
       }
       carryover = seeppart;
       indexXY++; // next pixel in x direction
@@ -1048,9 +1047,8 @@ void blur2D(uint32_t *colorbuffer, uint32_t xsize, uint32_t ysize, uint32_t xblu
     for (uint32_t y = ystart; y < ystart + ysize; y++) {
       seeppart = fast_color_scale(colorbuffer[indexXY], seep); // scale it and seep to neighbours
       if (y > 0) {
-        colorbuffer[indexXY - width] = fast_color_add(colorbuffer[indexXY - width], seeppart);
-        if (carryover.color32) // note: check adds overhead but is faster on average
-          colorbuffer[indexXY] = fast_color_add(colorbuffer[indexXY], carryover);
+        colorbuffer[indexXY - width] = fast_color_scaleAdd(colorbuffer[indexXY - width], seeppart);
+        colorbuffer[indexXY]         = fast_color_scaleAdd(colorbuffer[indexXY], carryover);
       }
       carryover = seeppart;
       indexXY += width; // next pixel in y direction
@@ -1469,7 +1467,7 @@ void ParticleSystem1D::render() {
   CRGBW bg_color = SEGCOLOR(1);
   if (bg_color > 0) { //if not black
     for (int32_t i = 0; i <= maxXpixel; i++) {
-      framebuffer[i] = fast_color_add(framebuffer[i], bg_color);
+      framebuffer[i] = fast_color_scaleAdd(framebuffer[i], bg_color);
     }
   }
 #ifndef WLED_DISABLE_2D
@@ -1492,7 +1490,7 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
   if (size == 0) { //single pixel particle, can be out of bounds as oob checking is made for 2-pixel particles (and updating it uses more code)
     uint32_t x =  particles[particleindex].x >> PS_P_RADIUS_SHIFT_1D;
     if (x <= (uint32_t)maxXpixel) { //by making x unsigned there is no need to check < 0 as it will overflow
-      framebuffer[x] = fast_color_add(framebuffer[x], color, brightness);
+      framebuffer[x] = fast_color_scaleAdd(framebuffer[x], color, brightness);
     }
     return;
   }
@@ -1529,8 +1527,8 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
     //render particle to a bigger size
     //particle size to pixels: 2 - 63 is 4 pixels, < 128 is 6pixels, < 192 is 8 pixels, bigger is 10 pixels
     //first, render the pixel to the center of the renderbuffer, then apply 1D blurring
-    renderbuffer[4] = fast_color_add(renderbuffer[4], color, pxlbrightness[0]);
-    renderbuffer[5] = fast_color_add(renderbuffer[5], color, pxlbrightness[1]);
+    renderbuffer[4] = fast_color_scaleAdd(renderbuffer[4], color, pxlbrightness[0]);
+    renderbuffer[5] = fast_color_scaleAdd(renderbuffer[5], color, pxlbrightness[1]);
     uint32_t rendersize = 2; // initialize render size, minimum is 4 pixels, it is incremented int he loop below to start with 4
     uint32_t offset = 4; // offset to zero coordinate to write/read data in renderbuffer (actually needs to be 3, is decremented in the loop below)
     uint32_t blurpasses = size/64 + 1; // number of blur passes depends on size, four passes max
@@ -1564,7 +1562,7 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
       #ifdef ESP8266 // no local buffer on ESP8266
       SEGMENT.addPixelColor(xfb, renderbuffer[xrb], true);
       #else
-      framebuffer[xfb] = fast_color_add(framebuffer[xfb], renderbuffer[xrb]);
+      framebuffer[xfb] = fast_color_scaleAdd(framebuffer[xfb], renderbuffer[xrb]);
       #endif
     }
   }
@@ -1584,7 +1582,7 @@ void WLED_O2_ATTR ParticleSystem1D::renderParticle(const uint32_t particleindex,
     }
     for (uint32_t i = 0; i < 2; i++) {
       if (pxlisinframe[i]) {
-        framebuffer[pixco[i]] = fast_color_add(framebuffer[pixco[i]], color, pxlbrightness[i]);
+        framebuffer[pixco[i]] = fast_color_scaleAdd(framebuffer[pixco[i]], color, pxlbrightness[i]);
       }
     }
   }
@@ -1836,9 +1834,8 @@ void blur1D(uint32_t *colorbuffer, uint32_t size, uint32_t blur, uint32_t start)
   for (uint32_t x = start; x < start + size; x++) {
     seeppart = fast_color_scale(colorbuffer[x], seep); // scale it and seep to neighbours
     if (x > 0) {
-      colorbuffer[x-1] = fast_color_add(colorbuffer[x-1], seeppart);
-      if (carryover.color32) // note: check adds overhead but is faster on average
-        colorbuffer[x] = fast_color_add(colorbuffer[x], carryover); // is black on first pass
+      colorbuffer[x-1] = fast_color_scaleAdd(colorbuffer[x-1], seeppart);
+      colorbuffer[x]   = fast_color_scaleAdd(colorbuffer[x], carryover); // is black on first pass
     }
     carryover = seeppart;
   }
@@ -1887,28 +1884,34 @@ static bool checkBoundsAndWrap(int32_t &position, const int32_t max, const int32
   return true; // particle is in bounds
 }
 
-// this is a fast version for CRGBW color adding ignoring white channel (PS does not handle white) including scaling of second color
+// fast RGB color adding: returns c1 + ((c2 * scale) >> 8) per channel, white is ignored and returned as 0 (PS does not handle white); on overflow all channels are scaled down together
 // note: function is mainly used to add scaled colors, so checking if one color is black is slower
-// note2: returning CRGBW value is slightly slower as the return value gets written to uint32_t framebuffer
-static uint32_t WLED_O2_ATTR fast_color_add(CRGBW c1, const CRGBW c2, const uint8_t scale) {
-  uint32_t r, g, b;
-  r = c1.r + ((c2.r * scale) >> 8);
-  g = c1.g + ((c2.g * scale) >> 8);
-  b = c1.b + ((c2.b * scale) >> 8);
-
-  // note: this chained comparison is the fastest method for max of 3 values (faster than std:max() or using xor)
-  uint32_t max = (r > g) ? ((r > b) ? r : b) : ((g > b) ? g : b);
-  if (max <= 255) {
-    c1.r = r; // save result to c1
-    c1.g = g;
-    c1.b = b;
-  } else {
-    uint32_t newscale = (255U << 16) / max;
-    c1.r = (r * newscale) >> 16;
-    c1.g = (g * newscale) >> 16;
-    c1.b = (b * newscale) >> 16;
-  }
-  return c1.color32;
+static uint32_t fast_color_scaleAdd(const uint32_t c1, const uint32_t c2, const uint8_t scale) {
+    constexpr uint32_t MASK_RB = 0x00FF00FF;  // red and blue mask
+    constexpr uint32_t MASK_G  = 0x0000FF00;  // green mask
+    // bits 24-31 (white) are covered by neither mask, so white never reaches the result
+    uint32_t rb = c2 & MASK_RB; // 0x00RR00BB
+    uint32_t g  = c2 & MASK_G;  // 0x0000GG00
+    // scale second color by scale/256; red and blue share one multiplication (each 8x8 bit product stays inside its own 16 bit lane)
+    rb = ((rb * scale) >> 8) & MASK_RB;
+    g  = ((g  * scale) >> 8) & MASK_G;
+    // add colors, a channel sum may now need 9 bits
+    rb = (c1 & MASK_RB) + rb;
+    g = ((c1 & MASK_G)  + g);
+    // sums are not clamped yet: red sits in bits 16-24, blue in bits 0-8, green in bits 8-16
+    // check for overflow by looking at the 9th bit of each channel (green is shifted down so its 9th bit lines up with blue's)
+    if ((rb | (g >> 8)) & 0x01000100) {
+        // find max among the three 16-bit values
+        g = g >> 8; // move the (up to 9 bit) green sum down to bit 0
+        uint32_t max_val = (rb >> 16); // red
+        max_val = ((rb & 0xFFFF) > max_val) ? rb & 0xFFFF : max_val;  // blue
+        max_val = (g > max_val) ? g : max_val; // green
+        // scale all channels by 255/max (8 bit fixed point) so the largest one fits into 8 bits and the color ratios are kept
+        uint32_t scale_factor = (255 << 8) / max_val; // max_val >= 256 in this branch, so no division by zero and scale_factor <= 255
+        rb = ((rb * scale_factor) >> 8) & MASK_RB;
+        g = (g * scale_factor) & MASK_G; // g was shifted down above, so the top byte of the product already sits in the green position
+    }
+    return rb | g;
 }
 
 #endif  // !(defined(WLED_DISABLE_PARTICLESYSTEM2D) && defined(WLED_DISABLE_PARTICLESYSTEM1D))
diff --git a/wled00/colors.cpp b/wled00/colors.cpp
index 0b95f88994..b17c0b6d61 100644
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -25,39 +25,35 @@ uint32_t WLED_O2_ATTR IRAM_ATTR color_blend(uint32_t color1, uint32_t color2, ui
  * original idea: https://github.com/wled-dev/WLED/pull/2465 by https://github.com/Proto-molecule
  * speed optimisations by @dedehai
  */
-uint32_t color_add(uint32_t c1, uint32_t c2, bool preserveCR)
+uint32_t WLED_O2_ATTR color_add(uint32_t c1, uint32_t c2, bool preserveCR) // per-channel RGBW add; preserveCR: scale the sum down to keep color ratios, otherwise saturate each channel at 255
 {
   if (c1 == BLACK) return c2;
   if (c2 == BLACK) return c1;
   const uint32_t TWO_CHANNEL_MASK = 0x00FF00FF; // mask for R and B channels or W and G if negated
   uint32_t rb = ( c1     & TWO_CHANNEL_MASK) + ( c2     & TWO_CHANNEL_MASK); // mask and add two colors at once
   uint32_t wg = ((c1>>8) & TWO_CHANNEL_MASK) + ((c2>>8) & TWO_CHANNEL_MASK);
-  uint32_t r = rb >> 16; // extract single color values
-  uint32_t b = rb & 0xFFFF;
-  uint32_t w = wg >> 16;
-  uint32_t g = wg & 0xFFFF;
 
   if (preserveCR) { // preserve color ratios
-    uint32_t max = std::max(r,g); // check for overflow note
-    max = std::max(max,b);
-    max = std::max(max,w);
-    //unsigned max = r; // check for overflow note
-    //max = g > max ? g : max;
-    //max = b > max ? b : max;
-    //max = w > max ? w : max;
-    if (max > 255) {
+    uint32_t overflow = (rb | wg) & 0x01000100; // detect overflow by checking 9th bit of each 16 bit lane
+    if (overflow) {
+      uint32_t r = rb >> 16; // extract single color values (only needed to find the largest channel)
+      uint32_t b = rb & 0xFFFF;
+      uint32_t w = wg >> 16;
+      uint32_t g = wg & 0xFFFF;
+      uint32_t max = std::max(r,g); // max is >= 256 here because at least one lane overflowed
+      max = std::max(max,b);
+      max = std::max(max,w);
       const uint32_t scale = (uint32_t(255)<<8) / max; // division of two 8bit (shifted) values does not work -> use bit shifts and multiplaction instead
       rb = ((rb * scale) >> 8) &  TWO_CHANNEL_MASK;
       wg =  (wg * scale)       & ~TWO_CHANNEL_MASK;
     } else wg <<= 8; //shift white and green back to correct position
-    return rb | wg;
   } else {
-    r = r > 255 ? 255 : r;
-    g = g > 255 ? 255 : g;
-    b = b > 255 ? 255 : b;
-    w = w > 255 ? 255 : w;
-    return RGBW32(r,g,b,w);
+    // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF)
+    rb |= ((rb & 0x01000100) - ((rb >> 8) & 0x00010001)) & 0x00FF00FF;
+    wg |= ((wg & 0x01000100) - ((wg >> 8) & 0x00010001)) & 0x00FF00FF;
+    wg <<= 8; // restore WG position
   }
+  return (rb & TWO_CHANNEL_MASK) | (wg & ~TWO_CHANNEL_MASK); // the saturating OR leaves the 9th (carry) bit set in an overflowed lane: mask it off so it cannot leak into the LSB of the neighbouring channel
 }
 
 /*
diff --git a/wled00/colors.h b/wled00/colors.h
index af7dd5cb69..b5a7befe8e 100644
--- a/wled00/colors.h
+++ b/wled00/colors.h
@@ -142,7 +142,7 @@ void setRandomColor(byte* rgb);
 
 // fast scaling function for colors, performs color*scale/256 for all four channels, speed over accuracy
 // note: inlining uses less code than actual function calls
-static inline uint32_t fast_color_scale(uint32_t c, const uint8_t scale) {
+static inline uint32_t fast_color_scale(const uint32_t c, const uint8_t scale) {
   uint32_t rb = (((c     & 0x00FF00FF) * scale) >> 8) &  0x00FF00FF;
   uint32_t wg = (((c>>8) & 0x00FF00FF) * scale)       & ~0x00FF00FF;
   return rb | wg;

From 2f5f76bb287bc3d5851f313ca3aff3fc610d2547 Mon Sep 17 00:00:00 2001
From: Damian Schneider <daedae@gmx.ch>
Date: Tue, 23 Sep 2025 18:43:45 +0200
Subject: [PATCH 3/3] add branchless saturation example comments

---
 wled00/colors.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/wled00/colors.cpp b/wled00/colors.cpp
index b17c0b6d61..1aa82fbd87 100644
--- a/wled00/colors.cpp
+++ b/wled00/colors.cpp
@@ -48,7 +48,9 @@ uint32_t WLED_O2_ATTR color_add(uint32_t c1, uint32_t c2, bool preserveCR) //121
       wg =  (wg * scale)       & ~TWO_CHANNEL_MASK;
     } else wg <<= 8; //shift white and green back to correct position
   } else {
-    // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF)
+    // branchless per-channel saturation to 255 (extract 9th bit, subtract 1 if it is set, mask with 0xFF, input is 0xFF+0xFF=0x1FE max)
+    // example with overflow: input: 0x01FE01FE -> (0x01000100 - 0x00010001) = 0x00FF00FF -> input|0x00FF00FF = 0x01FF01FF (low 8 bits of each channel saturated, the 9th carry bits stay set)
+    // example without overflow: input: 0x007F007F -> (0x00000000 - 0x00000000) = 0x00000000 -> input|0x00000000 = input  (no change)
     rb |= ((rb & 0x01000100) - ((rb >> 8) & 0x00010001)) & 0x00FF00FF;
     wg |= ((wg & 0x01000100) - ((wg >> 8) & 0x00010001)) & 0x00FF00FF;
     wg <<= 8; // restore WG position