From c2e597888916373751bfc7c42c9ea07becfa0fc4 Mon Sep 17 00:00:00 2001 From: Jonathan Pobst Date: Mon, 30 Mar 2026 20:43:24 -1000 Subject: [PATCH] Vectorize GaussianBlur effect. --- Pinta.Effects/Effects/GaussianBlurEffect.cs | 338 +++++++++--------- Pinta.Effects/Effects/GlowEffect.cs | 2 +- Pinta.Effects/Effects/PencilSketchEffect.cs | 2 +- Pinta.Effects/Effects/SoftenPortraitEffect.cs | 2 +- .../Mocks/MockSystemService.cs | 10 + tests/PintaBenchmarks/Utilities/Utilities.cs | 1 + 6 files changed, 188 insertions(+), 167 deletions(-) create mode 100644 tests/PintaBenchmarks/Mocks/MockSystemService.cs diff --git a/Pinta.Effects/Effects/GaussianBlurEffect.cs b/Pinta.Effects/Effects/GaussianBlurEffect.cs index ed997d5a96..f03f335236 100644 --- a/Pinta.Effects/Effects/GaussianBlurEffect.cs +++ b/Pinta.Effects/Effects/GaussianBlurEffect.cs @@ -8,7 +8,11 @@ ///////////////////////////////////////////////////////////////////////////////// using System; +using System.Buffers; using System.Collections.Immutable; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; using System.Threading.Tasks; using Cairo; using Pinta.Core; @@ -19,7 +23,7 @@ public sealed class GaussianBlurEffect : BaseEffect { public override string Icon => Resources.Icons.EffectsBlursGaussianBlur; - public sealed override bool IsTileable => true; + public sealed override bool IsTileable => false; public override string Name => Translations.GetString ("Gaussian Blur"); @@ -31,18 +35,19 @@ public sealed class GaussianBlurEffect : BaseEffect private readonly IChromeService chrome; private readonly IWorkspaceService workspace; + private readonly ISystemService system; + public GaussianBlurEffect (IServiceProvider services) { chrome = services.GetService (); workspace = services.GetService (); + system = services.GetService (); EffectData = new GaussianBlurData (); } public override Task LaunchConfiguration () => chrome.LaunchSimpleEffectDialog (this, workspace); - #region Algorithm Code Ported From PDN - public static ImmutableArray CreateGaussianBlurRow (int amount) { int size = 1 + (amount * 2); @@ -58,189 +63,194 @@ public static ImmutableArray CreateGaussianBlurRow (int amount) return weights.MoveToImmutable (); } + // --- Separable two-pass Gaussian blur --- + // + // The 2D Gaussian kernel is separable: it can be decomposed into two + // sequential 1D convolutions (horizontal then vertical). This reduces + // the per-pixel work from O(kernel²) to O(2·kernel), giving a large + // speedup for bigger radii. + // + // Pass 1 (horizontal): For each pixel, convolve the source row with + // the 1D kernel and store the weighted sums in an intermediate buffer. + // Pass 2 (vertical): For each pixel, convolve the intermediate + // column with the 1D kernel and write the final result. + // + // Alpha handling: the source is premultiplied. We blur the premultiplied + // B/G/R channels and the alpha channel separately. The final straight- + // alpha color is recovered as (255 · blurred_premul_c / blurred_alpha), + // and the output is converted back to premultiplied format. public override void Render (ImageSurface src, ImageSurface dest, ReadOnlySpan rois) { if (Data.Radius == 0) return; // Copy src to dest int r = Data.Radius; - ImmutableArray w = CreateGaussianBlurRow (r); - int wlen = w.Length; - - Span waSums = stackalloc long[wlen]; - Span wcSums = stackalloc long[wlen]; - Span aSums = stackalloc long[wlen]; - Span bSums = stackalloc long[wlen]; - Span gSums = stackalloc long[wlen]; - Span rSums = stackalloc long[wlen]; - - // Cache these for a massive performance boost - int src_width = src.Width; - int src_height = src.Height; - ReadOnlySpan src_data = src.GetReadOnlyPixelData (); - Span dst_data = dest.GetPixelData (); - - foreach (var rect in rois) { - - if (rect.Height < 1 || rect.Width < 1) - continue; - - for (int y = rect.Top; y <= rect.Bottom; ++y) { - long waSum = 0; - long wcSum = 0; - long aSum = 0; - long bSum = 0; - long gSum = 0; - long rSum = 0; - - var dst_row = dst_data.Slice (y * src_width, src_width); - - for (int wx = 0; wx < wlen; ++wx) { - int srcX = rect.Left + wx - r; - waSums[wx] = 0; - wcSums[wx] = 0; - aSums[wx] = 0; - bSums[wx] = 0; - gSums[wx] = 0; - rSums[wx] = 0; - - if (srcX < 0 || srcX >= src_width) - continue; - - for (int wy = 0; wy < wlen; ++wy) { - int srcY = y + wy - r; - - if (srcY < 0 || srcY >= src_height) - continue; - - PointI pixelPosition = new (srcX, srcY); - - ColorBgra c = src.GetColorBgra (src_data, src_width, pixelPosition).ToStraightAlpha (); - int wp = w[wy]; - - waSums[wx] += wp; - wp *= c.A + (c.A >> 7); - wcSums[wx] += wp; - wp >>= 8; - - if (c.A > 0) { - aSums[wx] += wp * c.A; - bSums[wx] += wp * c.B; - gSums[wx] += wp * c.G; - rSums[wx] += wp * c.R; - } + ImmutableArray weights = CreateGaussianBlurRow (r); + int wlen = weights.Length; + int width = src.Width; + int height = src.Height; + int threads = system.RenderThreads; + + // --- Pass 1: Horizontal convolution (parallelized by row) --- + int size = width * height; + int[] h_b = new int[size]; + int[] h_g = new int[size]; + int[] h_r = new int[size]; + int[] h_a = new int[size]; + + // Precompute horizontal weight sums (depends only on x position) + long[] h_weight_sums = new long[width]; + for (int x = 0; x < width; ++x) { + long sum = 0; + int wx_start = Math.Max (0, r - x); + int wx_end = Math.Min (wlen, width - x + r); + for (int wx = wx_start; wx < wx_end; ++wx) + sum += weights[wx]; + h_weight_sums[x] = sum; + } + + Parallel.For (0, height, + new ParallelOptions { MaxDegreeOfParallelism = threads }, + y => { + ReadOnlySpan src_data = src.GetReadOnlyPixelData (); + int row_offset = y * width; + + for (int x = 0; x < width; ++x) { + int s_b = 0, s_g = 0, s_r = 0, s_a = 0; + + int wx_start = Math.Max (0, r - x); + int wx_end = Math.Min (wlen, width - x + r); + + for (int wx = wx_start; wx < wx_end; ++wx) { + int src_x = x + wx - r; + ColorBgra c = src_data[row_offset + src_x]; + int w = weights[wx]; + + s_b += w * c.B; + s_g += w * c.G; + s_r += w * c.R; + s_a += w * c.A; } - int wwx = w[wx]; - waSum += wwx * waSums[wx]; - wcSum += wwx * wcSums[wx]; - aSum += wwx * aSums[wx]; - bSum += wwx * bSums[wx]; - gSum += wwx * gSums[wx]; - rSum += wwx * rSums[wx]; + int idx = row_offset + x; + h_b[idx] = s_b; + h_g[idx] = s_g; + h_r[idx] = s_r; + h_a[idx] = s_a; } + }); + + // --- Pass 2: Vertical convolution (parallelized by row) --- + Parallel.For (0, height, + new ParallelOptions { MaxDegreeOfParallelism = threads }, + y => { + Span dst_data = dest.GetPixelData (); + RenderVerticalRow (dst_data, y, width, height, r, weights, wlen, h_b, h_g, h_r, h_a, h_weight_sums); + }); + } - wcSum >>= 8; + private static void RenderVerticalRow ( + Span dst_data, + int y, + int width, int height, int r, + ImmutableArray weights, int wlen, + int[] h_b, int[] h_g, int[] h_r, int[] h_a, + long[] h_weight_sums) + { + // Determine valid vertical kernel range for this row + int wy_start = Math.Max (0, r - y); + int wy_end = Math.Min (wlen, height - y + r); + + long v_weight_sum = 0; + for (int wy = wy_start; wy < wy_end; ++wy) + v_weight_sum += weights[wy]; + + // Rent accumulators from the pool to avoid per-row heap allocation + long[] rent_b = ArrayPool.Shared.Rent (width); + long[] rent_g = ArrayPool.Shared.Rent (width); + long[] rent_r = ArrayPool.Shared.Rent (width); + long[] rent_a = ArrayPool.Shared.Rent (width); + + try { + Span sum_b = rent_b.AsSpan (0, width); + Span sum_g = rent_g.AsSpan (0, width); + Span sum_r = rent_r.AsSpan (0, width); + Span sum_a = rent_a.AsSpan (0, width); + + sum_b.Clear (); + sum_g.Clear (); + sum_r.Clear (); + sum_a.Clear (); + + // Accumulate weighted intermediate rows (vertical convolution) + for (int wy = wy_start; wy < wy_end; ++wy) { + int src_y = y + wy - r; + int w = weights[wy]; + int row_offset = src_y * width; + + AccumulateRow (sum_b, h_b, row_offset, width, w); + AccumulateRow (sum_g, h_g, row_offset, width, w); + AccumulateRow (sum_r, h_r, row_offset, width, w); + AccumulateRow (sum_a, h_a, row_offset, width, w); + } - if (waSum == 0 || wcSum == 0) { - dst_row[rect.Left] = ColorBgra.Zero; - } else { - byte alpha = (byte) (aSum / waSum); - byte blue = (byte) (bSum / wcSum); - byte green = (byte) (gSum / wcSum); - byte red = (byte) (rSum / wcSum); + // Write output pixels + var dst_row = dst_data.Slice (y * width, width); - dst_row[rect.Left] = ColorBgra.FromBgra (blue, green, red, alpha).ToPremultipliedAlpha (); + for (int x = 0; x < width; ++x) { + long total_weight = h_weight_sums[x] * v_weight_sum; + + if (total_weight == 0 || sum_a[x] == 0) { + dst_row[x] = ColorBgra.Zero; + } else { + byte alpha = (byte) (sum_a[x] / total_weight); + byte blue = (byte) (sum_b[x] * 255 / sum_a[x]); + byte green = (byte) (sum_g[x] * 255 / sum_a[x]); + byte red = (byte) (sum_r[x] * 255 / sum_a[x]); + dst_row[x] = ColorBgra.FromBgra (blue, green, red, alpha).ToPremultipliedAlpha (); } + } + } finally { + ArrayPool.Shared.Return (rent_b); + ArrayPool.Shared.Return (rent_g); + ArrayPool.Shared.Return (rent_r); + ArrayPool.Shared.Return (rent_a); + } + } - for (int x = rect.Left + 1; x <= rect.Right; ++x) { - for (int i = 0; i < wlen - 1; ++i) { - waSums[i] = waSums[i + 1]; - wcSums[i] = wcSums[i + 1]; - aSums[i] = aSums[i + 1]; - bSums[i] = bSums[i + 1]; - gSums[i] = gSums[i + 1]; - rSums[i] = rSums[i + 1]; - } + /// + /// Adds weight × source[offset..offset+length] into the accumulator span, using + /// SIMD (Vector256) when available. + /// + [MethodImpl (MethodImplOptions.AggressiveInlining)] + private static void AccumulateRow (Span accumulator, int[] source, int source_offset, int length, int weight) + { + ref int src_ref = ref source[source_offset]; + ref long acc_ref = ref MemoryMarshal.GetReference (accumulator); + int i = 0; - waSum = 0; - wcSum = 0; - aSum = 0; - bSum = 0; - gSum = 0; - rSum = 0; - - int wx; - for (wx = 0; wx < wlen - 1; ++wx) { - long wwx = w[wx]; - waSum += wwx * waSums[wx]; - wcSum += wwx * wcSums[wx]; - aSum += wwx * aSums[wx]; - bSum += wwx * bSums[wx]; - gSum += wwx * gSums[wx]; - rSum += wwx * rSums[wx]; - } + if (Vector256.IsHardwareAccelerated && length >= Vector256.Count) { + Vector256 w_vec = Vector256.Create ((long) weight); - wx = wlen - 1; - - waSums[wx] = 0; - wcSums[wx] = 0; - aSums[wx] = 0; - bSums[wx] = 0; - gSums[wx] = 0; - rSums[wx] = 0; - - int srcX = x + wx - r; - - if (srcX >= 0 && srcX < src_width) { - for (int wy = 0; wy < wlen; ++wy) { - int srcY = y + wy - r; - - if (srcY < 0 || srcY >= src_height) - continue; - - ColorBgra c = src.GetColorBgra (src_data, src_width, new (srcX, srcY)).ToStraightAlpha (); - int wp = w[wy]; - - waSums[wx] += wp; - wp *= c.A + (c.A >> 7); - wcSums[wx] += wp; - wp >>= 8; - - if (c.A > 0) { - aSums[wx] += wp * (long) c.A; - bSums[wx] += wp * (long) c.B; - gSums[wx] += wp * (long) c.G; - rSums[wx] += wp * (long) c.R; - } - } - - int wr = w[wx]; - waSum += wr * waSums[wx]; - wcSum += wr * wcSums[wx]; - aSum += wr * aSums[wx]; - bSum += wr * bSums[wx]; - gSum += wr * gSums[wx]; - rSum += wr * rSums[wx]; - } + for (; i <= length - Vector256.Count; i += Vector256.Count) { + Vector256 src_vec = Vector256.LoadUnsafe (ref src_ref, (nuint) i); + (Vector256 lo, Vector256 hi) = Vector256.Widen (src_vec); - wcSum >>= 8; + Vector256 acc_lo = Vector256.LoadUnsafe (ref acc_ref, (nuint) i); + Vector256 acc_hi = Vector256.LoadUnsafe (ref acc_ref, (nuint) (i + Vector256.Count)); - if (waSum == 0 || wcSum == 0) { - dst_row[x] = ColorBgra.Zero; - } else { - byte alpha = (byte) (aSum / waSum); - byte blue = (byte) (bSum / wcSum); - byte green = (byte) (gSum / wcSum); - byte red = (byte) (rSum / wcSum); + acc_lo += lo * w_vec; + acc_hi += hi * w_vec; - dst_row[x] = ColorBgra.FromBgra (blue, green, red, alpha).ToPremultipliedAlpha (); - } - } + acc_lo.StoreUnsafe (ref acc_ref, (nuint) i); + acc_hi.StoreUnsafe (ref acc_ref, (nuint) (i + Vector256.Count)); } } + + // Scalar tail + for (; i < length; ++i) + Unsafe.Add (ref acc_ref, i) += (long) weight * Unsafe.Add (ref src_ref, i); } - #endregion public sealed class GaussianBlurData : EffectData { diff --git a/Pinta.Effects/Effects/GlowEffect.cs b/Pinta.Effects/Effects/GlowEffect.cs index 42eccad129..01cdddad8e 100644 --- a/Pinta.Effects/Effects/GlowEffect.cs +++ b/Pinta.Effects/Effects/GlowEffect.cs @@ -21,7 +21,7 @@ public sealed class GlowEffect : BaseEffect public override string Icon => Resources.Icons.EffectsPhotoGlow; - public sealed override bool IsTileable => true; + public sealed override bool IsTileable => false; public override string Name => Translations.GetString ("Glow"); diff --git a/Pinta.Effects/Effects/PencilSketchEffect.cs b/Pinta.Effects/Effects/PencilSketchEffect.cs index 1fee190000..24cb38f650 100644 --- a/Pinta.Effects/Effects/PencilSketchEffect.cs +++ b/Pinta.Effects/Effects/PencilSketchEffect.cs @@ -24,7 +24,7 @@ public sealed class PencilSketchEffect : BaseEffect public override string Icon => Resources.Icons.EffectsArtisticPencilSketch; - public sealed override bool IsTileable => true; + public sealed override bool IsTileable => false; public override string Name => Translations.GetString ("Pencil Sketch"); diff --git a/Pinta.Effects/Effects/SoftenPortraitEffect.cs b/Pinta.Effects/Effects/SoftenPortraitEffect.cs index f1ac581a3f..e8776bafdc 100644 --- a/Pinta.Effects/Effects/SoftenPortraitEffect.cs +++ b/Pinta.Effects/Effects/SoftenPortraitEffect.cs @@ -49,7 +49,7 @@ public sealed class SoftenPortraitEffect : BaseEffect public override string Icon => Resources.Icons.EffectsPhotoSoftenPortrait; - public sealed override bool IsTileable => true; + public sealed override bool IsTileable => false; public override string Name => Translations.GetString ("Soften Portrait"); diff --git a/tests/PintaBenchmarks/Mocks/MockSystemService.cs b/tests/PintaBenchmarks/Mocks/MockSystemService.cs new file mode 100644 index 0000000000..a391a8b1a5 --- /dev/null +++ b/tests/PintaBenchmarks/Mocks/MockSystemService.cs @@ -0,0 +1,10 @@ +using Pinta.Core; + +namespace PintaBenchmarks; + +internal class MockSystemService : ISystemService +{ + public int RenderThreads => Environment.ProcessorCount; + + public OS OperatingSystem => throw new NotImplementedException (); +} diff --git a/tests/PintaBenchmarks/Utilities/Utilities.cs b/tests/PintaBenchmarks/Utilities/Utilities.cs index 658d82b766..d028a0d404 100644 --- a/tests/PintaBenchmarks/Utilities/Utilities.cs +++ b/tests/PintaBenchmarks/Utilities/Utilities.cs @@ -13,6 +13,7 @@ public static IServiceProvider CreateMockServices () manager.AddService (new MockChromeManager ()); manager.AddService (new MockWorkspaceService (imageSize)); manager.AddService (new MockLivePreview (new RectangleI (0, 0, imageSize.Width, imageSize.Height))); + manager.AddService (new MockSystemService ()); return manager; } }