From 01c3fab408d1ab354947889c21066e1559021d32 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 15:40:14 +0100
Subject: [PATCH 01/22] Add BokehBlur benchmark

---
 .../Samplers/BokehBlur.cs                     | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
diff --git a/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
new file mode 100644
index 000000000..1c3b1a7b2
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
@@ -0,0 +1,22 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Processing;
+
+namespace SixLabors.ImageSharp.Benchmarks.Samplers
+{
+    [Config(typeof(Config.MultiFramework))]
+    public class BokehBlur
+    {
+        [Benchmark]
+        public void Blur()
+        {
+            using (var image = new Image<Rgba32>(Configuration.Default, 400, 400, Color.White))
+            {
+                image.Mutate(c => c.BokehBlur());
+            }
+        }
+    }
+}

From bd6e555312bae10bd211a6bcdabc1ee1bcb5b87a Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 18:02:31 +0100
Subject: [PATCH 02/22] Minor code refactoring to improve flexibility

---
 .../Convolution2DRowOperation{TPixel}.cs      |  8 +++----
 .../Convolution/Convolution2DState.cs         |  8 +++----
 .../ConvolutionProcessor{TPixel}.cs           |  2 +-
 .../ConvolutionRowOperation{TPixel}.cs        | 12 +++++-----
 .../Convolution/ConvolutionState.cs           | 23 +++++++++++++++----
 .../Convolution/KernelSamplingMap.cs          | 11 +++++++--
 ...ReadOnlyKernel.cs => ReadOnlyKernel{T}.cs} | 19 +++++++++++----
 7 files changed, 57 insertions(+), 26 deletions(-)
 rename src/ImageSharp/Processing/Processors/Convolution/{ReadOnlyKernel.cs => ReadOnlyKernel{T}.cs} (73%)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
index 802d1809f..dd3e98609 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
@@ -80,8 +80,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer);
             ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer);
 
-            ReadOnlyKernel kernelY = state.KernelY;
-            ReadOnlyKernel kernelX = state.KernelX;
+            ReadOnlyKernel<float> kernelY = state.KernelY;
+            ReadOnlyKernel<float> kernelX = state.KernelX;
             Span<TPixel> sourceRow;
             for (int kY = 0; kY < kernelY.Rows; kY++)
             {
@@ -146,8 +146,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer);
             ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer);
 
-            ReadOnlyKernel kernelY = state.KernelY;
-            ReadOnlyKernel kernelX = state.KernelX;
+            ReadOnlyKernel<float> kernelY = state.KernelY;
+            ReadOnlyKernel<float> kernelX = state.KernelX;
             for (int kY = 0; kY < kernelY.Rows; kY++)
             {
                 // Get the precalculated source sample row for this kernel row and copy to our buffer.
diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
index 218093ac4..6f9b11857 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
@@ -23,21 +23,21 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             KernelSamplingMap map)
         {
             // We check the kernels are the same size upstream.
-            this.KernelY = new ReadOnlyKernel(kernelY);
-            this.KernelX = new ReadOnlyKernel(kernelX);
+            this.KernelY = new ReadOnlyKernel<float>(kernelY);
+            this.KernelX = new ReadOnlyKernel<float>(kernelX);
             this.kernelHeight = kernelY.Rows;
             this.kernelWidth = kernelY.Columns;
             this.rowOffsetMap = map.GetRowOffsetSpan();
             this.columnOffsetMap = map.GetColumnOffsetSpan();
         }
 
-        public readonly ReadOnlyKernel KernelY
+        public readonly ReadOnlyKernel<float> KernelY
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get;
         }
 
-        public readonly ReadOnlyKernel KernelX
+        public readonly ReadOnlyKernel<float> KernelX
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get;
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
index 924a1125b..b0254bc91 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
@@ -120,7 +120,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 ref Vector4 targetRowRef = ref MemoryMarshal.GetReference(span);
                 Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
 
-                var state = new ConvolutionState(in this.kernel, this.map);
+                var state = new ConvolutionState<float>(in this.kernel, this.map);
                 int row = y - this.bounds.Y;
                 ref int sampleRowBase = ref state.GetSampleRow(row);
 
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
index 9876b2885..beccfff01 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
@@ -67,14 +67,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
             Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
 
-            var state = new ConvolutionState(in this.kernelMatrix, this.map);
+            var state = new ConvolutionState<float>(in this.kernelMatrix, this.map);
             ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
 
             // Clear the target buffer for each row run.
             targetBuffer.Clear();
             ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
 
-            ReadOnlyKernel kernel = state.Kernel;
+            ReadOnlyKernel<float> kernel = state.Kernel;
             Span<TPixel> sourceRow;
             for (int kY = 0; kY < kernel.Rows; kY++)
             {
@@ -119,17 +119,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             // Span is 2x bounds.
             int boundsX = this.bounds.X;
             int boundsWidth = this.bounds.Width;
-            Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
-            Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
+            Span<Vector4> sourceBuffer = span.Slice(0, boundsWidth);
+            Span<Vector4> targetBuffer = span.Slice(boundsWidth);
 
-            var state = new ConvolutionState(in this.kernelMatrix, this.map);
+            var state = new ConvolutionState<float>(in this.kernelMatrix, this.map);
             ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
 
             // Clear the target buffer for each row run.
             targetBuffer.Clear();
             ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
 
-            ReadOnlyKernel kernel = state.Kernel;
+            ReadOnlyKernel<float> kernel = state.Kernel;
             for (int kY = 0; kY < kernel.Rows; kY++)
             {
                 // Get the precalculated source sample row for this kernel row and copy to our buffer.
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
index 3f296c67d..0b3dbc2d1 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
@@ -10,7 +10,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
     /// <summary>
     /// A stack only struct used for reducing reference indirection during convolution operations.
     /// </summary>
-    internal readonly ref struct ConvolutionState
+    /// <typeparam name="T">The type of values for the kernel in use.</typeparam>
+    internal readonly ref struct ConvolutionState<T>
+        where T : unmanaged, IEquatable<T>
     {
         private readonly Span<int> rowOffsetMap;
         private readonly Span<int> columnOffsetMap;
@@ -18,17 +20,30 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         private readonly int kernelWidth;
 
         public ConvolutionState(
-            in DenseMatrix<float> kernel,
+            in DenseMatrix<T> kernel,
             KernelSamplingMap map)
         {
-            this.Kernel = new ReadOnlyKernel(kernel);
+            this.Kernel = new ReadOnlyKernel<T>(kernel);
             this.kernelHeight = kernel.Rows;
             this.kernelWidth = kernel.Columns;
             this.rowOffsetMap = map.GetRowOffsetSpan();
             this.columnOffsetMap = map.GetColumnOffsetSpan();
         }
 
-        public readonly ReadOnlyKernel Kernel
+        public ConvolutionState(
+            T[] kernel,
+            int height,
+            int width,
+            KernelSamplingMap map)
+        {
+            this.Kernel = new ReadOnlyKernel<T>(kernel, height, width);
+            this.kernelHeight = height;
+            this.kernelWidth = width;
+            this.rowOffsetMap = map.GetRowOffsetSpan();
+            this.columnOffsetMap = map.GetColumnOffsetSpan();
+        }
+
+        public readonly ReadOnlyKernel<T> Kernel
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get;
diff --git a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
index e4b7dbea0..f912b9562 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
@@ -31,9 +31,16 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         /// <param name="kernel">The convolution kernel.</param>
         /// <param name="bounds">The source bounds.</param>
         public void BuildSamplingOffsetMap(DenseMatrix<float> kernel, Rectangle bounds)
+            => this.BuildSamplingOffsetMap(kernel.Rows, kernel.Columns, bounds);
+
+        /// <summary>
+        /// Builds a map of the sampling offsets for the kernel clamped by the given bounds.
+        /// </summary>
+        /// <param name="kernelHeight">The height (number of rows) of the convolution kernel to use.</param>
+        /// <param name="kernelWidth">The width (number of columns) of the convolution kernel to use.</param>
+        /// <param name="bounds">The source bounds.</param>
+        public void BuildSamplingOffsetMap(int kernelHeight, int kernelWidth, Rectangle bounds)
         {
-            int kernelHeight = kernel.Rows;
-            int kernelWidth = kernel.Columns;
             this.yOffsets = this.allocator.Allocate<int>(bounds.Height * kernelHeight);
             this.xOffsets = this.allocator.Allocate<int>(bounds.Width * kernelWidth);
 
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs
similarity index 73%
rename from src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
rename to src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs
index 37e006005..f95c3dc0a 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs
@@ -12,17 +12,26 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
     /// A stack only, readonly, kernel matrix that can be indexed without
     /// bounds checks when compiled in release mode.
     /// </summary>
-    internal readonly ref struct ReadOnlyKernel
+    /// <typeparam name="T">The type of items in the kernel.</typeparam>
+    internal readonly ref struct ReadOnlyKernel<T>
+        where T : unmanaged, IEquatable<T>
     {
-        private readonly ReadOnlySpan<float> values;
+        private readonly ReadOnlySpan<T> values;
 
-        public ReadOnlyKernel(DenseMatrix<float> matrix)
+        public ReadOnlyKernel(DenseMatrix<T> matrix)
         {
             this.Columns = matrix.Columns;
             this.Rows = matrix.Rows;
             this.values = matrix.Span;
         }
 
+        public ReadOnlyKernel(T[] kernel, int height, int width)
+        {
+            this.Columns = width;
+            this.Rows = height;
+            this.values = kernel;
+        }
+
         public int Columns
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -35,13 +44,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             get;
         }
 
-        public float this[int row, int column]
+        public T this[int row, int column]
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get
             {
                 this.CheckCoordinates(row, column);
-                ref float vBase = ref MemoryMarshal.GetReference(this.values);
+                ref T vBase = ref MemoryMarshal.GetReference(this.values);
                 return Unsafe.Add(ref vBase, (row * this.Columns) + column);
             }
         }

From b3f4befe5ecd75aad288bb1e315e5597ba5341b9 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 20:02:10 +0100
Subject: [PATCH 03/22] Switched bokeh blur to optimized pipeline

---
 .../Convolution/BokehBlurProcessor.cs         |  49 ++++++--
 .../Convolution/BokehBlurProcessor{TPixel}.cs | 115 ++++++++++++++----
 .../Convolution2PassProcessor{TPixel}.cs      |   3 -
 3 files changed, 132 insertions(+), 35 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
index 352960f41..e8f7351fa 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@@ -4,6 +4,7 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@@ -91,31 +92,30 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         /// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
         /// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
         /// </remarks>
-        internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
+        internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
         {
             private readonly Rectangle bounds;
             private readonly Buffer2D<Vector4> targetValues;
             private readonly Buffer2D<ComplexVector4> sourceValues;
+            private readonly KernelSamplingMap map;
             private readonly Complex64[] kernel;
             private readonly float z;
             private readonly float w;
-            private readonly int maxY;
-            private readonly int maxX;
 
             [MethodImpl(InliningOptions.ShortMethod)]
-            public ApplyHorizontalConvolutionRowOperation(
+            public SecondPassConvolutionRowOperation(
                 Rectangle bounds,
                 Buffer2D<Vector4> targetValues,
                 Buffer2D<ComplexVector4> sourceValues,
+                KernelSamplingMap map,
                 Complex64[] kernel,
                 float z,
                 float w)
             {
                 this.bounds = bounds;
-                this.maxY = this.bounds.Bottom - 1;
-                this.maxX = this.bounds.Right - 1;
                 this.targetValues = targetValues;
                 this.sourceValues = sourceValues;
+                this.map = map;
                 this.kernel = kernel;
                 this.z = z;
                 this.w = w;
@@ -125,11 +125,42 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             [MethodImpl(InliningOptions.ShortMethod)]
             public void Invoke(int y)
             {
-                Span<Vector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                Span<Vector4> targetBuffer = this.targetValues.GetRowSpan(y);
 
-                for (int x = 0; x < this.bounds.Width; x++)
+                var state = new ConvolutionState<Complex64>(this.kernel, this.kernel.Length, 1, this.map);
+                ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
+
+                // The target buffer is zeroed initially and then it accumulates the results
+                // of each partial convolution, so we don't have to clear it here as well.
+                ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+
+                ReadOnlyKernel<Complex64> kernel = state.Kernel;
+
+                for (int kY = 0; kY < kernel.Rows; kY++)
                 {
-                    Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
+                    // Get the precalculated source sample row for this kernel row and copy to our buffer.
+                    int sampleY = Unsafe.Add(ref sampleRowBase, kY);
+                    Span<ComplexVector4> sourceRow = this.sourceValues.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
+                    ref ComplexVector4 sourceBase = ref MemoryMarshal.GetReference(sourceRow);
+
+                    for (int x = 0; x < boundsWidth; x++)
+                    {
+                        ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                        ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
+                        ComplexVector4 pixel4 = default;
+
+                        for (int kX = 0; kX < kernel.Columns; kX++)
+                        {
+                            int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+                            ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+
+                            pixel4.Sum(kernel[kY, kX] * sample);
+                        }
+
+                        target += pixel4.WeightedSum(this.z, this.w);
+                    }
                 }
             }
         }
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index dfe54bf2e..aa6160799 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -26,6 +26,11 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         /// </summary>
         private readonly float gamma;
 
+        /// <summary>
+        /// The size of each complex convolution kernel.
+        /// </summary>
+        private readonly int kernelSize;
+
         /// <summary>
         /// The kernel parameters to use for the current instance (a: X, b: Y, A: Z, B: W)
         /// </summary>
@@ -47,11 +52,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             : base(configuration, source, sourceRectangle)
         {
             this.gamma = definition.Gamma;
+            this.kernelSize = (definition.Radius * 2) + 1;
 
             // Get the bokeh blur data
             BokehBlurKernelData data = BokehBlurKernelDataProvider.GetBokehBlurKernelData(
                 definition.Radius,
-                (definition.Radius * 2) + 1,
+                this.kernelSize,
                 definition.Components);
 
             this.kernelParameters = data.Parameters;
@@ -108,69 +114,132 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             Buffer2D<Vector4> processingBuffer)
         {
             // Allocate the buffer with the intermediate convolution results
-            using Buffer2D<ComplexVector4> firstPassBuffer = this.Configuration.MemoryAllocator.Allocate2D<ComplexVector4>(source.Size());
+            using Buffer2D<ComplexVector4> firstPassBuffer = configuration.MemoryAllocator.Allocate2D<ComplexVector4>(source.Size());
+
+            var interest = Rectangle.Intersect(sourceRectangle, source.Bounds());
+
+            // Unlike in the standard 2 pass convolution processor, we use a rectangle of 1x the interest width
+            // to speed up the actual convolution, by applying bulk pixel conversion and clamping calculation.
+            // There is no second half: the first pass directly targets the temporary buffer of complex pixel values.
+            // This is needed because the bokeh blur operates as TPixel -> complex -> TPixel, so we cannot
+            // convert back to standard pixels after each separate 1D convolution pass. Like in the gaussian
+            // blur though, we preallocate and compute the kernel sampling maps before processing each complex
+            // component, to avoid recomputing the same sampling map once per convolution pass.
+            using var mapX = new KernelSamplingMap(configuration.MemoryAllocator);
+            using var mapY = new KernelSamplingMap(configuration.MemoryAllocator);
+
+            mapX.BuildSamplingOffsetMap(1, this.kernelSize, interest);
+            mapY.BuildSamplingOffsetMap(this.kernelSize, 1, interest);
 
-            // Perform two 1D convolutions for each component in the current instance
             ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan());
             ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan());
+
+            // Perform two 1D convolutions for each component in the current instance
             for (int i = 0; i < this.kernels.Length; i++)
             {
                 // Compute the resulting complex buffer for the current component
                 Complex64[] kernel = Unsafe.Add(ref baseRef, i);
                 Vector4 parameters = Unsafe.Add(ref paramsRef, i);
 
-                // Compute the vertical 1D convolution
-                var verticalOperation = new ApplyVerticalConvolutionRowOperation(sourceRectangle, firstPassBuffer, source.PixelBuffer, kernel);
-                ParallelRowIterator.IterateRows(
+                // Horizontal convolution
+                var horizontalOperation = new FirstPassConvolutionRowOperation(
+                    interest,
+                    firstPassBuffer,
+                    source.PixelBuffer,
+                    mapX,
+                    kernel,
+                    configuration);
+
+                ParallelRowIterator.IterateRows<FirstPassConvolutionRowOperation, Vector4>(
                     configuration,
-                    sourceRectangle,
-                    in verticalOperation);
+                    interest,
+                    in horizontalOperation);
+
+                // Vertical 1D convolutions to accumulate the partial results on the target buffer
+                var verticalOperation = new BokehBlurProcessor.SecondPassConvolutionRowOperation(
+                    interest,
+                    processingBuffer,
+                    firstPassBuffer,
+                    mapY,
+                    kernel,
+                    parameters.Z,
+                    parameters.W);
 
-                // Compute the horizontal 1D convolutions and accumulate the partial results on the target buffer
-                var horizontalOperation = new BokehBlurProcessor.ApplyHorizontalConvolutionRowOperation(sourceRectangle, processingBuffer, firstPassBuffer, kernel, parameters.Z, parameters.W);
                 ParallelRowIterator.IterateRows(
                     configuration,
-                    sourceRectangle,
-                    in horizontalOperation);
+                    interest,
+                    in verticalOperation);
             }
         }
 
         /// <summary>
         /// A <see langword="struct"/> implementing the vertical convolution logic for <see cref="BokehBlurProcessor{T}"/>.
         /// </summary>
-        private readonly struct ApplyVerticalConvolutionRowOperation : IRowOperation
+        private readonly struct FirstPassConvolutionRowOperation : IRowOperation<Vector4>
         {
             private readonly Rectangle bounds;
             private readonly Buffer2D<ComplexVector4> targetValues;
             private readonly Buffer2D<TPixel> sourcePixels;
+            private readonly KernelSamplingMap map;
             private readonly Complex64[] kernel;
-            private readonly int maxY;
-            private readonly int maxX;
+            private readonly Configuration configuration;
 
             [MethodImpl(InliningOptions.ShortMethod)]
-            public ApplyVerticalConvolutionRowOperation(
+            public FirstPassConvolutionRowOperation(
                 Rectangle bounds,
                 Buffer2D<ComplexVector4> targetValues,
                 Buffer2D<TPixel> sourcePixels,
-                Complex64[] kernel)
+                KernelSamplingMap map,
+                Complex64[] kernel,
+                Configuration configuration)
             {
                 this.bounds = bounds;
-                this.maxY = this.bounds.Bottom - 1;
-                this.maxX = this.bounds.Right - 1;
                 this.targetValues = targetValues;
                 this.sourcePixels = sourcePixels;
+                this.map = map;
                 this.kernel = kernel;
+                this.configuration = configuration;
             }
 
             /// <inheritdoc/>
             [MethodImpl(InliningOptions.ShortMethod)]
-            public void Invoke(int y)
+            public void Invoke(int y, Span<Vector4> span)
             {
-                Span<ComplexVector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
 
-                for (int x = 0; x < this.bounds.Width; x++)
+                var state = new ConvolutionState<Complex64>(this.kernel, 1, this.kernel.Length, this.map);
+                ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
+
+                Span<ComplexVector4> targetBuffer = this.targetValues.GetRowSpan(y);
+
+                // Clear the target buffer
+                targetBuffer.Clear();
+                ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+
+                ReadOnlyKernel<Complex64> kernel = state.Kernel;
+
+                for (int kY = 0; kY < kernel.Rows; kY++)
                 {
-                    Buffer2DUtils.Convolve4(this.kernel, this.sourcePixels, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX);
+                    // Get the precalculated source sample row for this kernel row and copy to our buffer.
+                    int sampleY = Unsafe.Add(ref sampleRowBase, kY);
+                    Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
+                    PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, span);
+
+                    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
+
+                    for (int x = 0; x < span.Length; x++)
+                    {
+                        ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                        ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x);
+
+                        for (int kX = 0; kX < kernel.Columns; kX++)
+                        {
+                            int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+                            Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                            target.Sum(kernel[kY, kX] * sample);
+                        }
+                    }
                 }
             }
         }
diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
index 151b0ffcc..16ce0fdd7 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
@@ -1,10 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.
 
-using System;
 using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

From ca1a67a36cacd3c95c8a1fd91fdc994d57460b08 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 22:10:36 +0100
Subject: [PATCH 04/22] Specialize bokeh blur operations for 1D kernels

---
 .../Convolution/BokehBlurProcessor.cs         | 34 +++++++-------
 .../Convolution/BokehBlurProcessor{TPixel}.cs | 46 ++++++++++---------
 2 files changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
index e8f7351fa..edaac45b6 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@@ -127,39 +127,37 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             {
                 int boundsX = this.bounds.X;
                 int boundsWidth = this.bounds.Width;
-                Span<Vector4> targetBuffer = this.targetValues.GetRowSpan(y);
+                int kernelSize = this.kernel.Length;
 
-                var state = new ConvolutionState<Complex64>(this.kernel, this.kernel.Length, 1, this.map);
-                ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
+                Span<int> rowOffsets = this.map.GetRowOffsetSpan();
+                Span<int> columnOffsets = this.map.GetColumnOffsetSpan();
+                ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
+                ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets);
 
                 // The target buffer is zeroed initially and then it accumulates the results
-                // of each partial convolution, so we don't have to clear it here as well.
-                ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+                // of each partial convolution, so we don't have to clear it here as well
+                Span<Vector4> targetBuffer = this.targetValues.GetRowSpan(y);
 
-                ReadOnlyKernel<Complex64> kernel = state.Kernel;
+                ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+                ref Complex64 kernelBase = ref this.kernel[0];
 
-                for (int kY = 0; kY < kernel.Rows; kY++)
+                for (int kY = 0; kY < kernelSize; kY++)
                 {
-                    // Get the precalculated source sample row for this kernel row and copy to our buffer.
+                    // Get the precalculated source sample row for this kernel row and copy to our buffer
                     int sampleY = Unsafe.Add(ref sampleRowBase, kY);
                     Span<ComplexVector4> sourceRow = this.sourceValues.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
                     ref ComplexVector4 sourceBase = ref MemoryMarshal.GetReference(sourceRow);
+                    Complex64 factor = Unsafe.Add(ref kernelBase, kY);
 
                     for (int x = 0; x < boundsWidth; x++)
                     {
-                        ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                        int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX;
                         ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-                        ComplexVector4 pixel4 = default;
-
-                        for (int kX = 0; kX < kernel.Columns; kX++)
-                        {
-                            int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
-                            ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX);
 
-                            pixel4.Sum(kernel[kY, kX] * sample);
-                        }
+                        ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                        ComplexVector4 partial = factor * sample;
 
-                        target += pixel4.WeightedSum(this.z, this.w);
+                        target += partial.WeightedSum(this.z, this.w);
                     }
                 }
             }
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index aa6160799..cdadd4dee 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -207,39 +207,41 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             {
                 int boundsX = this.bounds.X;
                 int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;
 
-                var state = new ConvolutionState<Complex64>(this.kernel, 1, this.kernel.Length, this.map);
-                ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
+                Span<int> rowOffsets = this.map.GetRowOffsetSpan();
+                Span<int> columnOffsets = this.map.GetColumnOffsetSpan();
+                int sampleY = Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), y - this.bounds.Y);
+                ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets);
 
+                // Clear the target buffer for each row run
                 Span<ComplexVector4> targetBuffer = this.targetValues.GetRowSpan(y);
-
-                // Clear the target buffer
                 targetBuffer.Clear();
                 ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
 
-                ReadOnlyKernel<Complex64> kernel = state.Kernel;
+                // Execute the bulk pixel format conversion for the current row
+                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, span);
 
-                for (int kY = 0; kY < kernel.Rows; kY++)
-                {
-                    // Get the precalculated source sample row for this kernel row and copy to our buffer.
-                    int sampleY = Unsafe.Add(ref sampleRowBase, kY);
-                    Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
-                    PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, span);
+                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
+                ref Complex64 kernelBase = ref this.kernel[0];
 
-                    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
+                for (int x = 0; x < span.Length; x++)
+                {
+                    ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x);
 
-                    for (int x = 0; x < span.Length; x++)
+                    for (int kX = 0; kX < kernelSize; kX++)
                     {
-                        ref int sampleColumnBase = ref state.GetSampleColumn(x);
-                        ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x);
-
-                        for (int kX = 0; kX < kernel.Columns; kX++)
-                        {
-                            int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
-                            Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
-                            target.Sum(kernel[kY, kX] * sample);
-                        }
+                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                        Complex64 factor = Unsafe.Add(ref kernelBase, kX);
+
+                        target.Sum(factor * sample);
                     }
+
+                    // Shift the base column sampling reference by one row at the end of each outer
+                    // iteration so that the inner tight loop indexing can skip the multiplication
+                    sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize);
                 }
             }
         }

From 22f151286928f582dc522dcb3e3609cb737428d6 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 22:15:00 +0100
Subject: [PATCH 05/22] Minor code tweaks

---
 .../Processors/Convolution/BokehBlurProcessor.cs          | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
index edaac45b6..243bc46cb 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@@ -136,24 +136,20 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
 
                 // The target buffer is zeroed initially and then it accumulates the results
                 // of each partial convolution, so we don't have to clear it here as well
-                Span<Vector4> targetBuffer = this.targetValues.GetRowSpan(y);
-
-                ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+                ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(0, y);
                 ref Complex64 kernelBase = ref this.kernel[0];
 
                 for (int kY = 0; kY < kernelSize; kY++)
                 {
                     // Get the precalculated source sample row for this kernel row and copy to our buffer
                     int sampleY = Unsafe.Add(ref sampleRowBase, kY);
-                    Span<ComplexVector4> sourceRow = this.sourceValues.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
-                    ref ComplexVector4 sourceBase = ref MemoryMarshal.GetReference(sourceRow);
+                    ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(boundsX, sampleY);
                     Complex64 factor = Unsafe.Add(ref kernelBase, kY);
 
                     for (int x = 0; x < boundsWidth; x++)
                     {
                         int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX;
                         ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-
                         ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX);
                         ComplexVector4 partial = factor * sample;
 

From 68eeca928295ef1ace6a2e034e11073c54ab278b Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 22:19:24 +0100
Subject: [PATCH 06/22] Revert temporary generic kernel changes

---
 .../Convolution2DRowOperation{TPixel}.cs      |  8 +++----
 .../Convolution/Convolution2DState.cs         |  8 +++----
 .../ConvolutionProcessor{TPixel}.cs           |  2 +-
 .../ConvolutionRowOperation{TPixel}.cs        | 12 +++++-----
 .../Convolution/ConvolutionState.cs           | 23 ++++---------------
 ...ReadOnlyKernel{T}.cs => ReadOnlyKernel.cs} | 19 ++++-----------
 6 files changed, 24 insertions(+), 48 deletions(-)
 rename src/ImageSharp/Processing/Processors/Convolution/{ReadOnlyKernel{T}.cs => ReadOnlyKernel.cs} (73%)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
index dd3e98609..802d1809f 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
@@ -80,8 +80,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer);
             ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer);
 
-            ReadOnlyKernel<float> kernelY = state.KernelY;
-            ReadOnlyKernel<float> kernelX = state.KernelX;
+            ReadOnlyKernel kernelY = state.KernelY;
+            ReadOnlyKernel kernelX = state.KernelX;
             Span<TPixel> sourceRow;
             for (int kY = 0; kY < kernelY.Rows; kY++)
             {
@@ -146,8 +146,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer);
             ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer);
 
-            ReadOnlyKernel<float> kernelY = state.KernelY;
-            ReadOnlyKernel<float> kernelX = state.KernelX;
+            ReadOnlyKernel kernelY = state.KernelY;
+            ReadOnlyKernel kernelX = state.KernelX;
             for (int kY = 0; kY < kernelY.Rows; kY++)
             {
                 // Get the precalculated source sample row for this kernel row and copy to our buffer.
diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
index 6f9b11857..218093ac4 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
@@ -23,21 +23,21 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             KernelSamplingMap map)
         {
             // We check the kernels are the same size upstream.
-            this.KernelY = new ReadOnlyKernel<float>(kernelY);
-            this.KernelX = new ReadOnlyKernel<float>(kernelX);
+            this.KernelY = new ReadOnlyKernel(kernelY);
+            this.KernelX = new ReadOnlyKernel(kernelX);
             this.kernelHeight = kernelY.Rows;
             this.kernelWidth = kernelY.Columns;
             this.rowOffsetMap = map.GetRowOffsetSpan();
             this.columnOffsetMap = map.GetColumnOffsetSpan();
         }
 
-        public readonly ReadOnlyKernel<float> KernelY
+        public readonly ReadOnlyKernel KernelY
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get;
         }
 
-        public readonly ReadOnlyKernel<float> KernelX
+        public readonly ReadOnlyKernel KernelX
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get;
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
index b0254bc91..924a1125b 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
@@ -120,7 +120,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 ref Vector4 targetRowRef = ref MemoryMarshal.GetReference(span);
                 Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
 
-                var state = new ConvolutionState<float>(in this.kernel, this.map);
+                var state = new ConvolutionState(in this.kernel, this.map);
                 int row = y - this.bounds.Y;
                 ref int sampleRowBase = ref state.GetSampleRow(row);
 
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
index beccfff01..9876b2885 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
@@ -67,14 +67,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
             Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
 
-            var state = new ConvolutionState<float>(in this.kernelMatrix, this.map);
+            var state = new ConvolutionState(in this.kernelMatrix, this.map);
             ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
 
             // Clear the target buffer for each row run.
             targetBuffer.Clear();
             ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
 
-            ReadOnlyKernel<float> kernel = state.Kernel;
+            ReadOnlyKernel kernel = state.Kernel;
             Span<TPixel> sourceRow;
             for (int kY = 0; kY < kernel.Rows; kY++)
             {
@@ -119,17 +119,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             // Span is 2x bounds.
             int boundsX = this.bounds.X;
             int boundsWidth = this.bounds.Width;
-            Span<Vector4> sourceBuffer = span.Slice(0, boundsWidth);
-            Span<Vector4> targetBuffer = span.Slice(boundsWidth);
+            Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
+            Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
 
-            var state = new ConvolutionState<float>(in this.kernelMatrix, this.map);
+            var state = new ConvolutionState(in this.kernelMatrix, this.map);
             ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
 
             // Clear the target buffer for each row run.
             targetBuffer.Clear();
             ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
 
-            ReadOnlyKernel<float> kernel = state.Kernel;
+            ReadOnlyKernel kernel = state.Kernel;
             for (int kY = 0; kY < kernel.Rows; kY++)
             {
                 // Get the precalculated source sample row for this kernel row and copy to our buffer.
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
index 0b3dbc2d1..3f296c67d 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
@@ -10,9 +10,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
     /// <summary>
     /// A stack only struct used for reducing reference indirection during convolution operations.
     /// </summary>
-    /// <typeparam name="T">The type of values for the kernel in use.</typeparam>
-    internal readonly ref struct ConvolutionState<T>
-        where T : unmanaged, IEquatable<T>
+    internal readonly ref struct ConvolutionState
     {
         private readonly Span<int> rowOffsetMap;
         private readonly Span<int> columnOffsetMap;
@@ -20,30 +18,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         private readonly int kernelWidth;
 
         public ConvolutionState(
-            in DenseMatrix<T> kernel,
+            in DenseMatrix<float> kernel,
             KernelSamplingMap map)
         {
-            this.Kernel = new ReadOnlyKernel<T>(kernel);
+            this.Kernel = new ReadOnlyKernel(kernel);
             this.kernelHeight = kernel.Rows;
             this.kernelWidth = kernel.Columns;
             this.rowOffsetMap = map.GetRowOffsetSpan();
             this.columnOffsetMap = map.GetColumnOffsetSpan();
         }
 
-        public ConvolutionState(
-            T[] kernel,
-            int height,
-            int width,
-            KernelSamplingMap map)
-        {
-            this.Kernel = new ReadOnlyKernel<T>(kernel, height, width);
-            this.kernelHeight = height;
-            this.kernelWidth = width;
-            this.rowOffsetMap = map.GetRowOffsetSpan();
-            this.columnOffsetMap = map.GetColumnOffsetSpan();
-        }
-
-        public readonly ReadOnlyKernel<T> Kernel
+        public readonly ReadOnlyKernel Kernel
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get;
diff --git a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
similarity index 73%
rename from src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs
rename to src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
index f95c3dc0a..37e006005 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
@@ -12,26 +12,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
     /// A stack only, readonly, kernel matrix that can be indexed without
     /// bounds checks when compiled in release mode.
     /// </summary>
-    /// <typeparam name="T">The type of items in the kernel.</typeparam>
-    internal readonly ref struct ReadOnlyKernel<T>
-        where T : unmanaged, IEquatable<T>
+    internal readonly ref struct ReadOnlyKernel
     {
-        private readonly ReadOnlySpan<T> values;
+        private readonly ReadOnlySpan<float> values;
 
-        public ReadOnlyKernel(DenseMatrix<T> matrix)
+        public ReadOnlyKernel(DenseMatrix<float> matrix)
         {
             this.Columns = matrix.Columns;
             this.Rows = matrix.Rows;
             this.values = matrix.Span;
         }
 
-        public ReadOnlyKernel(T[] kernel, int height, int width)
-        {
-            this.Columns = width;
-            this.Rows = height;
-            this.values = kernel;
-        }
-
         public int Columns
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
@@ -44,13 +35,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             get;
         }
 
-        public T this[int row, int column]
+        public float this[int row, int column]
         {
             [MethodImpl(MethodImplOptions.AggressiveInlining)]
             get
             {
                 this.CheckCoordinates(row, column);
-                ref T vBase = ref MemoryMarshal.GetReference(this.values);
+                ref float vBase = ref MemoryMarshal.GetReference(this.values);
                 return Unsafe.Add(ref vBase, (row * this.Columns) + column);
             }
         }

From f8f3eaa321faede2d9e66d2c4042636eef735963 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 22:19:57 +0100
Subject: [PATCH 07/22] Remove unnecessary code

---
 .../Common/Helpers/Buffer2DUtils.cs           | 109 ------------------
 1 file changed, 109 deletions(-)
 delete mode 100644 src/ImageSharp/Common/Helpers/Buffer2DUtils.cs

diff --git a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs b/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs
deleted file mode 100644
index 02a5afff7..000000000
--- a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-
-using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.PixelFormats;
-
-namespace SixLabors.ImageSharp
-{
-    /// <summary>
-    /// Extension methods for <see cref="Buffer2D{T}"/>.
-    /// TODO: One day rewrite all this to use SIMD intrinsics. There's a lot of scope for improvement.
-    /// </summary>
-    internal static class Buffer2DUtils
-    {
-        /// <summary>
-        /// Computes the sum of vectors in <paramref name="targetRow"/> weighted by the kernel weight values.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="kernel">The 1D convolution kernel.</param>
-        /// <param name="sourcePixels">The source frame.</param>
-        /// <param name="targetRow">The target row.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        public static void Convolve4<TPixel>(
-            Span<Complex64> kernel,
-            Buffer2D<TPixel> sourcePixels,
-            Span<ComplexVector4> targetRow,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            ComplexVector4 vector = default;
-            int kernelLength = kernel.Length;
-            int radiusY = kernelLength >> 1;
-            int sourceOffsetColumnBase = column + minColumn;
-            ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel);
-
-            for (int i = 0; i < kernelLength; i++)
-            {
-                int offsetY = Numerics.Clamp(row + i - radiusY, minRow, maxRow);
-                int offsetX = Numerics.Clamp(sourceOffsetColumnBase, minColumn, maxColumn);
-                Span<TPixel> sourceRowSpan = sourcePixels.GetRowSpan(offsetY);
-                var currentColor = sourceRowSpan[offsetX].ToVector4();
-
-                vector.Sum(Unsafe.Add(ref baseRef, i) * currentColor);
-            }
-
-            targetRow[column] = vector;
-        }
-
-        /// <summary>
-        /// Computes the sum of vectors in <paramref name="targetRow"/> weighted by the kernel weight values and accumulates the partial results.
-        /// </summary>
-        /// <param name="kernel">The 1D convolution kernel.</param>
-        /// <param name="sourceValues">The source frame.</param>
-        /// <param name="targetRow">The target row.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        /// <param name="z">The weight factor for the real component of the complex pixel values.</param>
-        /// <param name="w">The weight factor for the imaginary component of the complex pixel values.</param>
-        public static void Convolve4AndAccumulatePartials(
-            Span<Complex64> kernel,
-            Buffer2D<ComplexVector4> sourceValues,
-            Span<Vector4> targetRow,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn,
-            float z,
-            float w)
-        {
-            ComplexVector4 vector = default;
-            int kernelLength = kernel.Length;
-            int radiusX = kernelLength >> 1;
-            int sourceOffsetColumnBase = column + minColumn;
-
-            int offsetY = Numerics.Clamp(row, minRow, maxRow);
-            ref ComplexVector4 sourceRef = ref MemoryMarshal.GetReference(sourceValues.GetRowSpan(offsetY));
-            ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel);
-
-            for (int x = 0; x < kernelLength; x++)
-            {
-                int offsetX = Numerics.Clamp(sourceOffsetColumnBase + x - radiusX, minColumn, maxColumn);
-                vector.Sum(Unsafe.Add(ref baseRef, x) * Unsafe.Add(ref sourceRef, offsetX));
-            }
-
-            targetRow[column] += vector.WeightedSum(z, w);
-        }
-    }
-}

From 16f4842f64bdb77c251eed0a3a4636a9b7ed604a Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sat, 12 Dec 2020 23:07:11 +0100
Subject: [PATCH 08/22] Fix gamma processing out of image bounds

---
 .../Convolution/BokehBlurProcessor{TPixel}.cs | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index cdadd4dee..4b1d7f8f1 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -77,26 +77,28 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         /// <inheritdoc/>
         protected override void OnFrameApply(ImageFrame<TPixel> source)
         {
+            var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
+
             // Preliminary gamma highlight pass
-            var gammaOperation = new ApplyGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
+            var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
             ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
                 this.Configuration,
-                this.SourceRectangle,
+                sourceRectangle,
                 in gammaOperation);
 
             // Create a 0-filled buffer to use to store the result of the component convolutions
             using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);
 
             // Perform the 1D convolutions on all the kernel components and accumulate the results
-            this.OnFrameApplyCore(source, this.SourceRectangle, this.Configuration, processingBuffer);
+            this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);
 
             float inverseGamma = 1 / this.gamma;
 
             // Apply the inverse gamma exposure pass, and write the final pixel data
-            var operation = new ApplyInverseGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
+            var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
             ParallelRowIterator.IterateRows(
                 this.Configuration,
-                this.SourceRectangle,
+                sourceRectangle,
                 in operation);
         }
 
@@ -116,8 +118,6 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             // Allocate the buffer with the intermediate convolution results
             using Buffer2D<ComplexVector4> firstPassBuffer = configuration.MemoryAllocator.Allocate2D<ComplexVector4>(source.Size());
 
-            var interest = Rectangle.Intersect(sourceRectangle, source.Bounds());
-
             // Unlike in the standard 2 pass convolution processor, we use a rectangle of 1x the interest width
             // to speedup the actual convolution, by applying bulk pixel conversion and clamping calculation.
             // The second half of the buffer will just target the temporary buffer of complex pixel values.
@@ -128,8 +128,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             using var mapX = new KernelSamplingMap(configuration.MemoryAllocator);
             using var mapY = new KernelSamplingMap(configuration.MemoryAllocator);
 
-            mapX.BuildSamplingOffsetMap(1, this.kernelSize, interest);
-            mapY.BuildSamplingOffsetMap(this.kernelSize, 1, interest);
+            mapX.BuildSamplingOffsetMap(1, this.kernelSize, sourceRectangle);
+            mapY.BuildSamplingOffsetMap(this.kernelSize, 1, sourceRectangle);
 
             ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan());
             ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan());
@@ -143,7 +143,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
 
                 // Horizontal convolution
                 var horizontalOperation = new FirstPassConvolutionRowOperation(
-                    interest,
+                    sourceRectangle,
                     firstPassBuffer,
                     source.PixelBuffer,
                     mapX,
@@ -152,12 +152,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
 
                 ParallelRowIterator.IterateRows<FirstPassConvolutionRowOperation, Vector4>(
                     configuration,
-                    interest,
+                    sourceRectangle,
                     in horizontalOperation);
 
                 // Vertical 1D convolutions to accumulate the partial results on the target buffer
                 var verticalOperation = new BokehBlurProcessor.SecondPassConvolutionRowOperation(
-                    interest,
+                    sourceRectangle,
                     processingBuffer,
                     firstPassBuffer,
                     mapY,
@@ -167,7 +167,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
 
                 ParallelRowIterator.IterateRows(
                     configuration,
-                    interest,
+                    sourceRectangle,
                     in verticalOperation);
             }
         }

From 6187fb55e0aeb90c18d21f5f685baacac0172364 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sun, 13 Dec 2020 00:18:09 +0100
Subject: [PATCH 09/22] Fix blur processing when constrained to region

---
 .../Processing/Processors/Convolution/BokehBlurProcessor.cs   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
index 243bc46cb..b3844ded8 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@@ -136,14 +136,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
 
                 // The target buffer is zeroed initially and then it accumulates the results
                 // of each partial convolution, so we don't have to clear it here as well
-                ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(0, y);
+                ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
                 ref Complex64 kernelBase = ref this.kernel[0];
 
                 for (int kY = 0; kY < kernelSize; kY++)
                 {
                     // Get the precalculated source sample row for this kernel row and copy to our buffer
                     int sampleY = Unsafe.Add(ref sampleRowBase, kY);
-                    ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(boundsX, sampleY);
+                    ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
                     Complex64 factor = Unsafe.Add(ref kernelBase, kY);
 
                     for (int x = 0; x < boundsWidth; x++)

From 0a6f7baa719fa69ca4c56d0fcefc405b1a2051ff Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sun, 13 Dec 2020 00:58:26 +0100
Subject: [PATCH 10/22] Fix NullReferenceException in KernelSamplingMap.Dispose

---
 .../Processing/Processors/Convolution/KernelSamplingMap.cs    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
index f912b9562..904b599f7 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
@@ -99,8 +99,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
         {
             if (!this.isDisposed)
             {
-                this.yOffsets.Dispose();
-                this.xOffsets.Dispose();
+                this.yOffsets?.Dispose();
+                this.xOffsets?.Dispose();
 
                 this.isDisposed = true;
             }

From f62e2f9748e149394ccedd8595f212be80ca87f7 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sun, 13 Dec 2020 01:08:01 +0100
Subject: [PATCH 11/22] Remove allocation constrained test for bokeh blur

---
 .../Processing/Processors/Convolution/BokehBlurTest.cs    | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
index 6c48cf843..666fbdd93 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
@@ -173,13 +173,5 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
                 testOutputDetails: value.ToString(),
                 appendPixelTypeToFileName: false);
         }
-
-        [Theory]
-        [WithTestPatternImages(100, 300, PixelTypes.Bgr24)]
-        public void WorksWithDiscoBuffers<TPixel>(TestImageProvider<TPixel> provider)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            provider.RunBufferCapacityLimitProcessorTest(41, c => c.BokehBlur());
-        }
     }
 }

From 3356225bb163ea28ce6f013ada157b4beba323c1 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Sun, 13 Dec 2020 23:08:55 +0100
Subject: [PATCH 12/22] Remove unnecessary offset indirections

---
 .../Convolution/BokehBlurProcessor.cs         |  5 +---
 .../Convolution/BokehBlurProcessor{TPixel}.cs | 23 ++++++++-----------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
index b3844ded8..d4fb27a57 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@@ -130,9 +130,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 int kernelSize = this.kernel.Length;
 
                 Span<int> rowOffsets = this.map.GetRowOffsetSpan();
-                Span<int> columnOffsets = this.map.GetColumnOffsetSpan();
                 ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
-                ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets);
 
                 // The target buffer is zeroed initially and then it accumulates the results
                 // of each partial convolution, so we don't have to clear it here as well
@@ -148,9 +146,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
 
                     for (int x = 0; x < boundsWidth; x++)
                     {
-                        int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX;
                         ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-                        ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                        ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
                         ComplexVector4 partial = factor * sample;
 
                         target += partial.WeightedSum(this.z, this.w);
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index 4b1d7f8f1..dda384390 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -124,12 +124,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             // This is needed because the bokeh blur operates as TPixel -> complex -> TPixel, so we cannot
             // convert back to standard pixels after each separate 1D convolution pass. Like in the gaussian
             // blur though, we preallocate and compute the kernel sampling maps before processing each complex
-            // component, to avoid recomputing the same sampling map once per convolution pass.
-            using var mapX = new KernelSamplingMap(configuration.MemoryAllocator);
-            using var mapY = new KernelSamplingMap(configuration.MemoryAllocator);
+            // component, to avoid recomputing the same sampling map once per convolution pass. Since we are
+            // doing two 1D convolutions with the same kernel, we can use a single kernel sampling map as if
+            // we were using a 2D kernel with each dimension being the same as the length of our kernel, and
+            // use the two sampling offset spans resulting from this same map. This saves some extra work.
+            using var mapXY = new KernelSamplingMap(configuration.MemoryAllocator);
 
-            mapX.BuildSamplingOffsetMap(1, this.kernelSize, sourceRectangle);
-            mapY.BuildSamplingOffsetMap(this.kernelSize, 1, sourceRectangle);
+            mapXY.BuildSamplingOffsetMap(this.kernelSize, this.kernelSize, sourceRectangle);
 
             ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan());
             ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan());
@@ -146,7 +147,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                     sourceRectangle,
                     firstPassBuffer,
                     source.PixelBuffer,
-                    mapX,
+                    mapXY,
                     kernel,
                     configuration);
 
@@ -160,7 +161,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                     sourceRectangle,
                     processingBuffer,
                     firstPassBuffer,
-                    mapY,
+                    mapXY,
                     kernel,
                     parameters.Z,
                     parameters.W);
@@ -209,22 +210,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 int boundsWidth = this.bounds.Width;
                 int kernelSize = this.kernel.Length;
 
-                Span<int> rowOffsets = this.map.GetRowOffsetSpan();
-                Span<int> columnOffsets = this.map.GetColumnOffsetSpan();
-                int sampleY = Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), y - this.bounds.Y);
-                ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets);
-
                 // Clear the target buffer for each row run
                 Span<ComplexVector4> targetBuffer = this.targetValues.GetRowSpan(y);
                 targetBuffer.Clear();
                 ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
 
                 // Execute the bulk pixel format conversion for the current row
-                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
+                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
                 PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, span);
 
                 ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
                 ref Complex64 kernelBase = ref this.kernel[0];
+                ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan());
 
                 for (int x = 0; x < span.Length; x++)
                 {

From 8292407ae2258f413e13754d443e69ba21a92b8b Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 01:51:24 +0100
Subject: [PATCH 13/22] Add optimized paths for default gamma exposure

---
 .../Convolution/BokehBlurProcessor{TPixel}.cs | 171 ++++++++++++++++--
 1 file changed, 159 insertions(+), 12 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index dda384390..c01fc3ba1 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -80,11 +80,22 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
 
             // Preliminary gamma highlight pass
-            var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
-            ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
-                this.Configuration,
-                sourceRectangle,
-                in gammaOperation);
+            if (this.gamma == 3F)
+            {
+                var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
+                ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
+                    this.Configuration,
+                    sourceRectangle,
+                    in gammaOperation);
+            }
+            else
+            {
+                var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
+                ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
+                    this.Configuration,
+                    sourceRectangle,
+                    in gammaOperation);
+            }
 
             // Create a 0-filled buffer to use to store the result of the component convolutions
             using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);
@@ -92,14 +103,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             // Perform the 1D convolutions on all the kernel components and accumulate the results
             this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);
 
-            float inverseGamma = 1 / this.gamma;
-
             // Apply the inverse gamma exposure pass, and write the final pixel data
-            var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
-            ParallelRowIterator.IterateRows(
-                this.Configuration,
-                sourceRectangle,
-                in operation);
+            if (this.gamma == 3F)
+            {
+                var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
+                ParallelRowIterator.IterateRows(
+                    this.Configuration,
+                    sourceRectangle,
+                    in operation);
+            }
+            else
+            {
+                var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
+                ParallelRowIterator.IterateRows(
+                    this.Configuration,
+                    sourceRectangle,
+                    in operation);
+            }
         }
 
         /// <summary>
@@ -286,6 +306,56 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             }
         }
 
+        /// <summary>
+        /// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
+        /// </summary>
+        private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Configuration configuration;
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public ApplyGamma3ExposureRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Configuration configuration)
+            {
+                this.bounds = bounds;
+                this.targetPixels = targetPixels;
+                this.configuration = configuration;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void Invoke(int y, Span<Vector4> span)
+            {
+                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
+                ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
+
+                for (int x = 0; x < this.bounds.Width; x++)
+                {
+                    ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
+                    Vector4 v = pixel4;
+                    float a = v.W;
+
+                    // Fast path for the default gamma exposure, which is 3. In this case we can skip
+                    // calling Math.Pow 3 times (one per component), as the method is an internal call and
+                    // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
+                    // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
+                    // back to the target index in the temporary span. The whole iteration will get completely
+                    // inlined and traslated into vectorized instructions, with much better performance.
+                    v = v * v * v;
+                    v.W = a;
+
+                    pixel4 = v;
+                }
+
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+            }
+        }
+
         /// <summary>
         /// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
         /// </summary>
@@ -335,5 +405,82 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
             }
         }
+
+        /// <summary>
+        /// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
+        /// </summary>
+        private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Buffer2D<Vector4> sourceValues;
+            private readonly Configuration configuration;
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public ApplyInverseGamma3ExposureRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Buffer2D<Vector4> sourceValues,
+                Configuration configuration)
+            {
+                this.bounds = bounds;
+                this.targetPixels = targetPixels;
+                this.sourceValues = sourceValues;
+                this.configuration = configuration;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public unsafe void Invoke(int y)
+            {
+                Vector4 low = Vector4.Zero;
+                var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);
+
+                Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+                Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
+                ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
+
+                for (int x = 0; x < this.bounds.Width; x++)
+                {
+                    ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
+                    Vector4 clamp = Numerics.Clamp(v, low, high);
+
+                    double
+                        x64 = clamp.X,
+                        y64 = clamp.Y,
+                        z64 = clamp.Z;
+                    float a = clamp.W;
+
+                    ulong
+                        xl = *(ulong*)&x64,
+                        yl = *(ulong*)&y64,
+                        zl = *(ulong*)&z64;
+
+                    // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
+                    // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
+                    // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
+                    // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
+                    // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
+                    // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
+                    xl = 0x2a9f8a7be393b600 + (xl / 3);
+                    yl = 0x2a9f8a7be393b600 + (yl / 3);
+                    zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+                    Vector4 y4;
+                    y4.X = (float)*(double*)&xl;
+                    y4.Y = (float)*(double*)&yl;
+                    y4.Z = (float)*(double*)&zl;
+                    y4.W = 0;
+
+                    y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
+                    y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
+                    y4.W = a;
+
+                    v = y4;
+                }
+
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
+            }
+        }
     }
 }

From 0903a58e588c6a8c32cfeba88349d26bb8e28558 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 02:54:18 +0100
Subject: [PATCH 14/22] Switch to vectorized clamping

---
 .../Convolution/BokehBlurProcessor{TPixel}.cs | 27 ++++++++++---------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index c01fc3ba1..02308d3fb 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -333,8 +333,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
                 PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
                 ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
+                int length = this.bounds.Width;
 
-                for (int x = 0; x < this.bounds.Width; x++)
+                for (int x = 0; x < length; x++)
                 {
                     ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
                     Vector4 v = pixel4;
@@ -433,23 +434,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             [MethodImpl(InliningOptions.ShortMethod)]
             public unsafe void Invoke(int y)
             {
-                Vector4 low = Vector4.Zero;
-                var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);
+                Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X, this.bounds.Width);
+                ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
+
+                Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, float.PositiveInfinity);
 
                 Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
-                Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
-                ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
+                int length = this.bounds.Width;
 
-                for (int x = 0; x < this.bounds.Width; x++)
+                for (int x = 0; x < length; x++)
                 {
                     ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
-                    Vector4 clamp = Numerics.Clamp(v, low, high);
 
                     double
-                        x64 = clamp.X,
-                        y64 = clamp.Y,
-                        z64 = clamp.Z;
-                    float a = clamp.W;
+                        x64 = v.X,
+                        y64 = v.Y,
+                        z64 = v.Z;
+                    float a = v.W;
 
                     ulong
                         xl = *(ulong*)&x64,
@@ -472,8 +473,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                     y4.Z = (float)*(double*)&zl;
                     y4.W = 0;
 
-                    y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
-                    y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
+                    y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                    y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
                     y4.W = a;
 
                     v = y4;

From 3bba7deda18fcdabdc7f87c1c762e60398850579 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 16:14:41 +0100
Subject: [PATCH 15/22] Initial vectorized cube root implementation

---
 src/ImageSharp/Common/Helpers/Numerics.cs     | 126 ++++++++++++++++++
 .../Convolution/BokehBlurProcessor{TPixel}.cs |  61 +--------
 2 files changed, 129 insertions(+), 58 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index b2bedb87b..f09530d6b 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -547,5 +547,131 @@ namespace SixLabors.ImageSharp
                 }
             }
         }
+
+        /// <summary>
+        /// Calculates the cube (third power) of the XYZ channels of each input vector, leaving W unchanged.
+        /// </summary>
+        /// <param name="vectors">The span of vectors</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
+        {
+            ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+            int length = vectors.Length;
+
+            for (int x = 0; x < length; x++)
+            {
+                ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
+                Vector4 v = pixel4;
+                float a = v.W;
+
+                // Fast path for the default gamma exposure, which is 3. In this case we can skip
+                // calling Math.Pow 3 times (one per component), as the method is an internal call and
+                // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
+                // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
+                // back to the target index in the temporary span. The whole iteration will get completely
+                // inlined and translated into vectorized instructions, with much better performance.
+                v = v * v * v;
+                v.W = a;
+
+                pixel4 = v;
+            }
+        }
+
+        /// <summary>
+        /// Calculates an approximate cube root of the XYZ channels of each input vector, in place. W is left unchanged.
+        /// </summary>
+        /// <param name="vectors">The span of vectors to update.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
+        {
+            ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
+            int length = vectors.Length;
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF);
+                var v128_0x3F800000 = Vector128.Create(0x3F800000);
+                var v128_341 = Vector128.Create(341);
+                var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000));
+                var v4_23rds = new Vector4(2 / 3f);
+                var v4_13rds = new Vector4(1 / 3f);
+
+                for (int x = 0; x < length; x++)
+                {
+                    ref Vector4 v4 = ref Unsafe.Add(ref vectorsRef, x);
+
+                    Vector4 vx = v4;
+                    float a = vx.W;
+                    Vector128<int> veax = Unsafe.As<Vector4, Vector128<int>>(ref vx);
+                    Vector128<int> vecx = veax;
+
+                    // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
+                    // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
+                    // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
+                    // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
+                    // using a different succession (the same used below), which should be less unstable due to not having cube pow.
+                    veax = Sse2.And(veax, v128_0x7FFFFFFF);
+                    veax = Sse2.Subtract(veax, v128_0x3F800000);
+                    veax = Sse2.ShiftRightArithmetic(veax, 10);
+                    veax = Sse41.MultiplyLow(veax, v128_341);
+                    veax = Sse2.Add(veax, v128_0x3F800000);
+                    veax = Sse2.And(veax, v128_0x7FFFFFFF);
+                    vecx = Sse2.And(vecx, v128_0x80000000);
+                    veax = Sse2.Or(veax, vecx);
+
+                    Vector4 y4 = *(Vector4*)&veax;
+
+                    y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
+                    y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
+                    y4.W = a;
+
+                    v4 = y4;
+                }
+
+                return;
+            }
+#endif
+
+            // Scalar fallback, also taken at runtime on intrinsics-enabled builds when SSE4.1 is not available.
+            for (int x = 0; x < length; x++)
+            {
+                ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x);
+
+                double
+                    x64 = v.X,
+                    y64 = v.Y,
+                    z64 = v.Z;
+                float a = v.W;
+
+                ulong
+                    xl = *(ulong*)&x64,
+                    yl = *(ulong*)&y64,
+                    zl = *(ulong*)&z64;
+
+                // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
+                // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
+                // this means what we actually want is to find the cube root of the input values.
+                // For more info on the constant below, see:
+                // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
+                // The same trick is applied to each RGB channel separately, while the alpha channel is stored so that it
+                // can be restored after running two steps of the Newton approximation on the whole Vector4 value.
+                xl = 0x2a9f8a7be393b600 + (xl / 3);
+                yl = 0x2a9f8a7be393b600 + (yl / 3);
+                zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+                Vector4 y4;
+                y4.X = (float)*(double*)&xl;
+                y4.Y = (float)*(double*)&yl;
+                y4.Z = (float)*(double*)&zl;
+                y4.W = 0;
+
+                y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                y4.W = a;
+
+                v = y4;
+            }
+        }
     }
 }
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index 02308d3fb..a21155e10 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -331,27 +331,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
             public void Invoke(int y, Span<Vector4> span)
             {
                 Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+
                 PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
-                ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
-                int length = this.bounds.Width;
 
-                for (int x = 0; x < length; x++)
-                {
-                    ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
-                    Vector4 v = pixel4;
-                    float a = v.W;
-
-                    // Fast path for the default gamma exposure, which is 3. In this case we can skip
-                    // calling Math.Pow 3 times (one per component), as the method is an internal call and
-                    // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
-                    // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
-                    // back to the target index in the temporary span. The whole iteration will get completely
-                    // inlined and traslated into vectorized instructions, with much better performance.
-                    v = v * v * v;
-                    v.W = a;
-
-                    pixel4 = v;
-                }
+                Numerics.CubePowOnXYZ(span);
 
                 PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
             }
@@ -438,47 +421,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                 ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
 
                 Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, float.PositiveInfinity);
+                Numerics.CubeRootOnXYZ(sourceRowSpan);
 
                 Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
-                int length = this.bounds.Width;
-
-                for (int x = 0; x < length; x++)
-                {
-                    ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
-
-                    double
-                        x64 = v.X,
-                        y64 = v.Y,
-                        z64 = v.Z;
-                    float a = v.W;
-
-                    ulong
-                        xl = *(ulong*)&x64,
-                        yl = *(ulong*)&y64,
-                        zl = *(ulong*)&z64;
-
-                    // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
-                    // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
-                    // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
-                    // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
-                    // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
-                    // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
-                    xl = 0x2a9f8a7be393b600 + (xl / 3);
-                    yl = 0x2a9f8a7be393b600 + (yl / 3);
-                    zl = 0x2a9f8a7be393b600 + (zl / 3);
-
-                    Vector4 y4;
-                    y4.X = (float)*(double*)&xl;
-                    y4.Y = (float)*(double*)&yl;
-                    y4.Z = (float)*(double*)&zl;
-                    y4.W = 0;
-
-                    y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
-                    y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
-                    y4.W = a;
-
-                    v = y4;
-                }
 
                 PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
             }

From 392afeadeff989c9c2ae40a4a380f8797413e030 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 16:30:14 +0100
Subject: [PATCH 16/22] Fix vectorized cube root on x86-64 with no SSE41

---
 src/ImageSharp/Common/Helpers/Numerics.cs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index f09530d6b..115cebef2 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -631,7 +631,9 @@ namespace SixLabors.ImageSharp
 
                 return;
             }
-#else
+#endif
+
+            // Fallback with scalar preprocessing and vectorized approximation steps
             for (int x = 0; x < length; x++)
             {
                 ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x);
@@ -671,7 +673,6 @@ namespace SixLabors.ImageSharp
 
                 v = y4;
             }
-#endif
         }
     }
 }

From 76e704d4616fe51e47e3eefd19f38a97159c50d4 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 16:49:18 +0100
Subject: [PATCH 17/22] Minor codegen tweaks

---
 src/ImageSharp/Common/Helpers/Numerics.cs | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 115cebef2..55718e724 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -584,12 +584,12 @@ namespace SixLabors.ImageSharp
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
         {
-            ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
-            int length = vectors.Length;
-
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
             {
+                ref Vector4 vectors4Ref = ref MemoryMarshal.GetReference(vectors);
+                ref Vector4 vectors4End = ref Unsafe.Add(ref vectors4Ref, vectors.Length);
+
                 var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF);
                 var v128_0x3F8000000 = Vector128.Create(0x3F800000);
                 var v128_341 = Vector128.Create(341);
@@ -597,11 +597,9 @@ namespace SixLabors.ImageSharp
                 var v4_23rds = new Vector4(2 / 3f);
                 var v4_13rds = new Vector4(1 / 3f);
 
-                for (int x = 0; x < length; x++)
+                while (Unsafe.IsAddressLessThan(ref vectors4Ref, ref vectors4End))
                 {
-                    ref Vector4 v4 = ref Unsafe.Add(ref vectorsRef, x);
-
-                    Vector4 vx = v4;
+                    Vector4 vx = vectors4Ref;
                     float a = vx.W;
                     Vector128<int> veax = Unsafe.As<Vector4, Vector128<int>>(ref vx);
                     Vector128<int> vecx = veax;
@@ -626,12 +624,15 @@ namespace SixLabors.ImageSharp
                     y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
                     y4.W = a;
 
-                    v4 = y4;
+                    vectors4Ref = y4;
+                    vectors4Ref = ref Unsafe.Add(ref vectors4Ref, 1);
                 }
 
                 return;
             }
 #endif
+            ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
+            int length = vectors.Length;
 
             // Fallback with scalar preprocessing and vectorized approximation steps
             for (int x = 0; x < length; x++)

From b383dd4496ade8d1b0d6c90b00590d228c8642f4 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 14 Dec 2020 18:21:55 +0000
Subject: [PATCH 18/22] Add discontiguous buffers and intrinsics tests

---
 .../Processors/Convolution/BokehBlurTest.cs   | 44 +++++++++++------
 .../FeatureTesting/FeatureTestRunner.cs       | 47 +++++++++++++++++++
 2 files changed, 76 insertions(+), 15 deletions(-)

diff --git a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
index 666fbdd93..dbf59a29b 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
@@ -6,7 +6,6 @@ using System.Collections.Generic;
 using System.Globalization;
 using System.Linq;
 using System.Text.RegularExpressions;
-using Microsoft.DotNet.RemoteExecutor;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Processing;
@@ -44,9 +43,8 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
         [InlineData(20, 4, -10f)]
         [InlineData(20, 4, 0f)]
         public void VerifyBokehBlurProcessorArguments_Fail(int radius, int components, float gamma)
-        {
-            Assert.Throws<ArgumentOutOfRangeException>(() => new BokehBlurProcessor(radius, components, gamma));
-        }
+            => Assert.Throws<ArgumentOutOfRangeException>(
+                () => new BokehBlurProcessor(radius, components, gamma));
 
         [Fact]
         public void VerifyComplexComponents()
@@ -137,12 +135,10 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
         [WithTestPatternImages(nameof(BokehBlurValues), 30, 20, PixelTypes.Rgba32)]
         public void BokehBlurFilterProcessor<TPixel>(TestImageProvider<TPixel> provider, BokehBlurInfo value)
             where TPixel : unmanaged, IPixel<TPixel>
-        {
-            provider.RunValidatingProcessorTest(
+            => provider.RunValidatingProcessorTest(
                 x => x.BokehBlur(value.Radius, value.Components, value.Gamma),
                 testOutputDetails: value.ToString(),
                 appendPixelTypeToFileName: false);
-        }
 
         [Theory]
         /*
@@ -152,18 +148,23 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
         [WithTestPatternImages(200, 200, PixelTypes.Bgr24 | PixelTypes.Bgra32)]
         public void BokehBlurFilterProcessor_WorksWithAllPixelTypes<TPixel>(TestImageProvider<TPixel> provider)
             where TPixel : unmanaged, IPixel<TPixel>
-        {
-            provider.RunValidatingProcessorTest(
-                    x => x.BokehBlur(8, 2, 3),
-                    appendSourceFileOrDescription: false);
-        }
+            => provider.RunValidatingProcessorTest(
+                x => x.BokehBlur(8, 2, 3),
+                appendSourceFileOrDescription: false);
 
         [Theory]
         [WithFileCollection(nameof(TestFiles), nameof(BokehBlurValues), PixelTypes.Rgba32)]
-        public void BokehBlurFilterProcessor_Bounded<TPixel>(TestImageProvider<TPixel> provider, BokehBlurInfo value)
-            where TPixel : unmanaged, IPixel<TPixel>
+        public void BokehBlurFilterProcessor_Bounded(TestImageProvider<Rgba32> provider, BokehBlurInfo value)
         {
-            provider.RunValidatingProcessorTest(
+            static void RunTest(string arg1, string arg2)
+            {
+                TestImageProvider<Rgba32> provider =
+                    FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgba32>>(arg1);
+
+                BokehBlurInfo value =
+                    FeatureTestRunner.DeserializeForXunit<BokehBlurInfo>(arg2);
+
+                provider.RunValidatingProcessorTest(
                 x =>
                 {
                     Size size = x.GetCurrentSize();
@@ -172,6 +173,19 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
                 },
                 testOutputDetails: value.ToString(),
                 appendPixelTypeToFileName: false);
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                HwIntrinsics.DisableSSE41,
+                provider,
+                value);
         }
+
+        [Theory]
+        [WithTestPatternImages(100, 300, PixelTypes.Bgr24)]
+        public void WorksWithDiscoBuffers<TPixel>(TestImageProvider<TPixel> provider)
+            where TPixel : unmanaged, IPixel<TPixel>
+            => provider.RunBufferCapacityLimitProcessorTest(260, c => c.BokehBlur());
     }
 }
diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
index 4720ea78a..8c78cc2b3 100644
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@@ -211,6 +211,53 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
             }
         }
 
+        /// <summary>
+        /// Runs the given test <paramref name="action"/> once for each of the given
+        /// <paramref name="intrinsics"/> features, within an environment where that feature is disabled.
+        /// </summary>
+        /// <param name="action">The test action to run.</param>
+        /// <param name="intrinsics">The intrinsics features.</param>
+        /// <param name="arg1">The value to pass as a parameter to the test action.</param>
+        /// <param name="arg2">The second value to pass as a parameter to the test action.</param>
+        public static void RunWithHwIntrinsicsFeature<T, T2>(
+            Action<string, string> action,
+            HwIntrinsics intrinsics,
+            T arg1,
+            T2 arg2)
+            where T : IXunitSerializable
+            where T2 : IXunitSerializable
+        {
+            if (!RemoteExecutor.IsSupported)
+            {
+                return;
+            }
+
+            foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection())
+            {
+                var processStartInfo = new ProcessStartInfo();
+                if (intrinsic.Key != HwIntrinsics.AllowAll)
+                {
+                    processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0";
+
+                    RemoteExecutor.Invoke(
+                        action,
+                        BasicSerializer.Serialize(arg1),
+                        BasicSerializer.Serialize(arg1),
+                        new RemoteInvokeOptions
+                        {
+                            StartInfo = processStartInfo
+                        })
+                        .Dispose();
+                }
+                else
+                {
+                    // Since we are running using the default architecture there is no
+                    // point creating the overhead of running the action in a separate process.
+                    action(BasicSerializer.Serialize(arg1), BasicSerializer.Serialize(arg2));
+                }
+            }
+        }
+
         /// <summary>
         /// Runs the given test <paramref name="action"/> within an environment
         /// where the given <paramref name="intrinsics"/> features.

From c9b07964cc17ebd271b9a9b3f9400cb80acba83b Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 14 Dec 2020 18:38:02 +0000
Subject: [PATCH 19/22] Fix feature test runner

---
 .../TestUtilities/FeatureTesting/FeatureTestRunner.cs           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
index 8c78cc2b3..fa0f02ca1 100644
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@@ -242,7 +242,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
                     RemoteExecutor.Invoke(
                         action,
                         BasicSerializer.Serialize(arg1),
-                        BasicSerializer.Serialize(arg1),
+                        BasicSerializer.Serialize(arg2),
                         new RemoteInvokeOptions
                         {
                             StartInfo = processStartInfo

From df18c4e4c87ef20e2c75c2242d3076d4287f0ed4 Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 20:11:47 +0100
Subject: [PATCH 20/22] Switch to explicit SSE Newton approximations

---
 src/ImageSharp/Common/Helpers/Numerics.cs | 31 ++++++++++-------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 55718e724..e7ae71210 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -587,22 +587,20 @@ namespace SixLabors.ImageSharp
 #if SUPPORTS_RUNTIME_INTRINSICS
             if (Sse41.IsSupported)
             {
-                ref Vector4 vectors4Ref = ref MemoryMarshal.GetReference(vectors);
-                ref Vector4 vectors4End = ref Unsafe.Add(ref vectors4Ref, vectors.Length);
+                ref Vector128<float> vectors128Ref = ref Unsafe.As<Vector4, Vector128<float>>(ref MemoryMarshal.GetReference(vectors));
+                ref Vector128<float> vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);
 
                 var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF);
                 var v128_0x3F8000000 = Vector128.Create(0x3F800000);
                 var v128_341 = Vector128.Create(341);
                 var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000));
-                var v4_23rds = new Vector4(2 / 3f);
-                var v4_13rds = new Vector4(1 / 3f);
+                var v4_23rds = Vector128.Create(2 / 3f);
+                var v4_13rds = Vector128.Create(1 / 3f);
 
-                while (Unsafe.IsAddressLessThan(ref vectors4Ref, ref vectors4End))
+                while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
                 {
-                    Vector4 vx = vectors4Ref;
-                    float a = vx.W;
-                    Vector128<int> veax = Unsafe.As<Vector4, Vector128<int>>(ref vx);
-                    Vector128<int> vecx = veax;
+                    Vector128<float> vecx = vectors128Ref;
+                    Vector128<int> veax = vecx.AsInt32();
 
                     // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
                     // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
@@ -615,17 +613,16 @@ namespace SixLabors.ImageSharp
                     veax = Sse41.MultiplyLow(veax, v128_341);
                     veax = Sse2.Add(veax, v128_0x3F8000000);
                     veax = Sse2.And(veax, v128_0x7FFFFFFF);
-                    vecx = Sse2.And(vecx, v128_0x80000000);
-                    veax = Sse2.Or(veax, vecx);
+                    veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_0x80000000));
 
-                    Vector4 y4 = *(Vector4*)&veax;
+                    Vector128<float> y4 = veax.AsSingle();
 
-                    y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
-                    y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4)));
-                    y4.W = a;
+                    y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    y4 = Sse41.Insert(y4, vecx, 0xF0);
 
-                    vectors4Ref = y4;
-                    vectors4Ref = ref Unsafe.Add(ref vectors4Ref, 1);
+                    vectors128Ref = y4;
+                    vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
                 }
 
                 return;

From e7cdb0aaab2d1c0c122f8e0cb7618bc5b624befa Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 20:19:47 +0100
Subject: [PATCH 21/22] Add FMA support, more SSE optimizations
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Special thanks to @tannergooding for the help 🚀
---
 src/ImageSharp/Common/Helpers/Numerics.cs | 33 ++++++++++++++---------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index e7ae71210..88b6d83ee 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -590,12 +590,12 @@ namespace SixLabors.ImageSharp
                 ref Vector128<float> vectors128Ref = ref Unsafe.As<Vector4, Vector128<float>>(ref MemoryMarshal.GetReference(vectors));
                 ref Vector128<float> vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);
 
-                var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF);
-                var v128_0x3F8000000 = Vector128.Create(0x3F800000);
                 var v128_341 = Vector128.Create(341);
-                var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000));
-                var v4_23rds = Vector128.Create(2 / 3f);
-                var v4_13rds = Vector128.Create(1 / 3f);
+                Vector128<int> v128_negativeZero = Vector128.Create(-0.0f).AsInt32();
+                Vector128<int> v128_one = Vector128.Create(1.0f).AsInt32();
+
+                var v128_13rd = Vector128.Create(1 / 3f);
+                var v128_23rds = Vector128.Create(2 / 3f);
 
                 while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
                 {
@@ -607,18 +607,27 @@ namespace SixLabors.ImageSharp
                     // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
                     // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
                     // using a different succession (the same used below), which should be less unstable due to not having cube pow.
-                    veax = Sse2.And(veax, v128_0x7FFFFFFF);
-                    veax = Sse2.Subtract(veax, v128_0x3F8000000);
+                    veax = Sse2.AndNot(v128_negativeZero, veax);
+                    veax = Sse2.Subtract(veax, v128_one);
                     veax = Sse2.ShiftRightArithmetic(veax, 10);
                     veax = Sse41.MultiplyLow(veax, v128_341);
-                    veax = Sse2.Add(veax, v128_0x3F8000000);
-                    veax = Sse2.And(veax, v128_0x7FFFFFFF);
-                    veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_0x80000000));
+                    veax = Sse2.Add(veax, v128_one);
+                    veax = Sse2.AndNot(v128_negativeZero, veax);
+                    veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero));
 
                     Vector128<float> y4 = veax.AsSingle();
 
-                    y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
-                    y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    if (Fma.IsSupported)
+                    {
+                        y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                        y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    }
+                    else
+                    {
+                        y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                        y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    }
+
                     y4 = Sse41.Insert(y4, vecx, 0xF0);
 
                     vectors128Ref = y4;

From 80617a060c5647048ff3fe8bd3ad352c067fb4ba Mon Sep 17 00:00:00 2001
From: Sergio Pedri <sergio0694@live.com>
Date: Mon, 14 Dec 2020 20:44:30 +0100
Subject: [PATCH 22/22] Add more codegen improvements

---
 src/ImageSharp/Common/Helpers/Numerics.cs | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 88b6d83ee..56ab46c68 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -556,12 +556,11 @@ namespace SixLabors.ImageSharp
         public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
         {
             ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
-            int length = vectors.Length;
+            ref Vector4 endRef = ref Unsafe.Add(ref baseRef, vectors.Length);
 
-            for (int x = 0; x < length; x++)
+            while (Unsafe.IsAddressLessThan(ref baseRef, ref endRef))
             {
-                ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
-                Vector4 v = pixel4;
+                Vector4 v = baseRef;
                 float a = v.W;
 
                 // Fast path for the default gamma exposure, which is 3. In this case we can skip
@@ -573,7 +572,8 @@ namespace SixLabors.ImageSharp
                 v = v * v * v;
                 v.W = a;
 
-                pixel4 = v;
+                baseRef = v;
+                baseRef = ref Unsafe.Add(ref baseRef, 1);
             }
         }
 
@@ -638,12 +638,12 @@ namespace SixLabors.ImageSharp
             }
 #endif
             ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
-            int length = vectors.Length;
+            ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);
 
             // Fallback with scalar preprocessing and vectorized approximation steps
-            for (int x = 0; x < length; x++)
+            while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
             {
-                ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x);
+                Vector4 v = vectorsRef;
 
                 double
                     x64 = v.X,
@@ -678,7 +678,8 @@ namespace SixLabors.ImageSharp
                 y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
                 y4.W = a;
 
-                v = y4;
+                vectorsRef = y4;
+                vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
             }
         }
     }