Merge branch 'master' into sp/image-wrap-ptr

5 years ago · fea74f33bd
9 changed files with 465 additions and 177 deletions
--- a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs
+++ b/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs
@ -1,109 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-
-using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.PixelFormats;
-
-namespace SixLabors.ImageSharp
-{
-    /// <summary>
-    /// Extension methods for <see cref="Buffer2D{T}"/>.
-    /// TODO: One day rewrite all this to use SIMD intrinsics. There's a lot of scope for improvement.
-    /// </summary>
-    internal static class Buffer2DUtils
-    {
-        /// <summary>
-        /// Computes the sum of vectors in <paramref name="targetRow"/> weighted by the kernel weight values.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="kernel">The 1D convolution kernel.</param>
-        /// <param name="sourcePixels">The source frame.</param>
-        /// <param name="targetRow">The target row.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        public static void Convolve4<TPixel>(
-            Span<Complex64> kernel,
-            Buffer2D<TPixel> sourcePixels,
-            Span<ComplexVector4> targetRow,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            ComplexVector4 vector = default;
-            int kernelLength = kernel.Length;
-            int radiusY = kernelLength >> 1;
-            int sourceOffsetColumnBase = column + minColumn;
-            ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel);
-
-            for (int i = 0; i < kernelLength; i++)
-            {
-                int offsetY = Numerics.Clamp(row + i - radiusY, minRow, maxRow);
-                int offsetX = Numerics.Clamp(sourceOffsetColumnBase, minColumn, maxColumn);
-                Span<TPixel> sourceRowSpan = sourcePixels.GetRowSpan(offsetY);
-                var currentColor = sourceRowSpan[offsetX].ToVector4();
-
-                vector.Sum(Unsafe.Add(ref baseRef, i) * currentColor);
-            }
-
-            targetRow[column] = vector;
-        }
-
-        /// <summary>
-        /// Computes the sum of vectors in <paramref name="targetRow"/> weighted by the kernel weight values and accumulates the partial results.
-        /// </summary>
-        /// <param name="kernel">The 1D convolution kernel.</param>
-        /// <param name="sourceValues">The source frame.</param>
-        /// <param name="targetRow">The target row.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        /// <param name="z">The weight factor for the real component of the complex pixel values.</param>
-        /// <param name="w">The weight factor for the imaginary component of the complex pixel values.</param>
-        public static void Convolve4AndAccumulatePartials(
-            Span<Complex64> kernel,
-            Buffer2D<ComplexVector4> sourceValues,
-            Span<Vector4> targetRow,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn,
-            float z,
-            float w)
-        {
-            ComplexVector4 vector = default;
-            int kernelLength = kernel.Length;
-            int radiusX = kernelLength >> 1;
-            int sourceOffsetColumnBase = column + minColumn;
-
-            int offsetY = Numerics.Clamp(row, minRow, maxRow);
-            ref ComplexVector4 sourceRef = ref MemoryMarshal.GetReference(sourceValues.GetRowSpan(offsetY));
-            ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel);
-
-            for (int x = 0; x < kernelLength; x++)
-            {
-                int offsetX = Numerics.Clamp(sourceOffsetColumnBase + x - radiusX, minColumn, maxColumn);
-                vector.Sum(Unsafe.Add(ref baseRef, x) * Unsafe.Add(ref sourceRef, offsetX));
-            }
-
-            targetRow[column] += vector.WeightedSum(z, w);
-        }
-    }
-}
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@ -547,5 +547,140 @@ namespace SixLabors.ImageSharp
                }
            }
        }
+
+        /// <summary>
+        /// Raises the X, Y and Z channels of each input vector to the third power, in place.
+        /// </summary>
+        /// <param name="vectors">The span of vectors to update; the W channel of each vector is preserved.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void CubePowOnXYZ(Span<Vector4> vectors)
+        {
+            ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+            ref Vector4 endRef = ref Unsafe.Add(ref baseRef, vectors.Length);
+
+            while (Unsafe.IsAddressLessThan(ref baseRef, ref endRef))
+            {
+                Vector4 v = baseRef;
+                float a = v.W;
+
+                // Fast path for the default gamma exposure, which is 3. In this case we can skip
+                // calling Math.Pow 3 times (one per component), as the method is an internal call and
+                // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
+                // pixel in Vector4 format by itself to get its cube, and then restore the alpha channel
+                // before writing it back to the same position in the span. The whole iteration should get
+                // inlined and translated into vectorized instructions, with much better performance.
+                v = v * v * v;
+                v.W = a;
+
+                baseRef = v;
+                baseRef = ref Unsafe.Add(ref baseRef, 1);
+            }
+        }
+
+        /// <summary>
+        /// Calculates an approximate cube root of the X, Y and Z channels of each input vector, in place.
+        /// </summary>
+        /// <param name="vectors">The span of vectors to update; the W channel of each vector is preserved.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public static unsafe void CubeRootOnXYZ(Span<Vector4> vectors)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                ref Vector128<float> vectors128Ref = ref Unsafe.As<Vector4, Vector128<float>>(ref MemoryMarshal.GetReference(vectors));
+                ref Vector128<float> vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);
+
+                var v128_341 = Vector128.Create(341);
+                Vector128<int> v128_negativeZero = Vector128.Create(-0.0f).AsInt32();
+                Vector128<int> v128_one = Vector128.Create(1.0f).AsInt32();
+
+                var v128_13rd = Vector128.Create(1 / 3f);
+                var v128_23rds = Vector128.Create(2 / 3f);
+
+                while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
+                {
+                    Vector128<float> vecx = vectors128Ref;
+                    Vector128<int> veax = vecx.AsInt32();
+
+                    // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
+                    // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
+                    // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
+                    // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
+                    // using a different succession (the same used below), which should be more stable as it avoids the cube pow.
+                    veax = Sse2.AndNot(v128_negativeZero, veax);
+                    veax = Sse2.Subtract(veax, v128_one);
+                    veax = Sse2.ShiftRightArithmetic(veax, 10);
+                    veax = Sse41.MultiplyLow(veax, v128_341);
+                    veax = Sse2.Add(veax, v128_one);
+                    veax = Sse2.AndNot(v128_negativeZero, veax);
+                    veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero));
+
+                    Vector128<float> y4 = veax.AsSingle();
+
+                    if (Fma.IsSupported)
+                    {
+                        y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                        y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    }
+                    else
+                    {
+                        y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                        y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+                    }
+
+                    y4 = Sse41.Insert(y4, vecx, 0xF0); // 0xF0 copies element 3 (W) of the input into element 3 of the result, restoring alpha
+
+                    vectors128Ref = y4;
+                    vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
+                }
+
+                return;
+            }
+#endif
+            ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
+            ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);
+
+            // Fallback with scalar preprocessing and vectorized approximation steps
+            while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
+            {
+                Vector4 v = vectorsRef;
+
+                double
+                    x64 = v.X,
+                    y64 = v.Y,
+                    z64 = v.Z;
+                float a = v.W;
+
+                ulong
+                    xl = *(ulong*)&x64,
+                    yl = *(ulong*)&y64,
+                    zl = *(ulong*)&z64;
+
+                // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
+                // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
+                // this means what we actually want is to find the cube root of our clamped values.
+                // For more info on the constant below, see:
+                // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
+                // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
+                // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
+                // register, and use it to accelerate two steps of the Newton approximation using SIMD.
+                xl = 0x2a9f8a7be393b600 + (xl / 3);
+                yl = 0x2a9f8a7be393b600 + (yl / 3);
+                zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+                Vector4 y4;
+                y4.X = (float)*(double*)&xl;
+                y4.Y = (float)*(double*)&yl;
+                y4.Z = (float)*(double*)&zl;
+                y4.W = 0;
+
+                y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                y4.W = a;
+
+                vectorsRef = y4;
+                vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
+            }
+        }
    }
 }
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@ -4,6 +4,7 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@ -91,31 +92,30 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
        /// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
        /// </remarks>
-        internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
+        internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
        {
            private readonly Rectangle bounds;
            private readonly Buffer2D<Vector4> targetValues;
            private readonly Buffer2D<ComplexVector4> sourceValues;
+            private readonly KernelSamplingMap map;
            private readonly Complex64[] kernel;
            private readonly float z;
            private readonly float w;
-            private readonly int maxY;
-            private readonly int maxX;

            [MethodImpl(InliningOptions.ShortMethod)]
-            public ApplyHorizontalConvolutionRowOperation(
+            public SecondPassConvolutionRowOperation(
                Rectangle bounds,
                Buffer2D<Vector4> targetValues,
                Buffer2D<ComplexVector4> sourceValues,
+                KernelSamplingMap map,
                Complex64[] kernel,
                float z,
                float w)
            {
                this.bounds = bounds;
-                this.maxY = this.bounds.Bottom - 1;
-                this.maxX = this.bounds.Right - 1;
                this.targetValues = targetValues;
                this.sourceValues = sourceValues;
+                this.map = map;
                this.kernel = kernel;
                this.z = z;
                this.w = w;
@ -125,11 +125,33 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            [MethodImpl(InliningOptions.ShortMethod)]
            public void Invoke(int y)
            {
-                Span<Vector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;

-                for (int x = 0; x < this.bounds.Width; x++)
+                Span<int> rowOffsets = this.map.GetRowOffsetSpan();
+                ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
+
+                // The target buffer is zeroed initially and then it accumulates the results
+                // of each partial convolution, so we don't have to clear it here as well
+                ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
+                ref Complex64 kernelBase = ref this.kernel[0];
+
+                for (int kY = 0; kY < kernelSize; kY++)
                {
-                    Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
+                    // Get the precalculated source sample row and the kernel factor for this kernel row
+                    int sampleY = Unsafe.Add(ref sampleRowBase, kY);
+                    ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
+                    Complex64 factor = Unsafe.Add(ref kernelBase, kY);
+
+                    for (int x = 0; x < boundsWidth; x++)
+                    {
+                        ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
+                        ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
+                        ComplexVector4 partial = factor * sample;
+
+                        target += partial.WeightedSum(this.z, this.w);
+                    }
                }
            }
        }
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@ -26,6 +26,11 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// </summary>
        private readonly float gamma;

+        /// <summary>
+        /// The size of each complex convolution kernel.
+        /// </summary>
+        private readonly int kernelSize;
+
        /// <summary>
        /// The kernel parameters to use for the current instance (a: X, b: Y, A: Z, B: W)
        /// </summary>
@ -47,11 +52,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            : base(configuration, source, sourceRectangle)
        {
            this.gamma = definition.Gamma;
+            this.kernelSize = (definition.Radius * 2) + 1;

            // Get the bokeh blur data
            BokehBlurKernelData data = BokehBlurKernelDataProvider.GetBokehBlurKernelData(
                definition.Radius,
-                (definition.Radius * 2) + 1,
+                this.kernelSize,
                definition.Components);

            this.kernelParameters = data.Parameters;
@ -71,27 +77,49 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// <inheritdoc/>
        protected override void OnFrameApply(ImageFrame<TPixel> source)
        {
+            var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
+
            // Preliminary gamma highlight pass
-            var gammaOperation = new ApplyGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
-            ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
-                this.Configuration,
-                this.SourceRectangle,
-                in gammaOperation);
+            if (this.gamma == 3F)
+            {
+                var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
+                ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
+                    this.Configuration,
+                    sourceRectangle,
+                    in gammaOperation);
+            }
+            else
+            {
+                var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
+                ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
+                    this.Configuration,
+                    sourceRectangle,
+                    in gammaOperation);
+            }

            // Create a 0-filled buffer to use to store the result of the component convolutions
            using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);

            // Perform the 1D convolutions on all the kernel components and accumulate the results
-            this.OnFrameApplyCore(source, this.SourceRectangle, this.Configuration, processingBuffer);
-
-            float inverseGamma = 1 / this.gamma;
+            this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);

            // Apply the inverse gamma exposure pass, and write the final pixel data
-            var operation = new ApplyInverseGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
-            ParallelRowIterator.IterateRows(
-                this.Configuration,
-                this.SourceRectangle,
-                in operation);
+            if (this.gamma == 3F)
+            {
+                var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
+                ParallelRowIterator.IterateRows(
+                    this.Configuration,
+                    sourceRectangle,
+                    in operation);
+            }
+            else
+            {
+                var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
+                ParallelRowIterator.IterateRows(
+                    this.Configuration,
+                    sourceRectangle,
+                    in operation);
+            }
        }

        /// <summary>
@ -108,69 +136,129 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            Buffer2D<Vector4> processingBuffer)
        {
            // Allocate the buffer with the intermediate convolution results
-            using Buffer2D<ComplexVector4> firstPassBuffer = this.Configuration.MemoryAllocator.Allocate2D<ComplexVector4>(source.Size());
+            using Buffer2D<ComplexVector4> firstPassBuffer = configuration.MemoryAllocator.Allocate2D<ComplexVector4>(source.Size());
+
+            // Unlike in the standard 2 pass convolution processor, we use a rectangle of 1x the interest width
+            // to speed up the actual convolution, by applying bulk pixel conversion and clamping calculation.
+            // The second half of the buffer will just target the temporary buffer of complex pixel values.
+            // This is needed because the bokeh blur operates as TPixel -> complex -> TPixel, so we cannot
+            // convert back to standard pixels after each separate 1D convolution pass. Like in the gaussian
+            // blur though, we preallocate and compute the kernel sampling maps before processing each complex
+            // component, to avoid recomputing the same sampling map once per convolution pass. Since we are
+            // doing two 1D convolutions with the same kernel, we can use a single kernel sampling map as if
+            // we were using a 2D kernel with each dimension being the same as the length of our kernel, and
+            // use the two sampling offset spans resulting from this same map. This saves some extra work.
+            using var mapXY = new KernelSamplingMap(configuration.MemoryAllocator);
+
+            mapXY.BuildSamplingOffsetMap(this.kernelSize, this.kernelSize, sourceRectangle);

-            // Perform two 1D convolutions for each component in the current instance
            ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan());
            ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan());
+
+            // Perform two 1D convolutions for each component in the current instance
            for (int i = 0; i < this.kernels.Length; i++)
            {
                // Compute the resulting complex buffer for the current component
                Complex64[] kernel = Unsafe.Add(ref baseRef, i);
                Vector4 parameters = Unsafe.Add(ref paramsRef, i);

-                // Compute the vertical 1D convolution
-                var verticalOperation = new ApplyVerticalConvolutionRowOperation(sourceRectangle, firstPassBuffer, source.PixelBuffer, kernel);
-                ParallelRowIterator.IterateRows(
+                // Horizontal convolution
+                var horizontalOperation = new FirstPassConvolutionRowOperation(
+                    sourceRectangle,
+                    firstPassBuffer,
+                    source.PixelBuffer,
+                    mapXY,
+                    kernel,
+                    configuration);
+
+                ParallelRowIterator.IterateRows<FirstPassConvolutionRowOperation, Vector4>(
                    configuration,
                    sourceRectangle,
-                    in verticalOperation);
+                    in horizontalOperation);
+
+                // Vertical 1D convolutions to accumulate the partial results on the target buffer
+                var verticalOperation = new BokehBlurProcessor.SecondPassConvolutionRowOperation(
+                    sourceRectangle,
+                    processingBuffer,
+                    firstPassBuffer,
+                    mapXY,
+                    kernel,
+                    parameters.Z,
+                    parameters.W);

-                // Compute the horizontal 1D convolutions and accumulate the partial results on the target buffer
-                var horizontalOperation = new BokehBlurProcessor.ApplyHorizontalConvolutionRowOperation(sourceRectangle, processingBuffer, firstPassBuffer, kernel, parameters.Z, parameters.W);
                ParallelRowIterator.IterateRows(
                    configuration,
                    sourceRectangle,
-                    in horizontalOperation);
+                    in verticalOperation);
            }
        }

        /// <summary>
        /// A <see langword="struct"/> implementing the vertical convolution logic for <see cref="BokehBlurProcessor{T}"/>.
        /// </summary>
-        private readonly struct ApplyVerticalConvolutionRowOperation : IRowOperation
+        private readonly struct FirstPassConvolutionRowOperation : IRowOperation<Vector4>
        {
            private readonly Rectangle bounds;
            private readonly Buffer2D<ComplexVector4> targetValues;
            private readonly Buffer2D<TPixel> sourcePixels;
+            private readonly KernelSamplingMap map;
            private readonly Complex64[] kernel;
-            private readonly int maxY;
-            private readonly int maxX;
+            private readonly Configuration configuration;

            [MethodImpl(InliningOptions.ShortMethod)]
-            public ApplyVerticalConvolutionRowOperation(
+            public FirstPassConvolutionRowOperation(
                Rectangle bounds,
                Buffer2D<ComplexVector4> targetValues,
                Buffer2D<TPixel> sourcePixels,
-                Complex64[] kernel)
+                KernelSamplingMap map,
+                Complex64[] kernel,
+                Configuration configuration)
            {
                this.bounds = bounds;
-                this.maxY = this.bounds.Bottom - 1;
-                this.maxX = this.bounds.Right - 1;
                this.targetValues = targetValues;
                this.sourcePixels = sourcePixels;
+                this.map = map;
                this.kernel = kernel;
+                this.configuration = configuration;
            }

            /// <inheritdoc/>
            [MethodImpl(InliningOptions.ShortMethod)]
-            public void Invoke(int y)
+            public void Invoke(int y, Span<Vector4> span)
            {
-                Span<ComplexVector4> targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;

-                for (int x = 0; x < this.bounds.Width; x++)
+                // Clear the target buffer for each row run
+                Span<ComplexVector4> targetBuffer = this.targetValues.GetRowSpan(y);
+                targetBuffer.Clear();
+                ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+
+                // Execute the bulk pixel format conversion for the current row
+                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, span);
+
+                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
+                ref Complex64 kernelBase = ref this.kernel[0];
+                ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan());
+
+                for (int x = 0; x < span.Length; x++)
                {
-                    Buffer2DUtils.Convolve4(this.kernel, this.sourcePixels, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX);
+                    ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x);
+
+                    for (int kX = 0; kX < kernelSize; kX++)
+                    {
+                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                        Complex64 factor = Unsafe.Add(ref kernelBase, kX);
+
+                        target.Sum(factor * sample);
+                    }
+
+                    // Shift the base column sampling reference by one row at the end of each outer
+                    // iteration so that the inner tight loop indexing can skip the multiplication
+                    sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize);
                }
            }
        }
@ -218,6 +306,40 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            }
        }

+        /// <summary>
+        /// A <see langword="struct"/> implementing the gamma exposure logic for <see cref="BokehBlurProcessor{T}"/> for the fast path where gamma is exactly 3.
+        /// </summary>
+        private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Configuration configuration;
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public ApplyGamma3ExposureRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Configuration configuration)
+            {
+                this.bounds = bounds;
+                this.targetPixels = targetPixels;
+                this.configuration = configuration;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void Invoke(int y, Span<Vector4> span)
+            {
+                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
+
+                Numerics.CubePowOnXYZ(span);
+
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+            }
+        }
+
        /// <summary>
        /// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
        /// </summary>
@ -267,5 +389,44 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
            }
        }
+
+        /// <summary>
+        /// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>; each source row is modified in place before being written to the target pixels.
+        /// </summary>
+        private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Buffer2D<Vector4> sourceValues;
+            private readonly Configuration configuration;
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public ApplyInverseGamma3ExposureRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Buffer2D<Vector4> sourceValues,
+                Configuration configuration)
+            {
+                this.bounds = bounds;
+                this.targetPixels = targetPixels;
+                this.sourceValues = sourceValues;
+                this.configuration = configuration;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void Invoke(int y)
+            {
+                Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X, this.bounds.Width);
+
+                // Clamp every component to [0, +Infinity) ahead of Numerics.CubeRootOnXYZ; both calls rewrite the source row.
+                Numerics.Clamp(MemoryMarshal.Cast<Vector4, float>(sourceRowSpan), 0, float.PositiveInfinity);
+                Numerics.CubeRootOnXYZ(sourceRowSpan);
+
+                Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan, targetPixelSpan, PixelConversionModifiers.Premultiply);
+            }
+        }
    }
 }
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
@ -1,10 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

-using System;
 using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
--- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
@ -31,9 +31,16 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// <param name="kernel">The convolution kernel.</param>
        /// <param name="bounds">The source bounds.</param>
        public void BuildSamplingOffsetMap(DenseMatrix<float> kernel, Rectangle bounds)
+            => this.BuildSamplingOffsetMap(kernel.Rows, kernel.Columns, bounds);
+
+        /// <summary>
+        /// Builds a map of the sampling offsets for the kernel clamped by the given bounds.
+        /// </summary>
+        /// <param name="kernelHeight">The height (number of rows) of the convolution kernel to use.</param>
+        /// <param name="kernelWidth">The width (number of columns) of the convolution kernel to use.</param>
+        /// <param name="bounds">The source bounds.</param>
+        public void BuildSamplingOffsetMap(int kernelHeight, int kernelWidth, Rectangle bounds)
        {
-            int kernelHeight = kernel.Rows;
-            int kernelWidth = kernel.Columns;
            this.yOffsets = this.allocator.Allocate<int>(bounds.Height * kernelHeight);
            this.xOffsets = this.allocator.Allocate<int>(bounds.Width * kernelWidth);

@ -92,8 +99,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        {
            if (!this.isDisposed)
            {
-                this.yOffsets.Dispose();
-                this.xOffsets.Dispose();
+                this.yOffsets?.Dispose();
+                this.xOffsets?.Dispose();

                this.isDisposed = true;
            }
--- a/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
+++ b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
@ -0,0 +1,22 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Processing;
+
+namespace SixLabors.ImageSharp.Benchmarks.Samplers
+{
+    /// <summary>Benchmarks the default bokeh blur on a 400x400 white <see cref="Rgba32"/> image.</summary>
+    [Config(typeof(Config.MultiFramework))]
+    public class BokehBlur
+    {
+        /// <summary>Allocates the image, applies <c>BokehBlur()</c> with default arguments and disposes the image.</summary>
+        [Benchmark]
+        public void Blur()
+        {
+            using Image<Rgba32> image = new Image<Rgba32>(Configuration.Default, 400, 400, Color.White);
+            image.Mutate(context => context.BokehBlur());
+        }
+    }
+}
--- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
@ -6,7 +6,6 @@ using System.Collections.Generic;
 using System.Globalization;
 using System.Linq;
 using System.Text.RegularExpressions;
-using Microsoft.DotNet.RemoteExecutor;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Processing;
@ -44,9 +43,8 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
        [InlineData(20, 4, -10f)]
        [InlineData(20, 4, 0f)]
        public void VerifyBokehBlurProcessorArguments_Fail(int radius, int components, float gamma)
-        {
-            Assert.Throws<ArgumentOutOfRangeException>(() => new BokehBlurProcessor(radius, components, gamma));
-        }
+            => Assert.Throws<ArgumentOutOfRangeException>(
+                () => new BokehBlurProcessor(radius, components, gamma));

        [Fact]
        public void VerifyComplexComponents()
@ -137,12 +135,10 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
        [WithTestPatternImages(nameof(BokehBlurValues), 30, 20, PixelTypes.Rgba32)]
        public void BokehBlurFilterProcessor<TPixel>(TestImageProvider<TPixel> provider, BokehBlurInfo value)
            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            provider.RunValidatingProcessorTest(
+            => provider.RunValidatingProcessorTest(
                x => x.BokehBlur(value.Radius, value.Components, value.Gamma),
                testOutputDetails: value.ToString(),
                appendPixelTypeToFileName: false);
-        }

        [Theory]
        /*
@ -152,18 +148,23 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
        [WithTestPatternImages(200, 200, PixelTypes.Bgr24 | PixelTypes.Bgra32)]
        public void BokehBlurFilterProcessor_WorksWithAllPixelTypes<TPixel>(TestImageProvider<TPixel> provider)
            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            provider.RunValidatingProcessorTest(
-                    x => x.BokehBlur(8, 2, 3),
-                    appendSourceFileOrDescription: false);
-        }
+            => provider.RunValidatingProcessorTest(
+                x => x.BokehBlur(8, 2, 3),
+                appendSourceFileOrDescription: false);

        [Theory]
        [WithFileCollection(nameof(TestFiles), nameof(BokehBlurValues), PixelTypes.Rgba32)]
-        public void BokehBlurFilterProcessor_Bounded<TPixel>(TestImageProvider<TPixel> provider, BokehBlurInfo value)
-            where TPixel : unmanaged, IPixel<TPixel>
+        public void BokehBlurFilterProcessor_Bounded(TestImageProvider<Rgba32> provider, BokehBlurInfo value)
        {
-            provider.RunValidatingProcessorTest(
+            static void RunTest(string arg1, string arg2)
+            {
+                TestImageProvider<Rgba32> provider =
+                    FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgba32>>(arg1);
+
+                BokehBlurInfo value =
+                    FeatureTestRunner.DeserializeForXunit<BokehBlurInfo>(arg2);
+
+                provider.RunValidatingProcessorTest(
                x =>
                {
                    Size size = x.GetCurrentSize();
@ -172,14 +173,19 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution
                },
                testOutputDetails: value.ToString(),
                appendPixelTypeToFileName: false);
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                HwIntrinsics.DisableSSE41,
+                provider,
+                value);
        }

        [Theory]
        [WithTestPatternImages(100, 300, PixelTypes.Bgr24)]
        public void WorksWithDiscoBuffers<TPixel>(TestImageProvider<TPixel> provider)
            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            provider.RunBufferCapacityLimitProcessorTest(41, c => c.BokehBlur());
-        }
+            => provider.RunBufferCapacityLimitProcessorTest(260, c => c.BokehBlur());
    }
 }
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@ -211,6 +211,53 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
            }
        }

+        /// <summary>
+        /// Runs the given test <paramref name="action"/> once per feature in <paramref name="intrinsics"/>: in a remote
+        /// process with that feature's <c>COMPlus_</c> switch set to "0", or in-process for <see cref="HwIntrinsics.AllowAll"/>.
+        /// </summary>
+        /// <typeparam name="T">The type of the first argument.</typeparam>
+        /// <typeparam name="T2">The type of the second argument.</typeparam>
+        /// <param name="action">The test action to run; it receives both arguments in serialized form.</param>
+        /// <param name="intrinsics">The intrinsics features.</param>
+        /// <param name="arg1">The value to pass as a parameter to the test action.</param>
+        /// <param name="arg2">The second value to pass as a parameter to the test action.</param>
+        public static void RunWithHwIntrinsicsFeature<T, T2>(
+            Action<string, string> action,
+            HwIntrinsics intrinsics,
+            T arg1,
+            T2 arg2)
+            where T : IXunitSerializable
+            where T2 : IXunitSerializable
+        {
+            if (!RemoteExecutor.IsSupported)
+            {
+                return;
+            }
+
+            // The serialized arguments do not change between features, so build them once.
+            string serialized1 = BasicSerializer.Serialize(arg1);
+            string serialized2 = BasicSerializer.Serialize(arg2);
+
+            foreach (KeyValuePair<HwIntrinsics, string> intrinsic in intrinsics.ToFeatureKeyValueCollection())
+            {
+                if (intrinsic.Key == HwIntrinsics.AllowAll)
+                {
+                    // Since we are running using the default architecture there is no
+                    // point creating the overhead of running the action in a separate process.
+                    action(serialized1, serialized2);
+                    continue;
+                }
+
+                var processStartInfo = new ProcessStartInfo();
+                processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0";
+
+                RemoteExecutor.Invoke(
+                    action, serialized1, serialized2,
+                    new RemoteInvokeOptions { StartInfo = processStartInfo })
+                    .Dispose();
+            }
+        }
+
        /// <summary>
        /// Runs the given test <paramref name="action"/> within an environment
        /// where the given <paramref name="intrinsics"/> features.