Merge pull request #1477 from SixLabors/sp/2pass-convolution-speedup

1D convolution optimization and general codegen tweaks
5 years ago · b4e7d80ec4
11 changed files with 579 additions and 394 deletions
--- a/src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs
+++ b/src/ImageSharp/ColorSpaces/Companding/SRgbCompanding.cs
@ -1,4 +1,4 @@
-// Copyright (c) Six Labors.
+// Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

 using System;
@ -25,12 +25,14 @@ namespace SixLabors.ImageSharp.ColorSpaces.Companding
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Expand(Span<Vector4> vectors)
        {
-            ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+            ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors);
+            ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length);

-            for (int i = 0; i < vectors.Length; i++)
+            while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd))
            {
-                ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
-                Expand(ref v);
+                Expand(ref vectorsStart);
+
+                vectorsStart = ref Unsafe.Add(ref vectorsStart, 1);
            }
        }

@ -41,12 +43,14 @@ namespace SixLabors.ImageSharp.ColorSpaces.Companding
        [MethodImpl(InliningOptions.ShortMethod)]
        public static void Compress(Span<Vector4> vectors)
        {
-            ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+            ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors);
+            ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length);

-            for (int i = 0; i < vectors.Length; i++)
+            while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd))
            {
-                ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
-                Compress(ref v);
+                Compress(ref vectorsStart);
+
+                vectorsStart = ref Unsafe.Add(ref vectorsStart, 1);
            }
        }

@ -90,4 +94,4 @@ namespace SixLabors.ImageSharp.ColorSpaces.Companding
        [MethodImpl(InliningOptions.ShortMethod)]
        public static float Compress(float channel) => channel <= 0.0031308F ? 12.92F * channel : (1.055F * MathF.Pow(channel, 0.416666666666667F)) - 0.055F;
    }
-}
+}
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@ -41,13 +41,11 @@ namespace SixLabors.ImageSharp

        /// <summary>
        /// Determine the Least Common Multiple (LCM) of two numbers.
+        /// See https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor.
        /// </summary>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int LeastCommonMultiple(int a, int b)
-        {
-            // https://en.wikipedia.org/wiki/Least_common_multiple#Reduction_by_the_greatest_common_divisor
-            return (a / GreatestCommonDivisor(a, b)) * b;
-        }
+            => a / GreatestCommonDivisor(a, b) * b;

        /// <summary>
        /// Calculates <paramref name="x"/> % 2
@ -290,10 +288,14 @@ namespace SixLabors.ImageSharp

            if (remainder.Length > 0)
            {
-                for (int i = 0; i < remainder.Length; i++)
+                ref byte remainderStart = ref MemoryMarshal.GetReference(remainder);
+                ref byte remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length);
+
+                while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd))
                {
-                    ref byte v = ref remainder[i];
-                    v = Clamp(v, min, max);
+                    remainderStart = Clamp(remainderStart, min, max);
+
+                    remainderStart = ref Unsafe.Add(ref remainderStart, 1);
                }
            }
        }
@ -311,10 +313,14 @@ namespace SixLabors.ImageSharp

            if (remainder.Length > 0)
            {
-                for (int i = 0; i < remainder.Length; i++)
+                ref uint remainderStart = ref MemoryMarshal.GetReference(remainder);
+                ref uint remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length);
+
+                while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd))
                {
-                    ref uint v = ref remainder[i];
-                    v = Clamp(v, min, max);
+                    remainderStart = Clamp(remainderStart, min, max);
+
+                    remainderStart = ref Unsafe.Add(ref remainderStart, 1);
                }
            }
        }
@ -332,10 +338,14 @@ namespace SixLabors.ImageSharp

            if (remainder.Length > 0)
            {
-                for (int i = 0; i < remainder.Length; i++)
+                ref int remainderStart = ref MemoryMarshal.GetReference(remainder);
+                ref int remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length);
+
+                while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd))
                {
-                    ref int v = ref remainder[i];
-                    v = Clamp(v, min, max);
+                    remainderStart = Clamp(remainderStart, min, max);
+
+                    remainderStart = ref Unsafe.Add(ref remainderStart, 1);
                }
            }
        }
@ -353,10 +363,14 @@ namespace SixLabors.ImageSharp

            if (remainder.Length > 0)
            {
-                for (int i = 0; i < remainder.Length; i++)
+                ref float remainderStart = ref MemoryMarshal.GetReference(remainder);
+                ref float remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length);
+
+                while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd))
                {
-                    ref float v = ref remainder[i];
-                    v = Clamp(v, min, max);
+                    remainderStart = Clamp(remainderStart, min, max);
+
+                    remainderStart = ref Unsafe.Add(ref remainderStart, 1);
                }
            }
        }
@ -374,10 +388,14 @@ namespace SixLabors.ImageSharp

            if (remainder.Length > 0)
            {
-                for (int i = 0; i < remainder.Length; i++)
+                ref double remainderStart = ref MemoryMarshal.GetReference(remainder);
+                ref double remainderEnd = ref Unsafe.Add(ref remainderStart, remainder.Length);
+
+                while (Unsafe.IsAddressLessThan(ref remainderStart, ref remainderEnd))
                {
-                    ref double v = ref remainder[i];
-                    v = Clamp(v, min, max);
+                    remainderStart = Clamp(remainderStart, min, max);
+
+                    remainderStart = ref Unsafe.Add(ref remainderStart, 1);
                }
            }
        }
@ -407,7 +425,6 @@ namespace SixLabors.ImageSharp
            where T : unmanaged
        {
            ref T sRef = ref MemoryMarshal.GetReference(span);
-            ref Vector<T> vsBase = ref Unsafe.As<T, Vector<T>>(ref MemoryMarshal.GetReference(span));
            var vmin = new Vector<T>(min);
            var vmax = new Vector<T>(max);

@ -415,25 +432,35 @@ namespace SixLabors.ImageSharp
            int m = Modulo4(n);
            int u = n - m;

-            for (int i = 0; i < u; i += 4)
-            {
-                ref Vector<T> vs0 = ref Unsafe.Add(ref vsBase, i);
-                ref Vector<T> vs1 = ref Unsafe.Add(ref vs0, 1);
-                ref Vector<T> vs2 = ref Unsafe.Add(ref vs0, 2);
-                ref Vector<T> vs3 = ref Unsafe.Add(ref vs0, 3);
+            ref Vector<T> vs0 = ref Unsafe.As<T, Vector<T>>(ref MemoryMarshal.GetReference(span));
+            ref Vector<T> vs1 = ref Unsafe.Add(ref vs0, 1);
+            ref Vector<T> vs2 = ref Unsafe.Add(ref vs0, 2);
+            ref Vector<T> vs3 = ref Unsafe.Add(ref vs0, 3);
+            ref Vector<T> vsEnd = ref Unsafe.Add(ref vs0, u);

+            while (Unsafe.IsAddressLessThan(ref vs0, ref vsEnd))
+            {
                vs0 = Vector.Min(Vector.Max(vmin, vs0), vmax);
                vs1 = Vector.Min(Vector.Max(vmin, vs1), vmax);
                vs2 = Vector.Min(Vector.Max(vmin, vs2), vmax);
                vs3 = Vector.Min(Vector.Max(vmin, vs3), vmax);
+
+                vs0 = ref Unsafe.Add(ref vs0, 4);
+                vs1 = ref Unsafe.Add(ref vs1, 4);
+                vs2 = ref Unsafe.Add(ref vs2, 4);
+                vs3 = ref Unsafe.Add(ref vs3, 4);
            }

            if (m > 0)
            {
-                for (int i = u; i < n; i++)
+                vs0 = ref vsEnd;
+                vsEnd = ref Unsafe.Add(ref vsEnd, m);
+
+                while (Unsafe.IsAddressLessThan(ref vs0, ref vsEnd))
                {
-                    ref Vector<T> vs0 = ref Unsafe.Add(ref vsBase, i);
                    vs0 = Vector.Min(Vector.Max(vmin, vs0), vmax);
+
+                    vs0 = ref Unsafe.Add(ref vs0, 1);
                }
            }
        }
@ -472,10 +499,8 @@ namespace SixLabors.ImageSharp
 #if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx2.IsSupported && vectors.Length >= 2)
            {
-                ref Vector256<float> vectorsBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
-
                // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+                ref Vector256<float> vectorsBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
                ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));

                while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
@ -495,12 +520,14 @@ namespace SixLabors.ImageSharp
            else
 #endif
            {
-                ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+                ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors);
+                ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length);

-                for (int i = 0; i < vectors.Length; i++)
+                while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd))
                {
-                    ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
-                    Premultiply(ref v);
+                    Premultiply(ref vectorsStart);
+
+                    vectorsStart = ref Unsafe.Add(ref vectorsStart, 1);
                }
            }
        }
@ -515,10 +542,8 @@ namespace SixLabors.ImageSharp
 #if SUPPORTS_RUNTIME_INTRINSICS
            if (Avx2.IsSupported && vectors.Length >= 2)
            {
-                ref Vector256<float> vectorsBase =
-                    ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
-
                // Divide by 2 as 4 elements per Vector4 and 8 per Vector256<float>
+                ref Vector256<float> vectorsBase = ref Unsafe.As<Vector4, Vector256<float>>(ref MemoryMarshal.GetReference(vectors));
                ref Vector256<float> vectorsLast = ref Unsafe.Add(ref vectorsBase, (IntPtr)((uint)vectors.Length / 2u));

                while (Unsafe.IsAddressLessThan(ref vectorsBase, ref vectorsLast))
@ -538,12 +563,14 @@ namespace SixLabors.ImageSharp
            else
 #endif
            {
-                ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+                ref Vector4 vectorsStart = ref MemoryMarshal.GetReference(vectors);
+                ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsStart, vectors.Length);

-                for (int i = 0; i < vectors.Length; i++)
+                while (Unsafe.IsAddressLessThan(ref vectorsStart, ref vectorsEnd))
                {
-                    ref Vector4 v = ref Unsafe.Add(ref baseRef, i);
-                    UnPremultiply(ref v);
+                    UnPremultiply(ref vectorsStart);
+
+                    vectorsStart = ref Unsafe.Add(ref vectorsStart, 1);
                }
            }
        }
@ -633,53 +660,54 @@ namespace SixLabors.ImageSharp
                    vectors128Ref = y4;
                    vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
                }
-
-                return;
            }
+            else
 #endif
-            ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
-            ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);
-
-            // Fallback with scalar preprocessing and vectorized approximation steps
-            while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
            {
-                Vector4 v = vectorsRef;
+                ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
+                ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);

-                double
-                    x64 = v.X,
-                    y64 = v.Y,
-                    z64 = v.Z;
-                float a = v.W;
-
-                ulong
-                    xl = *(ulong*)&x64,
-                    yl = *(ulong*)&y64,
-                    zl = *(ulong*)&z64;
-
-                // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
-                // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case,
-                // this means what we actually want is to find the cube root of our clamped values.
-                // For more info on the  constant below, see:
-                // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
-                // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and
-                // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
-                // register, and use it to accelerate two steps of the Newton approximation using SIMD.
-                xl = 0x2a9f8a7be393b600 + (xl / 3);
-                yl = 0x2a9f8a7be393b600 + (yl / 3);
-                zl = 0x2a9f8a7be393b600 + (zl / 3);
-
-                Vector4 y4;
-                y4.X = (float)*(double*)&xl;
-                y4.Y = (float)*(double*)&yl;
-                y4.Z = (float)*(double*)&zl;
-                y4.W = 0;
-
-                y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
-                y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
-                y4.W = a;
-
-                vectorsRef = y4;
-                vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
+                // Fallback with scalar preprocessing and vectorized approximation steps
+                while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
+                {
+                    Vector4 v = vectorsRef;
+
+                    double
+                        x64 = v.X,
+                        y64 = v.Y,
+                        z64 = v.Z;
+                    float a = v.W;
+
+                    ulong
+                        xl = *(ulong*)&x64,
+                        yl = *(ulong*)&y64,
+                        zl = *(ulong*)&z64;
+
+                    // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
+                    // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
+                    // this means what we actually want is to find the cube root of our clamped values.
+                    // For more info on the constant below, see:
+                    // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
+                    // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
+                    // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
+                    // register, and use it to accelerate two steps of the Newton approximation using SIMD.
+                    xl = 0x2a9f8a7be393b600 + (xl / 3);
+                    yl = 0x2a9f8a7be393b600 + (yl / 3);
+                    zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+                    Vector4 y4;
+                    y4.X = (float)*(double*)&xl;
+                    y4.Y = (float)*(double*)&yl;
+                    y4.Z = (float)*(double*)&zl;
+                    y4.W = 0;
+
+                    y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                    y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+                    y4.W = a;
+
+                    vectorsRef = y4;
+                    vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
+                }
            }
        }
    }
--- a/src/ImageSharp/PixelFormats/Utils/Vector4Converters.Default.cs
+++ b/src/ImageSharp/PixelFormats/Utils/Vector4Converters.Default.cs
@ -88,14 +88,16 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils
                Span<TPixel> destPixels)
                where TPixel : unmanaged, IPixel<TPixel>
            {
-                ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors);
+                ref Vector4 sourceStart = ref MemoryMarshal.GetReference(sourceVectors);
+                ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceStart, sourceVectors.Length);
                ref TPixel destRef = ref MemoryMarshal.GetReference(destPixels);

-                for (int i = 0; i < sourceVectors.Length; i++)
+                while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd))
                {
-                    ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i);
-                    ref TPixel dp = ref Unsafe.Add(ref destRef, i);
-                    dp.FromVector4(sp);
+                    destRef.FromVector4(sourceStart);
+
+                    sourceStart = ref Unsafe.Add(ref sourceStart, 1);
+                    destRef = ref Unsafe.Add(ref destRef, 1);
                }
            }

@ -105,14 +107,16 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils
                Span<Vector4> destVectors)
                where TPixel : unmanaged, IPixel<TPixel>
            {
-                ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourcePixels);
+                ref TPixel sourceStart = ref MemoryMarshal.GetReference(sourcePixels);
+                ref TPixel sourceEnd = ref Unsafe.Add(ref sourceStart, sourcePixels.Length);
                ref Vector4 destRef = ref MemoryMarshal.GetReference(destVectors);

-                for (int i = 0; i < sourcePixels.Length; i++)
+                while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd))
                {
-                    ref TPixel sp = ref Unsafe.Add(ref sourceRef, i);
-                    ref Vector4 dp = ref Unsafe.Add(ref destRef, i);
-                    dp = sp.ToVector4();
+                    destRef = sourceStart.ToVector4();
+
+                    sourceStart = ref Unsafe.Add(ref sourceStart, 1);
+                    destRef = ref Unsafe.Add(ref destRef, 1);
                }
            }

@ -122,14 +126,16 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils
                Span<TPixel> destinationColors)
                where TPixel : unmanaged, IPixel<TPixel>
            {
-                ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors);
+                ref Vector4 sourceStart = ref MemoryMarshal.GetReference(sourceVectors);
+                ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceStart, sourceVectors.Length);
                ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors);

-                for (int i = 0; i < sourceVectors.Length; i++)
+                while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd))
                {
-                    ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i);
-                    ref TPixel dp = ref Unsafe.Add(ref destRef, i);
-                    dp.FromScaledVector4(sp);
+                    destRef.FromScaledVector4(sourceStart);
+
+                    sourceStart = ref Unsafe.Add(ref sourceStart, 1);
+                    destRef = ref Unsafe.Add(ref destRef, 1);
                }
            }

@ -139,16 +145,18 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils
                Span<Vector4> destinationVectors)
                where TPixel : unmanaged, IPixel<TPixel>
            {
-                ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors);
+                ref TPixel sourceStart = ref MemoryMarshal.GetReference(sourceColors);
+                ref TPixel sourceEnd = ref Unsafe.Add(ref sourceStart, sourceColors.Length);
                ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors);

-                for (int i = 0; i < sourceColors.Length; i++)
+                while (Unsafe.IsAddressLessThan(ref sourceStart, ref sourceEnd))
                {
-                    ref TPixel sp = ref Unsafe.Add(ref sourceRef, i);
-                    ref Vector4 dp = ref Unsafe.Add(ref destRef, i);
-                    dp = sp.ToScaledVector4();
+                    destRef = sourceStart.ToScaledVector4();
+
+                    sourceStart = ref Unsafe.Add(ref sourceStart, 1);
+                    destRef = ref Unsafe.Add(ref destRef, 1);
                }
            }
        }
    }
-}
+}
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@ -129,29 +129,34 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                int boundsWidth = this.bounds.Width;
                int kernelSize = this.kernel.Length;

-                Span<int> rowOffsets = this.map.GetRowOffsetSpan();
-                ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
+                ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(this.map.GetRowOffsetSpan()), (y - this.bounds.Y) * kernelSize);

                // The target buffer is zeroed initially and then it accumulates the results
                // of each partial convolution, so we don't have to clear it here as well
                ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
-                ref Complex64 kernelBase = ref this.kernel[0];
+                ref Complex64 kernelStart = ref this.kernel[0];
+                ref Complex64 kernelEnd = ref Unsafe.Add(ref kernelStart, kernelSize);

-                for (int kY = 0; kY < kernelSize; kY++)
+                while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd))
                {
                    // Get the precalculated source sample row for this kernel row and copy to our buffer
-                    int sampleY = Unsafe.Add(ref sampleRowBase, kY);
-                    ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
-                    Complex64 factor = Unsafe.Add(ref kernelBase, kY);
+                    ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleRowBase);
+                    ref ComplexVector4 sourceEnd = ref Unsafe.Add(ref sourceBase, boundsWidth);
+                    ref Vector4 targetStart = ref targetBase;
+                    Complex64 factor = kernelStart;

-                    for (int x = 0; x < boundsWidth; x++)
+                    while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd))
                    {
-                        ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-                        ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
-                        ComplexVector4 partial = factor * sample;
+                        ComplexVector4 partial = factor * sourceBase;

-                        target += partial.WeightedSum(this.z, this.w);
+                        targetStart += partial.WeightedSum(this.z, this.w);
+
+                        sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                        targetStart = ref Unsafe.Add(ref targetStart, 1);
                    }
+
+                    kernelStart = ref Unsafe.Add(ref kernelStart, 1);
+                    sampleRowBase = ref Unsafe.Add(ref sampleRowBase, 1);
                }
            }
        }
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@ -233,32 +233,37 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                // Clear the target buffer for each row run
                Span<ComplexVector4> targetBuffer = this.targetValues.GetRowSpan(y);
                targetBuffer.Clear();
-                ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);

                // Execute the bulk pixel format conversion for the current row
                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, span);

                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
+                ref ComplexVector4 targetStart = ref MemoryMarshal.GetReference(targetBuffer);
+                ref ComplexVector4 targetEnd = ref Unsafe.Add(ref targetStart, span.Length);
                ref Complex64 kernelBase = ref this.kernel[0];
+                ref Complex64 kernelEnd = ref Unsafe.Add(ref kernelBase, kernelSize);
                ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan());

-                for (int x = 0; x < span.Length; x++)
+                while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd))
                {
-                    ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x);
+                    ref Complex64 kernelStart = ref kernelBase;
+                    ref int sampleColumnStart = ref sampleColumnBase;

-                    for (int kX = 0; kX < kernelSize; kX++)
+                    while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd))
                    {
-                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
-                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
-                        Complex64 factor = Unsafe.Add(ref kernelBase, kX);
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleColumnStart - boundsX);

-                        target.Sum(factor * sample);
+                        targetStart.Sum(kernelStart * sample);
+
+                        kernelStart = ref Unsafe.Add(ref kernelStart, 1);
+                        sampleColumnStart = ref Unsafe.Add(ref sampleColumnStart, 1);
                    }

                    // Shift the base column sampling reference by one row at the end of each outer
                    // iteration so that the inner tight loop indexing can skip the multiplication
                    sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize);
+                    targetStart = ref Unsafe.Add(ref targetStart, 1);
                }
            }
        }
--- a/src/ImageSharp/Processing/Processors/Convolution/BoxBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BoxBlurProcessor{TPixel}.cs
@ -1,6 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

+using System;
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp.Processing.Processors.Convolution
@ -23,24 +24,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            : base(configuration, source, sourceRectangle)
        {
            int kernelSize = (definition.Radius * 2) + 1;
-            this.KernelX = CreateBoxKernel(kernelSize);
-            this.KernelY = this.KernelX.Transpose();
+            this.Kernel = CreateBoxKernel(kernelSize);
        }

        /// <summary>
-        /// Gets the horizontal gradient operator.
+        /// Gets the 1D convolution kernel.
        /// </summary>
-        public DenseMatrix<float> KernelX { get; }
-
-        /// <summary>
-        /// Gets the vertical gradient operator.
-        /// </summary>
-        public DenseMatrix<float> KernelY { get; }
+        public float[] Kernel { get; }

        /// <inheritdoc/>
        protected override void OnFrameApply(ImageFrame<TPixel> source)
        {
-            using var processor = new Convolution2PassProcessor<TPixel>(this.Configuration, this.KernelX, this.KernelY, false, this.Source, this.SourceRectangle);
+            using var processor = new Convolution2PassProcessor<TPixel>(this.Configuration, this.Kernel, false, this.Source, this.SourceRectangle);

            processor.Apply(source);
        }
@ -50,10 +45,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// </summary>
        /// <param name="kernelSize">The maximum size of the kernel in either direction.</param>
-        /// <returns>The <see cref="DenseMatrix{T}"/>.</returns>
+        /// <returns>The 1D kernel as a <see cref="float"/> array.</returns>
-        private static DenseMatrix<float> CreateBoxKernel(int kernelSize)
+        private static float[] CreateBoxKernel(int kernelSize)
        {
-            var kernel = new DenseMatrix<float>(kernelSize, 1);
-            kernel.Fill(1F / kernelSize);
+            var kernel = new float[kernelSize];
+
+            kernel.AsSpan().Fill(1F / kernelSize);
+
            return kernel;
        }
    }
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
@ -1,7 +1,10 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

+using System;
 using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@ -19,34 +22,26 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// Initializes a new instance of the <see cref="Convolution2PassProcessor{TPixel}"/> class.
        /// </summary>
        /// <param name="configuration">The configuration which allows altering default behaviour or extending the library.</param>
-        /// <param name="kernelX">The horizontal gradient operator.</param>
-        /// <param name="kernelY">The vertical gradient operator.</param>
+        /// <param name="kernel">The 1D convolution kernel.</param>
        /// <param name="preserveAlpha">Whether the convolution filter is applied to alpha as well as the color channels.</param>
        /// <param name="source">The source <see cref="Image{TPixel}"/> for the current processor instance.</param>
        /// <param name="sourceRectangle">The source area to process for the current processor instance.</param>
        public Convolution2PassProcessor(
            Configuration configuration,
-            in DenseMatrix<float> kernelX,
-            in DenseMatrix<float> kernelY,
+            float[] kernel,
            bool preserveAlpha,
            Image<TPixel> source,
            Rectangle sourceRectangle)
            : base(configuration, source, sourceRectangle)
        {
-            this.KernelX = kernelX;
-            this.KernelY = kernelY;
+            this.Kernel = kernel;
            this.PreserveAlpha = preserveAlpha;
        }

        /// <summary>
-        /// Gets the horizontal convolution kernel.
+        /// Gets the convolution kernel.
        /// </summary>
-        public DenseMatrix<float> KernelX { get; }
-
-        /// <summary>
-        /// Gets the vertical convolution kernel.
-        /// </summary>
-        public DenseMatrix<float> KernelY { get; }
+        public float[] Kernel { get; }

        /// <summary>
        /// Gets a value indicating whether the convolution filter is applied to alpha as well as the color channels.
@ -64,44 +59,364 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            // for source and target bulk pixel conversion.
            var operationBounds = new Rectangle(interest.X, interest.Y, interest.Width * 2, interest.Height);

-            using (var mapX = new KernelSamplingMap(this.Configuration.MemoryAllocator))
+            // We can create a single sampling map with the size as if we were using the non separated 2D kernel
+            // the two 1D kernels represent, and reuse it across both convolution steps, like in the bokeh blur.
+            using var mapXY = new KernelSamplingMap(this.Configuration.MemoryAllocator);
+
+            mapXY.BuildSamplingOffsetMap(this.Kernel.Length, this.Kernel.Length, interest);
+
+            // Horizontal convolution
+            var horizontalOperation = new HorizontalConvolutionRowOperation(
+                interest,
+                firstPassPixels,
+                source.PixelBuffer,
+                mapXY,
+                this.Kernel,
+                this.Configuration,
+                this.PreserveAlpha);
+
+            ParallelRowIterator.IterateRows<HorizontalConvolutionRowOperation, Vector4>(
+                this.Configuration,
+                operationBounds,
+                in horizontalOperation);
+
+            // Vertical convolution
+            var verticalOperation = new VerticalConvolutionRowOperation(
+                interest,
+                source.PixelBuffer,
+                firstPassPixels,
+                mapXY,
+                this.Kernel,
+                this.Configuration,
+                this.PreserveAlpha);
+
+            ParallelRowIterator.IterateRows<VerticalConvolutionRowOperation, Vector4>(
+                this.Configuration,
+                operationBounds,
+                in verticalOperation);
+        }
+
+        /// <summary>
+        /// A <see langword="struct"/> implementing the logic for the horizontal 1D convolution.
+        /// </summary>
+        internal readonly struct HorizontalConvolutionRowOperation : IRowOperation<Vector4>
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Buffer2D<TPixel> sourcePixels;
+            private readonly KernelSamplingMap map;
+            private readonly float[] kernel;
+            private readonly Configuration configuration;
+            private readonly bool preserveAlpha;
+
+            /// <summary>
+            /// Initializes a new instance of the <see cref="HorizontalConvolutionRowOperation"/> struct.
+            /// </summary>
+            /// <param name="bounds">The region of each row that is read and written.</param>
+            /// <param name="targetPixels">The buffer receiving the convolved rows.</param>
+            /// <param name="sourcePixels">The buffer the rows are read from.</param>
+            /// <param name="map">The sampling map providing the precalculated column offsets.</param>
+            /// <param name="kernel">The 1D kernel weights.</param>
+            /// <param name="configuration">The configuration passed to the bulk pixel conversions.</param>
+            /// <param name="preserveAlpha">Whether alpha is copied from the source instead of being convolved.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public HorizontalConvolutionRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Buffer2D<TPixel> sourcePixels,
+                KernelSamplingMap map,
+                float[] kernel,
+                Configuration configuration,
+                bool preserveAlpha)
+            {
+                // Pixel buffers.
+                this.sourcePixels = sourcePixels;
+                this.targetPixels = targetPixels;
+
+                // Sampling state.
+                this.bounds = bounds;
+                this.map = map;
+                this.kernel = kernel;
+
+                // Conversion and alpha handling.
+                this.configuration = configuration;
+                this.preserveAlpha = preserveAlpha;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public void Invoke(int y, Span<Vector4> span)
            {
-                mapX.BuildSamplingOffsetMap(this.KernelX, interest);
-
-                // Horizontal convolution
-                var horizontalOperation = new ConvolutionRowOperation<TPixel>(
-                    interest,
-                    firstPassPixels,
-                    source.PixelBuffer,
-                    mapX,
-                    this.KernelX,
-                    this.Configuration,
-                    this.PreserveAlpha);
-
-                ParallelRowIterator.IterateRows<ConvolutionRowOperation<TPixel>, Vector4>(
-                    this.Configuration,
-                    operationBounds,
-                    in horizontalOperation);
+                // Alpha is either convolved together with the color channels (Convolve4)
+                // or copied over from the source row afterwards (Convolve3).
+                if (!this.preserveAlpha)
+                {
+                    this.Convolve4(y, span);
+                }
+                else
+                {
+                    this.Convolve3(y, span);
+                }
            }

-            using (var mapY = new KernelSamplingMap(this.Configuration.MemoryAllocator))
+            /// <summary>
+            /// Convolves row <paramref name="y"/> horizontally, then restores each pixel's alpha from the source row.
+            /// </summary>
+            /// <param name="y">The row to process.</param>
+            /// <param name="span">Scratch memory; the first half holds the source row, the second half accumulates the result.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private void Convolve3(int y, Span<Vector4> span)
            {
-                mapY.BuildSamplingOffsetMap(this.KernelY, interest);
-
-                // Vertical convolution
-                var verticalOperation = new ConvolutionRowOperation<TPixel>(
-                    interest,
-                    source.PixelBuffer,
-                    firstPassPixels,
-                    mapY,
-                    this.KernelY,
-                    this.Configuration,
-                    this.PreserveAlpha);
-
-                ParallelRowIterator.IterateRows<ConvolutionRowOperation<TPixel>, Vector4>(
-                    this.Configuration,
-                    operationBounds,
-                    in verticalOperation);
+                // Span is 2x bounds.
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;
+
+                Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
+                Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
+
+                // Clear the target buffer for each row run; the kernel loop accumulates into it.
+                targetBuffer.Clear();
+
+                // Convert the pixels of row y within the bounds to Vector4.
+                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+                ref Vector4 targetStart = ref MemoryMarshal.GetReference(targetBuffer);
+                ref Vector4 targetEnd = ref Unsafe.Add(ref targetStart, sourceBuffer.Length);
+                ref float kernelBase = ref this.kernel[0];
+                ref float kernelEnd = ref Unsafe.Add(ref kernelBase, kernelSize);
+                ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan());
+
+                while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd))
+                {
+                    // Restart the kernel for every target pixel; the column offsets continue
+                    // from where the previous pixel's run of kernelSize entries ended.
+                    ref float kernelStart = ref kernelBase;
+                    ref int sampleColumnStart = ref sampleColumnBase;
+
+                    while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd))
+                    {
+                        // Map entries are rebased by boundsX to index the row-local buffer.
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleColumnStart - boundsX);
+
+                        targetStart += kernelStart * sample;
+
+                        kernelStart = ref Unsafe.Add(ref kernelStart, 1);
+                        sampleColumnStart = ref Unsafe.Add(ref sampleColumnStart, 1);
+                    }
+
+                    targetStart = ref Unsafe.Add(ref targetStart, 1);
+                    sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize);
+                }
+
+                // Copy the original alpha values from the source row. The loops above only write to
+                // targetBuffer, so sourceBuffer still holds row y and needs no second conversion.
+                targetStart = ref MemoryMarshal.GetReference(targetBuffer);
+
+                while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd))
+                {
+                    targetStart.W = sourceBase.W;
+
+                    targetStart = ref Unsafe.Add(ref targetStart, 1);
+                    sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                }
+
+                Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow);
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private void Convolve4(int y, Span<Vector4> span)
+            {
+                // Span is 2x bounds: the first half holds the source row, the second half accumulates the result.
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;
+
+                Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
+                Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
+
+                // Clear the target buffer for each row run; the kernel loop below accumulates into it with +=.
+                targetBuffer.Clear();
+
+                // Convert the pixels of row y within the bounds to Vector4 in the source half of the span.
+                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                Numerics.Premultiply(sourceBuffer);
+
+                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+                ref Vector4 targetStart = ref MemoryMarshal.GetReference(targetBuffer);
+                ref Vector4 targetEnd = ref Unsafe.Add(ref targetStart, sourceBuffer.Length);
+                ref float kernelBase = ref this.kernel[0];
+                ref float kernelEnd = ref Unsafe.Add(ref kernelBase, kernelSize);
+                ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan());
+
+                while (Unsafe.IsAddressLessThan(ref targetStart, ref targetEnd))
+                {
+                    ref float kernelStart = ref kernelBase;
+                    ref int sampleColumnStart = ref sampleColumnBase;
+
+                    while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd))
+                    {
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleColumnStart - boundsX);
+
+                        targetStart += kernelStart * sample;
+
+                        kernelStart = ref Unsafe.Add(ref kernelStart, 1);
+                        sampleColumnStart = ref Unsafe.Add(ref sampleColumnStart, 1);
+                    }
+
+                    targetStart = ref Unsafe.Add(ref targetStart, 1);
+                    sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize);
+                }
+
+                Numerics.UnPremultiply(targetBuffer);
+
+                Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow);
+            }
+        }
+
+        /// <summary>
+        /// A <see langword="struct"/> implementing the logic for the vertical 1D convolution.
+        /// </summary>
+        internal readonly struct VerticalConvolutionRowOperation : IRowOperation<Vector4>
+        {
+            private readonly Rectangle bounds;
+            private readonly Buffer2D<TPixel> targetPixels;
+            private readonly Buffer2D<TPixel> sourcePixels;
+            private readonly KernelSamplingMap map;
+            private readonly float[] kernel;
+            private readonly Configuration configuration;
+            private readonly bool preserveAlpha;
+
+            /// <summary>
+            /// Initializes a new instance of the <see cref="VerticalConvolutionRowOperation"/> struct.
+            /// </summary>
+            /// <param name="bounds">The region of each row that is read and written.</param>
+            /// <param name="targetPixels">The buffer receiving the convolved rows.</param>
+            /// <param name="sourcePixels">The buffer the sampled rows are read from.</param>
+            /// <param name="map">The sampling map providing the precalculated row offsets.</param>
+            /// <param name="kernel">The 1D kernel weights.</param>
+            /// <param name="configuration">The configuration passed to the bulk pixel conversions.</param>
+            /// <param name="preserveAlpha">Whether alpha is copied from the source instead of being convolved.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public VerticalConvolutionRowOperation(
+                Rectangle bounds,
+                Buffer2D<TPixel> targetPixels,
+                Buffer2D<TPixel> sourcePixels,
+                KernelSamplingMap map,
+                float[] kernel,
+                Configuration configuration,
+                bool preserveAlpha)
+            {
+                // Pixel buffers.
+                this.sourcePixels = sourcePixels;
+                this.targetPixels = targetPixels;
+
+                // Sampling state.
+                this.bounds = bounds;
+                this.map = map;
+                this.kernel = kernel;
+
+                // Conversion and alpha handling.
+                this.configuration = configuration;
+                this.preserveAlpha = preserveAlpha;
+            }
+
+            /// <inheritdoc/>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            public void Invoke(int y, Span<Vector4> span)
+            {
+                // Alpha is either convolved together with the color channels (Convolve4)
+                // or copied over from the source row afterwards (Convolve3).
+                if (!this.preserveAlpha)
+                {
+                    this.Convolve4(y, span);
+                }
+                else
+                {
+                    this.Convolve3(y, span);
+                }
+            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private void Convolve3(int y, Span<Vector4> span)
+            {
+                // Span is 2x bounds: the first half holds one sampled row at a time, the second half accumulates the result.
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;
+
+                Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
+                Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
+
+                ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(this.map.GetRowOffsetSpan()), (y - this.bounds.Y) * kernelSize);
+
+                // Clear the target buffer for each row run; every kernel weight below accumulates into it with +=.
+                targetBuffer.Clear();
+
+                ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+                ref float kernelStart = ref this.kernel[0];
+                ref float kernelEnd = ref Unsafe.Add(ref kernelStart, kernelSize);
+
+                Span<TPixel> sourceRow;
+                while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd))
+                {
+                    // Look up the precalculated source row index for this kernel weight and convert that row to Vector4.
+                    sourceRow = this.sourcePixels.GetRowSpan(sampleRowBase).Slice(boundsX, boundsWidth);
+
+                    PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+                    ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length);
+                    ref Vector4 targetStart = ref targetBase;
+                    float factor = kernelStart;
+
+                    while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd))
+                    {
+                        targetStart += factor * sourceBase;
+
+                        sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                        targetStart = ref Unsafe.Add(ref targetStart, 1);
+                    }
+
+                    kernelStart = ref Unsafe.Add(ref kernelStart, 1);
+                    sampleRowBase = ref Unsafe.Add(ref sampleRowBase, 1);
+                }
+
+                // sourceBuffer now holds the last sampled row, so re-read row y to copy its original alpha values.
+                sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+                {
+                    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+                    ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length);
+
+                    while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd))
+                    {
+                        targetBase.W = sourceBase.W;
+
+                        targetBase = ref Unsafe.Add(ref targetBase, 1);
+                        sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                    }
+                }
+
+                Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow);
+            }
+
+            /// <summary>
+            /// Convolves row <paramref name="y"/> vertically across all four components,
+            /// premultiplying each sampled row before accumulation and reverting that on the result.
+            /// </summary>
+            /// <param name="y">The row to process.</param>
+            /// <param name="span">Scratch memory; the first half holds one sampled row at a time, the second half accumulates the result.</param>
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private void Convolve4(int y, Span<Vector4> span)
+            {
+                // Span is 2x bounds.
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                int kernelSize = this.kernel.Length;
+
+                Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
+                Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
+
+                // The row map stores kernelSize precalculated row indices per output row.
+                ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(this.map.GetRowOffsetSpan()), (y - this.bounds.Y) * kernelSize);
+
+                // Clear the target buffer for each row run; every kernel weight accumulates into it.
+                targetBuffer.Clear();
+
+                ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+                ref float kernelStart = ref this.kernel[0];
+                ref float kernelEnd = ref Unsafe.Add(ref kernelStart, kernelSize);
+
+                Span<TPixel> sourceRow;
+                while (Unsafe.IsAddressLessThan(ref kernelStart, ref kernelEnd))
+                {
+                    // Get the precalculated source sample row for this kernel row and copy to our buffer.
+                    sourceRow = this.sourcePixels.GetRowSpan(sampleRowBase).Slice(boundsX, boundsWidth);
+
+                    PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                    // Every sampled row must be premultiplied before it is accumulated; the result is
+                    // unpremultiplied below, matching the horizontal pass.
+                    Numerics.Premultiply(sourceBuffer);
+
+                    ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+                    ref Vector4 sourceEnd = ref Unsafe.Add(ref sourceBase, sourceBuffer.Length);
+                    ref Vector4 targetStart = ref targetBase;
+                    float factor = kernelStart;
+
+                    while (Unsafe.IsAddressLessThan(ref sourceBase, ref sourceEnd))
+                    {
+                        targetStart += factor * sourceBase;
+
+                        sourceBase = ref Unsafe.Add(ref sourceBase, 1);
+                        targetStart = ref Unsafe.Add(ref targetStart, 1);
+                    }
+
+                    kernelStart = ref Unsafe.Add(ref kernelStart, 1);
+                    sampleRowBase = ref Unsafe.Add(ref sampleRowBase, 1);
+                }
+
+                Numerics.UnPremultiply(targetBuffer);
+
+                Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow);
            }
        }
    }
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessorHelpers.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessorHelpers.cs
@ -12,17 +12,15 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// See <see href="http://chemaguerra.com/gaussian-filter-radius/"/>.
        /// </summary>
        internal static int GetDefaultGaussianRadius(float sigma)
-        {
-            return (int)MathF.Ceiling(sigma * 3);
-        }
+            => (int)MathF.Ceiling(sigma * 3);

        /// <summary>
        /// Create a 1 dimensional Gaussian kernel using the Gaussian G(x) function.
        /// </summary>
-        /// <returns>The <see cref="DenseMatrix{T}"/>.</returns>
-        internal static DenseMatrix<float> CreateGaussianBlurKernel(int size, float weight)
+        /// <returns>The convolution kernel.</returns>
+        internal static float[] CreateGaussianBlurKernel(int size, float weight)
        {
-            var kernel = new DenseMatrix<float>(size, 1);
+            var kernel = new float[size];

            float sum = 0F;
            float midpoint = (size - 1) / 2F;
@ -32,13 +30,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                float x = i - midpoint;
                float gx = Numerics.Gaussian(x, weight);
                sum += gx;
-                kernel[0, i] = gx;
+                kernel[i] = gx;
            }

            // Normalize kernel so that the sum of all weights equals 1
            for (int i = 0; i < size; i++)
            {
-                kernel[0, i] /= sum;
+                kernel[i] /= sum;
            }

            return kernel;
@ -47,10 +45,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// <summary>
        /// Create a 1 dimensional Gaussian kernel using the Gaussian G(x) function
        /// </summary>
-        /// <returns>The <see cref="DenseMatrix{T}"/>.</returns>
-        internal static DenseMatrix<float> CreateGaussianSharpenKernel(int size, float weight)
+        /// <returns>The convolution kernel.</returns>
+        internal static float[] CreateGaussianSharpenKernel(int size, float weight)
        {
-            var kernel = new DenseMatrix<float>(size, 1);
+            var kernel = new float[size];

            float sum = 0;

@ -60,7 +58,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                float x = i - midpoint;
                float gx = Numerics.Gaussian(x, weight);
                sum += gx;
-                kernel[0, i] = gx;
+                kernel[i] = gx;
            }

            // Invert the kernel for sharpening.
@ -70,19 +68,19 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                if (i == midpointRounded)
                {
                    // Calculate central value
-                    kernel[0, i] = (2F * sum) - kernel[0, i];
+                    kernel[i] = (2F * sum) - kernel[i];
                }
                else
                {
                    // invert value
-                    kernel[0, i] = -kernel[0, i];
+                    kernel[i] = -kernel[i];
                }
            }

            // Normalize kernel so that the sum of all weights equals 1
            for (int i = 0; i < size; i++)
            {
-                kernel[0, i] /= sum;
+                kernel[i] /= sum;
            }

            return kernel;
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
@ -1,163 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Advanced;
-using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.PixelFormats;
-
-namespace SixLabors.ImageSharp.Processing.Processors.Convolution
-{
-    /// <summary>
-    /// A <see langword="struct"/> implementing the logic for 1D convolution.
-    /// </summary>
-    internal readonly struct ConvolutionRowOperation<TPixel> : IRowOperation<Vector4>
-        where TPixel : unmanaged, IPixel<TPixel>
-    {
-        private readonly Rectangle bounds;
-        private readonly Buffer2D<TPixel> targetPixels;
-        private readonly Buffer2D<TPixel> sourcePixels;
-        private readonly KernelSamplingMap map;
-        private readonly DenseMatrix<float> kernelMatrix;
-        private readonly Configuration configuration;
-        private readonly bool preserveAlpha;
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public ConvolutionRowOperation(
-            Rectangle bounds,
-            Buffer2D<TPixel> targetPixels,
-            Buffer2D<TPixel> sourcePixels,
-            KernelSamplingMap map,
-            DenseMatrix<float> kernelMatrix,
-            Configuration configuration,
-            bool preserveAlpha)
-        {
-            this.bounds = bounds;
-            this.targetPixels = targetPixels;
-            this.sourcePixels = sourcePixels;
-            this.map = map;
-            this.kernelMatrix = kernelMatrix;
-            this.configuration = configuration;
-            this.preserveAlpha = preserveAlpha;
-        }
-
-        /// <inheritdoc/>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        public void Invoke(int y, Span<Vector4> span)
-        {
-            if (this.preserveAlpha)
-            {
-                this.Convolve3(y, span);
-            }
-            else
-            {
-                this.Convolve4(y, span);
-            }
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void Convolve3(int y, Span<Vector4> span)
-        {
-            // Span is 2x bounds.
-            int boundsX = this.bounds.X;
-            int boundsWidth = this.bounds.Width;
-            Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
-            Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
-
-            var state = new ConvolutionState(in this.kernelMatrix, this.map);
-            ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
-
-            // Clear the target buffer for each row run.
-            targetBuffer.Clear();
-            ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
-
-            ReadOnlyKernel kernel = state.Kernel;
-            Span<TPixel> sourceRow;
-            for (int kY = 0; kY < kernel.Rows; kY++)
-            {
-                // Get the precalculated source sample row for this kernel row and copy to our buffer.
-                int sampleY = Unsafe.Add(ref sampleRowBase, kY);
-                sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
-                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
-
-                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
-
-                for (int x = 0; x < sourceBuffer.Length; x++)
-                {
-                    ref int sampleColumnBase = ref state.GetSampleColumn(x);
-                    ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-
-                    for (int kX = 0; kX < kernel.Columns; kX++)
-                    {
-                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
-                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
-                        target += kernel[kY, kX] * sample;
-                    }
-                }
-            }
-
-            // Now we need to copy the original alpha values from the source row.
-            sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
-            PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
-
-            for (int x = 0; x < sourceRow.Length; x++)
-            {
-                ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-                target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W;
-            }
-
-            Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
-            PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow);
-        }
-
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        private void Convolve4(int y, Span<Vector4> span)
-        {
-            // Span is 2x bounds.
-            int boundsX = this.bounds.X;
-            int boundsWidth = this.bounds.Width;
-            Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
-            Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
-
-            var state = new ConvolutionState(in this.kernelMatrix, this.map);
-            ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y);
-
-            // Clear the target buffer for each row run.
-            targetBuffer.Clear();
-            ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
-
-            ReadOnlyKernel kernel = state.Kernel;
-            for (int kY = 0; kY < kernel.Rows; kY++)
-            {
-                // Get the precalculated source sample row for this kernel row and copy to our buffer.
-                int sampleY = Unsafe.Add(ref sampleRowBase, kY);
-                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
-                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
-
-                Numerics.Premultiply(sourceBuffer);
-                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
-
-                for (int x = 0; x < sourceBuffer.Length; x++)
-                {
-                    ref int sampleColumnBase = ref state.GetSampleColumn(x);
-                    ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
-
-                    for (int kX = 0; kX < kernel.Columns; kX++)
-                    {
-                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
-                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
-                        target += kernel[kY, kX] * sample;
-                    }
-                }
-            }
-
-            Numerics.UnPremultiply(targetBuffer);
-
-            Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
-            PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRow);
-        }
-    }
-}
--- a/src/ImageSharp/Processing/Processors/Convolution/GaussianBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/GaussianBlurProcessor{TPixel}.cs
@ -27,24 +27,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            : base(configuration, source, sourceRectangle)
        {
            int kernelSize = (definition.Radius * 2) + 1;
-            this.KernelX = ConvolutionProcessorHelpers.CreateGaussianBlurKernel(kernelSize, definition.Sigma);
-            this.KernelY = this.KernelX.Transpose();
+            this.Kernel = ConvolutionProcessorHelpers.CreateGaussianBlurKernel(kernelSize, definition.Sigma);
        }

        /// <summary>
-        /// Gets the horizontal gradient operator.
+        /// Gets the 1D convolution kernel.
        /// </summary>
-        public DenseMatrix<float> KernelX { get; }
-
-        /// <summary>
-        /// Gets the vertical gradient operator.
-        /// </summary>
-        public DenseMatrix<float> KernelY { get; }
+        public float[] Kernel { get; }

        /// <inheritdoc/>
        protected override void OnFrameApply(ImageFrame<TPixel> source)
        {
-            using var processor = new Convolution2PassProcessor<TPixel>(this.Configuration, this.KernelX, this.KernelY, false, this.Source, this.SourceRectangle);
+            // The single 1D kernel drives both the horizontal and the vertical pass.
+            using var processor = new Convolution2PassProcessor<TPixel>(
+                this.Configuration,
+                this.Kernel,
+                preserveAlpha: false,
+                this.Source,
+                this.SourceRectangle);

            processor.Apply(source);
        }
--- a/src/ImageSharp/Processing/Processors/Convolution/GaussianSharpenProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/GaussianSharpenProcessor{TPixel}.cs
@ -27,24 +27,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            : base(configuration, source, sourceRectangle)
        {
            int kernelSize = (definition.Radius * 2) + 1;
-            this.KernelX = ConvolutionProcessorHelpers.CreateGaussianSharpenKernel(kernelSize, definition.Sigma);
-            this.KernelY = this.KernelX.Transpose();
+            this.Kernel = ConvolutionProcessorHelpers.CreateGaussianSharpenKernel(kernelSize, definition.Sigma);
        }

        /// <summary>
-        /// Gets the horizontal gradient operator.
+        /// Gets the 1D convolution kernel.
        /// </summary>
-        public DenseMatrix<float> KernelX { get; }
-
-        /// <summary>
-        /// Gets the vertical gradient operator.
-        /// </summary>
-        public DenseMatrix<float> KernelY { get; }
+        public float[] Kernel { get; }

        /// <inheritdoc/>
        protected override void OnFrameApply(ImageFrame<TPixel> source)
        {
-            using var processor = new Convolution2PassProcessor<TPixel>(this.Configuration, this.KernelX, this.KernelY, false, this.Source, this.SourceRectangle);
+            // The single 1D kernel drives both the horizontal and the vertical pass.
+            using var processor = new Convolution2PassProcessor<TPixel>(
+                this.Configuration,
+                this.Kernel,
+                preserveAlpha: false,
+                this.Source,
+                this.SourceRectangle);

            processor.Apply(source);
        }