Merge branch 'master' into tiff-format

5 years ago · 827a36548a
14 changed files with 801 additions and 504 deletions
--- a/src/ImageSharp/Common/Helpers/DenseMatrixUtils.cs
+++ b/src/ImageSharp/Common/Helpers/DenseMatrixUtils.cs
@ -1,279 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.PixelFormats;
-
-namespace SixLabors.ImageSharp
-{
-    /// <summary>
-    /// Extension methods for <see cref="DenseMatrix{T}"/>.
-    /// TODO: One day rewrite all this to use SIMD intrinsics. There's a lot of scope for improvement.
-    /// </summary>
-    internal static class DenseMatrixUtils
-    {
-        /// <summary>
-        /// Computes the sum of vectors in the span referenced by <paramref name="targetRowRef"/> weighted by the two kernel weight values.
-        /// Using this method the convolution filter is not applied to alpha in addition to the color channels.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="matrixY">The vertical dense matrix.</param>
-        /// <param name="matrixX">The horizontal dense matrix.</param>
-        /// <param name="sourcePixels">The source frame.</param>
-        /// <param name="targetRowRef">The target row base reference.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static void Convolve2D3<TPixel>(
-            in DenseMatrix<float> matrixY,
-            in DenseMatrix<float> matrixX,
-            Buffer2D<TPixel> sourcePixels,
-            ref Vector4 targetRowRef,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            Convolve2DImpl(
-                in matrixY,
-                in matrixX,
-                sourcePixels,
-                row,
-                column,
-                minRow,
-                maxRow,
-                minColumn,
-                maxColumn,
-                out Vector4 vector);
-
-            ref Vector4 target = ref Unsafe.Add(ref targetRowRef, column);
-            vector.W = target.W;
-
-            Numerics.UnPremultiply(ref vector);
-            target = vector;
-        }
-
-        /// <summary>
-        /// Computes the sum of vectors in the span referenced by <paramref name="targetRowRef"/> weighted by the two kernel weight values.
-        /// Using this method the convolution filter is applied to alpha in addition to the color channels.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="matrixY">The vertical dense matrix.</param>
-        /// <param name="matrixX">The horizontal dense matrix.</param>
-        /// <param name="sourcePixels">The source frame.</param>
-        /// <param name="targetRowRef">The target row base reference.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static void Convolve2D4<TPixel>(
-            in DenseMatrix<float> matrixY,
-            in DenseMatrix<float> matrixX,
-            Buffer2D<TPixel> sourcePixels,
-            ref Vector4 targetRowRef,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            Convolve2DImpl(
-                in matrixY,
-                in matrixX,
-                sourcePixels,
-                row,
-                column,
-                minRow,
-                maxRow,
-                minColumn,
-                maxColumn,
-                out Vector4 vector);
-
-            ref Vector4 target = ref Unsafe.Add(ref targetRowRef, column);
-            Numerics.UnPremultiply(ref vector);
-            target = vector;
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static void Convolve2DImpl<TPixel>(
-            in DenseMatrix<float> matrixY,
-            in DenseMatrix<float> matrixX,
-            Buffer2D<TPixel> sourcePixels,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn,
-            out Vector4 vector)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            Vector4 vectorY = default;
-            Vector4 vectorX = default;
-            int matrixHeight = matrixY.Rows;
-            int matrixWidth = matrixY.Columns;
-            int radiusY = matrixHeight >> 1;
-            int radiusX = matrixWidth >> 1;
-            int sourceOffsetColumnBase = column + minColumn;
-
-            for (int y = 0; y < matrixHeight; y++)
-            {
-                int offsetY = Numerics.Clamp(row + y - radiusY, minRow, maxRow);
-                Span<TPixel> sourceRowSpan = sourcePixels.GetRowSpan(offsetY);
-
-                for (int x = 0; x < matrixWidth; x++)
-                {
-                    int offsetX = Numerics.Clamp(sourceOffsetColumnBase + x - radiusX, minColumn, maxColumn);
-                    var currentColor = sourceRowSpan[offsetX].ToVector4();
-                    Numerics.Premultiply(ref currentColor);
-
-                    vectorX += matrixX[y, x] * currentColor;
-                    vectorY += matrixY[y, x] * currentColor;
-                }
-            }
-
-            vector = Vector4.SquareRoot((vectorX * vectorX) + (vectorY * vectorY));
-        }
-
-        /// <summary>
-        /// Computes the sum of vectors in the span referenced by <paramref name="targetRowRef"/> weighted by the kernel weight values.
-        /// Using this method the convolution filter is not applied to alpha in addition to the color channels.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="matrix">The dense matrix.</param>
-        /// <param name="sourcePixels">The source frame.</param>
-        /// <param name="targetRowRef">The target row base reference.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static void Convolve3<TPixel>(
-            in DenseMatrix<float> matrix,
-            Buffer2D<TPixel> sourcePixels,
-            ref Vector4 targetRowRef,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            Vector4 vector = default;
-
-            ConvolveImpl(
-                in matrix,
-                sourcePixels,
-                row,
-                column,
-                minRow,
-                maxRow,
-                minColumn,
-                maxColumn,
-                ref vector);
-
-            ref Vector4 target = ref Unsafe.Add(ref targetRowRef, column);
-            vector.W = target.W;
-
-            Numerics.UnPremultiply(ref vector);
-            target = vector;
-        }
-
-        /// <summary>
-        /// Computes the sum of vectors in the span referenced by <paramref name="targetRowRef"/> weighted by the kernel weight values.
-        /// Using this method the convolution filter is applied to alpha in addition to the color channels.
-        /// </summary>
-        /// <typeparam name="TPixel">The pixel format.</typeparam>
-        /// <param name="matrix">The dense matrix.</param>
-        /// <param name="sourcePixels">The source frame.</param>
-        /// <param name="targetRowRef">The target row base reference.</param>
-        /// <param name="row">The current row.</param>
-        /// <param name="column">The current column.</param>
-        /// <param name="minRow">The minimum working area row.</param>
-        /// <param name="maxRow">The maximum working area row.</param>
-        /// <param name="minColumn">The minimum working area column.</param>
-        /// <param name="maxColumn">The maximum working area column.</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
-        public static void Convolve4<TPixel>(
-            in DenseMatrix<float> matrix,
-            Buffer2D<TPixel> sourcePixels,
-            ref Vector4 targetRowRef,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            Vector4 vector = default;
-
-            ConvolveImpl(
-                in matrix,
-                sourcePixels,
-                row,
-                column,
-                minRow,
-                maxRow,
-                minColumn,
-                maxColumn,
-                ref vector);
-
-            ref Vector4 target = ref Unsafe.Add(ref targetRowRef, column);
-            Numerics.UnPremultiply(ref vector);
-            target = vector;
-        }
-
-        [MethodImpl(InliningOptions.ShortMethod)]
-        private static void ConvolveImpl<TPixel>(
-            in DenseMatrix<float> matrix,
-            Buffer2D<TPixel> sourcePixels,
-            int row,
-            int column,
-            int minRow,
-            int maxRow,
-            int minColumn,
-            int maxColumn,
-            ref Vector4 vector)
-            where TPixel : unmanaged, IPixel<TPixel>
-        {
-            int matrixHeight = matrix.Rows;
-            int matrixWidth = matrix.Columns;
-            int radiusY = matrixHeight >> 1;
-            int radiusX = matrixWidth >> 1;
-            int sourceOffsetColumnBase = column + minColumn;
-
-            for (int y = 0; y < matrixHeight; y++)
-            {
-                int offsetY = Numerics.Clamp(row + y - radiusY, minRow, maxRow);
-                Span<TPixel> sourceRowSpan = sourcePixels.GetRowSpan(offsetY);
-
-                for (int x = 0; x < matrixWidth; x++)
-                {
-                    int offsetX = Numerics.Clamp(sourceOffsetColumnBase + x - radiusX, minColumn, maxColumn);
-                    var currentColor = sourceRowSpan[offsetX].ToVector4();
-                    Numerics.Premultiply(ref currentColor);
-                    vector += matrix[y, x] * currentColor;
-                }
-            }
-        }
-    }
-}
--- a/src/ImageSharp/Memory/Allocators/ArrayPoolMemoryAllocator.Buffer{T}.cs
+++ b/src/ImageSharp/Memory/Allocators/ArrayPoolMemoryAllocator.Buffer{T}.cs
@ -53,8 +53,13 @@ namespace SixLabors.ImageSharp.Memory
                {
                    ThrowObjectDisposedException();
                }
-
+#if SUPPORTS_CREATESPAN
+                ref byte r0 = ref MemoryMarshal.GetReference<byte>(this.Data);
+                return MemoryMarshal.CreateSpan(ref Unsafe.As<byte, T>(ref r0), this.length);
+#else
                return MemoryMarshal.Cast<byte, T>(this.Data.AsSpan()).Slice(0, this.length);
+#endif
+
            }

            /// <inheritdoc />
--- a/src/ImageSharp/Primitives/DenseMatrix{T}.cs
+++ b/src/ImageSharp/Primitives/DenseMatrix{T}.cs
@ -109,7 +109,7 @@ namespace SixLabors.ImageSharp
        /// <returns>The <see typeparam="T"/> at the specified position.</returns>
        public ref T this[int row, int column]
        {
-            [MethodImpl(InliningOptions.ShortMethod)]
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            get
            {
                this.CheckCoordinates(row, column);
@ -124,7 +124,7 @@ namespace SixLabors.ImageSharp
        /// <returns>
        /// The <see cref="DenseMatrix{T}"/> representation on the source data.
        /// </returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static implicit operator DenseMatrix<T>(T[,] data) => new DenseMatrix<T>(data);

        /// <summary>
@ -134,7 +134,7 @@ namespace SixLabors.ImageSharp
        /// <returns>
        /// The <see cref="T:T[,]"/> representation on the source data.
        /// </returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
 #pragma warning disable SA1008 // Opening parenthesis should be spaced correctly
        public static implicit operator T[,](in DenseMatrix<T> data)
 #pragma warning restore SA1008 // Opening parenthesis should be spaced correctly
@ -175,7 +175,7 @@ namespace SixLabors.ImageSharp
        /// Transposes the rows and columns of the dense matrix.
        /// </summary>
        /// <returns>The <see cref="DenseMatrix{T}"/>.</returns>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public DenseMatrix<T> Transpose()
        {
            var result = new DenseMatrix<T>(this.Rows, this.Columns);
@ -196,13 +196,13 @@ namespace SixLabors.ImageSharp
        /// Fills the matrix with the given value
        /// </summary>
        /// <param name="value">The value to fill each item with</param>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void Fill(T value) => this.Span.Fill(value);

        /// <summary>
        /// Clears the matrix setting each value to the default value for the element type
        /// </summary>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public void Clear() => this.Span.Clear();

        /// <summary>
@ -232,14 +232,14 @@ namespace SixLabors.ImageSharp
            => obj is DenseMatrix<T> other && this.Equals(other);

        /// <inheritdoc/>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public bool Equals(DenseMatrix<T> other) =>
            this.Columns == other.Columns
            && this.Rows == other.Rows
            && this.Span.SequenceEqual(other.Span);

        /// <inheritdoc/>
-        [MethodImpl(InliningOptions.ShortMethod)]
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public override int GetHashCode()
        {
            HashCode code = default;
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DProcessor{TPixel}.cs
@ -1,10 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Apache License, Version 2.0.

-using System;
 using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
@ -43,12 +40,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        }

        /// <summary>
-        /// Gets the horizontal gradient operator.
+        /// Gets the horizontal convolution kernel.
        /// </summary>
        public DenseMatrix<float> KernelX { get; }

        /// <summary>
-        /// Gets the vertical gradient operator.
+        /// Gets the vertical convolution kernel.
        /// </summary>
        public DenseMatrix<float> KernelY { get; }

@ -60,102 +57,39 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// <inheritdoc/>
        protected override void OnFrameApply(ImageFrame<TPixel> source)
        {
-            using Buffer2D<TPixel> targetPixels = this.Configuration.MemoryAllocator.Allocate2D<TPixel>(source.Width, source.Height);
+            MemoryAllocator allocator = this.Configuration.MemoryAllocator;
+            using Buffer2D<TPixel> targetPixels = allocator.Allocate2D<TPixel>(source.Width, source.Height);

            source.CopyTo(targetPixels);

            var interest = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
-            var operation = new RowOperation(interest, targetPixels, source.PixelBuffer, this.KernelY, this.KernelX, this.Configuration, this.PreserveAlpha);

-            ParallelRowIterator.IterateRows<RowOperation, Vector4>(
-                this.Configuration,
-                interest,
-                in operation);
+            // We use a rectangle 3x the interest width to allocate a buffer big enough
+            // for source and target bulk pixel conversion.
+            var operationBounds = new Rectangle(interest.X, interest.Y, interest.Width * 3, interest.Height);

-            Buffer2D<TPixel>.SwapOrCopyContent(source.PixelBuffer, targetPixels);
-        }
-
-        /// <summary>
-        /// A <see langword="struct"/> implementing the convolution logic for <see cref="Convolution2DProcessor{T}"/>.
-        /// </summary>
-        private readonly struct RowOperation : IRowOperation<Vector4>
-        {
-            private readonly Rectangle bounds;
-            private readonly int maxY;
-            private readonly int maxX;
-            private readonly Buffer2D<TPixel> targetPixels;
-            private readonly Buffer2D<TPixel> sourcePixels;
-            private readonly DenseMatrix<float> kernelY;
-            private readonly DenseMatrix<float> kernelX;
-            private readonly Configuration configuration;
-            private readonly bool preserveAlpha;
-
-            [MethodImpl(InliningOptions.ShortMethod)]
-            public RowOperation(
-                Rectangle bounds,
-                Buffer2D<TPixel> targetPixels,
-                Buffer2D<TPixel> sourcePixels,
-                DenseMatrix<float> kernelY,
-                DenseMatrix<float> kernelX,
-                Configuration configuration,
-                bool preserveAlpha)
-            {
-                this.bounds = bounds;
-                this.maxY = this.bounds.Bottom - 1;
-                this.maxX = this.bounds.Right - 1;
-                this.targetPixels = targetPixels;
-                this.sourcePixels = sourcePixels;
-                this.kernelY = kernelY;
-                this.kernelX = kernelX;
-                this.configuration = configuration;
-                this.preserveAlpha = preserveAlpha;
-            }
-
-            /// <inheritdoc/>
-            [MethodImpl(InliningOptions.ShortMethod)]
-            public void Invoke(int y, Span<Vector4> span)
+            using (var map = new KernelSamplingMap(allocator))
            {
-                ref Vector4 spanRef = ref MemoryMarshal.GetReference(span);
-                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
-                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span);
+                // Since the kernel sizes are identical we can use a single map.
+                map.BuildSamplingOffsetMap(this.KernelY, interest);

-                if (this.preserveAlpha)
-                {
-                    for (int x = 0; x < this.bounds.Width; x++)
-                    {
-                        DenseMatrixUtils.Convolve2D3(
-                            in this.kernelY,
-                            in this.kernelX,
-                            this.sourcePixels,
-                            ref spanRef,
-                            y,
-                            x,
-                            this.bounds.Y,
-                            this.maxY,
-                            this.bounds.X,
-                            this.maxX);
-                    }
-                }
-                else
-                {
-                    for (int x = 0; x < this.bounds.Width; x++)
-                    {
-                        DenseMatrixUtils.Convolve2D4(
-                            in this.kernelY,
-                            in this.kernelX,
-                            this.sourcePixels,
-                            ref spanRef,
-                            y,
-                            x,
-                            this.bounds.Y,
-                            this.maxY,
-                            this.bounds.X,
-                            this.maxX);
-                    }
-                }
+                var operation = new Convolution2DRowOperation<TPixel>(
+                    interest,
+                    targetPixels,
+                    source.PixelBuffer,
+                    map,
+                    this.KernelY,
+                    this.KernelX,
+                    this.Configuration,
+                    this.PreserveAlpha);

-                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+                ParallelRowIterator.IterateRows<Convolution2DRowOperation<TPixel>, Vector4>(
+                    this.Configuration,
+                    operationBounds,
+                    in operation);
            }
+
+            Buffer2D<TPixel>.SwapOrCopyContent(source.PixelBuffer, targetPixels);
        }
    }
 }
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs
@ -0,0 +1,193 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Processing.Processors.Convolution
+{
+    /// <summary>
+    /// A <see langword="struct"/> implementing the logic for 2D convolution: each invoked row is convolved with both kernels and the two results are combined per channel as <c>sqrt(x*x + y*y)</c>.
+    /// </summary>
+    internal readonly struct Convolution2DRowOperation<TPixel> : IRowOperation<Vector4>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        private readonly Rectangle bounds; // Region to process: X/Width slice every row, Y rebases the sample row lookup.
+        private readonly Buffer2D<TPixel> targetPixels; // Destination; row y is written at the end of Convolve3/Convolve4.
+        private readonly Buffer2D<TPixel> sourcePixels; // Input; rows are only read from it in this struct.
+        private readonly KernelSamplingMap map; // Precalculated sampling offsets, accessed through Convolution2DState.
+        private readonly DenseMatrix<float> kernelMatrixY;
+        private readonly DenseMatrix<float> kernelMatrixX;
+        private readonly Configuration configuration; // Forwarded to the bulk PixelOperations conversions.
+        private readonly bool preserveAlpha; // true => Convolve3 (alpha copied from the source row), false => Convolve4 (alpha convolved too).
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public Convolution2DRowOperation(
+            Rectangle bounds,
+            Buffer2D<TPixel> targetPixels,
+            Buffer2D<TPixel> sourcePixels,
+            KernelSamplingMap map,
+            DenseMatrix<float> kernelMatrixY,
+            DenseMatrix<float> kernelMatrixX,
+            Configuration configuration,
+            bool preserveAlpha)
+        {
+            this.bounds = bounds;
+            this.targetPixels = targetPixels;
+            this.sourcePixels = sourcePixels;
+            this.map = map;
+            this.kernelMatrixY = kernelMatrixY;
+            this.kernelMatrixX = kernelMatrixX;
+            this.configuration = configuration;
+            this.preserveAlpha = preserveAlpha;
+        }
+
+        /// <inheritdoc/>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void Invoke(int y, Span<Vector4> span)
+        {
+            if (this.preserveAlpha)
+            {
+                this.Convolve3(y, span); // Color channels only; alpha is restored from the source row.
+            }
+            else
+            {
+                this.Convolve4(y, span); // All four channels, using premultiplied samples.
+            }
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void Convolve3(int y, Span<Vector4> span)
+        {
+            // Span is 3x bounds: [0, w) source scratch, [w, 2w) first accumulator, [2w, 3w) second accumulator (w = bounds width).
+            int boundsX = this.bounds.X;
+            int boundsWidth = this.bounds.Width;
+            Span<Vector4> sourceBuffer = span.Slice(0, boundsWidth);
+            Span<Vector4> targetYBuffer = span.Slice(boundsWidth, boundsWidth);
+            Span<Vector4> targetXBuffer = span.Slice(boundsWidth * 2, boundsWidth);
+
+            var state = new Convolution2DState(in this.kernelMatrixY, in this.kernelMatrixX, this.map);
+            ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // The row lookup is relative to the top of the bounds.
+
+            // Clear the target buffers for each row run, because the kernel loops below accumulate into them with +=.
+            targetYBuffer.Clear();
+            targetXBuffer.Clear();
+            ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer);
+            ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer);
+
+            ReadOnlyKernel kernelY = state.KernelY;
+            ReadOnlyKernel kernelX = state.KernelX;
+            Span<TPixel> sourceRow;
+            for (int kY = 0; kY < kernelY.Rows; kY++)
+            {
+                // Get the precalculated source sample row for this kernel row and convert it to Vector4 in our buffer.
+                int sampleY = Unsafe.Add(ref sampleRowBase, kY);
+                sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+
+                for (int x = 0; x < sourceBuffer.Length; x++)
+                {
+                    ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                    ref Vector4 targetY = ref Unsafe.Add(ref targetBaseY, x);
+                    ref Vector4 targetX = ref Unsafe.Add(ref targetBaseX, x);
+
+                    for (int kX = 0; kX < kernelY.Columns; kX++)
+                    {
+                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; // Subtract boundsX to index sourceBuffer, whose first element is column boundsX.
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                        targetY += kernelX[kY, kX] * sample; // NOTE(review): the Y accumulator is fed by kernelX (and X by kernelY below);
+                        targetX += kernelY[kY, kX] * sample; // harmless, since only the symmetric sqrt(x*x + y*y) of the pair is kept.
+                    }
+                }
+            }
+
+            // Now we need to combine the values and copy the original alpha values
+            // from the source row (row y is re-read into sourceBuffer, which is free again after the kernel loop).
+            sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+            PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+            for (int x = 0; x < sourceRow.Length; x++) // sourceRow.Length equals boundsWidth (length of the slice above).
+            {
+                ref Vector4 target = ref Unsafe.Add(ref targetBaseY, x);
+                Vector4 vectorY = target;
+                Vector4 vectorX = Unsafe.Add(ref targetBaseX, x);
+
+                target = Vector4.SquareRoot((vectorX * vectorX) + (vectorY * vectorY)); // Result is stored in place in targetYBuffer.
+                target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W; // Overwrite the computed alpha with the source alpha.
+            }
+
+            Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+            PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetYBuffer, targetRowSpan);
+        }
+
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void Convolve4(int y, Span<Vector4> span)
+        {
+            // Span is 3x bounds: [0, w) source scratch, [w, 2w) first accumulator, [2w, 3w) second accumulator (w = bounds width).
+            int boundsX = this.bounds.X;
+            int boundsWidth = this.bounds.Width;
+            Span<Vector4> sourceBuffer = span.Slice(0, boundsWidth);
+            Span<Vector4> targetYBuffer = span.Slice(boundsWidth, boundsWidth);
+            Span<Vector4> targetXBuffer = span.Slice(boundsWidth * 2, boundsWidth);
+
+            var state = new Convolution2DState(in this.kernelMatrixY, in this.kernelMatrixX, this.map);
+            ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // The row lookup is relative to the top of the bounds.
+
+            // Clear the target buffers for each row run, because the kernel loops below accumulate into them with +=.
+            targetYBuffer.Clear();
+            targetXBuffer.Clear();
+            ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer);
+            ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer);
+
+            ReadOnlyKernel kernelY = state.KernelY;
+            ReadOnlyKernel kernelX = state.KernelX;
+            for (int kY = 0; kY < kernelY.Rows; kY++)
+            {
+                // Get the precalculated source sample row for this kernel row and convert it to Vector4 in our buffer.
+                int sampleY = Unsafe.Add(ref sampleRowBase, kY);
+                Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth);
+                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                Numerics.Premultiply(sourceBuffer); // Samples are premultiplied before accumulation; undone by UnPremultiply after the combine step.
+                ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+
+                for (int x = 0; x < sourceBuffer.Length; x++)
+                {
+                    ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                    ref Vector4 targetY = ref Unsafe.Add(ref targetBaseY, x);
+                    ref Vector4 targetX = ref Unsafe.Add(ref targetBaseX, x);
+
+                    for (int kX = 0; kX < kernelY.Columns; kX++)
+                    {
+                        int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; // Subtract boundsX to index sourceBuffer, whose first element is column boundsX.
+                        Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+                        targetY += kernelX[kY, kX] * sample; // NOTE(review): the Y accumulator is fed by kernelX (and X by kernelY below);
+                        targetX += kernelY[kY, kX] * sample; // harmless, since only the symmetric sqrt(x*x + y*y) of the pair is kept.
+                    }
+                }
+            }
+
+            // Now we need to combine the values; unlike Convolve3, the W component is combined like the other channels.
+            for (int x = 0; x < targetYBuffer.Length; x++)
+            {
+                ref Vector4 target = ref Unsafe.Add(ref targetBaseY, x);
+                Vector4 vectorY = target;
+                Vector4 vectorX = Unsafe.Add(ref targetBaseX, x);
+
+                target = Vector4.SquareRoot((vectorX * vectorX) + (vectorY * vectorY)); // Result is stored in place in targetYBuffer.
+            }
+
+            Numerics.UnPremultiply(targetYBuffer);
+
+            Span<TPixel> targetRow = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+            PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetYBuffer, targetRow);
+        }
+    }
+}
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs
@ -0,0 +1,54 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp.Processing.Processors.Convolution
+{
+    /// <summary>
+    /// A stack only struct that bundles both kernels of a 2D convolution together with the
+    /// precalculated sampling offsets, so the inner loops can avoid repeated reference indirection.
+    /// </summary>
+    internal readonly ref struct Convolution2DState
+    {
+        private readonly Span<int> rowOffsets;
+        private readonly Span<int> columnOffsets;
+        private readonly int rowStride;
+        private readonly int columnStride;
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="Convolution2DState"/> struct.
+        /// </summary>
+        /// <param name="kernelY">The vertical kernel.</param>
+        /// <param name="kernelX">The horizontal kernel.</param>
+        /// <param name="map">The map supplying the row and column sampling offsets.</param>
+        public Convolution2DState(
+            in DenseMatrix<float> kernelY,
+            in DenseMatrix<float> kernelX,
+            KernelSamplingMap map)
+        {
+            this.KernelY = new ReadOnlyKernel(kernelY);
+            this.KernelX = new ReadOnlyKernel(kernelX);
+
+            // The kernels are checked to be the same size upstream, so the
+            // strides into the offset maps are taken from the vertical kernel only.
+            this.rowStride = kernelY.Rows;
+            this.columnStride = kernelY.Columns;
+
+            this.rowOffsets = map.GetRowOffsetSpan();
+            this.columnOffsets = map.GetColumnOffsetSpan();
+        }
+
+        /// <summary>
+        /// Gets the vertical kernel.
+        /// </summary>
+        public ReadOnlyKernel KernelY
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get;
+        }
+
+        /// <summary>
+        /// Gets the horizontal kernel.
+        /// </summary>
+        public ReadOnlyKernel KernelX
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get;
+        }
+
+        /// <summary>
+        /// Gets a reference to the first row sampling offset stored for the given row.
+        /// No bounds checking is performed.
+        /// </summary>
+        /// <param name="row">The row index; callers in this file pass it relative to the top of the processed bounds.</param>
+        /// <returns>A reference to the element at <c>row * kernel height</c> in the row offset map.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ref int GetSampleRow(int row)
+        {
+            ref int origin = ref MemoryMarshal.GetReference(this.rowOffsets);
+            return ref Unsafe.Add(ref origin, row * this.rowStride);
+        }
+
+        /// <summary>
+        /// Gets a reference to the first column sampling offset stored for the given column.
+        /// No bounds checking is performed.
+        /// </summary>
+        /// <param name="column">The column index; callers in this file pass it relative to the left of the processed bounds.</param>
+        /// <returns>A reference to the element at <c>column * kernel width</c> in the column offset map.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ref int GetSampleColumn(int column)
+        {
+            ref int origin = ref MemoryMarshal.GetReference(this.columnOffsets);
+            return ref Unsafe.Add(ref origin, column * this.columnStride);
+        }
+    }
+}
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
@ -42,12 +42,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        }

        /// <summary>
-        /// Gets the horizontal gradient operator.
+        /// Gets the horizontal convolution kernel.
        /// </summary>
        public DenseMatrix<float> KernelX { get; }

        /// <summary>
-        /// Gets the vertical gradient operator.
+        /// Gets the vertical convolution kernel.
        /// </summary>
        public DenseMatrix<float> KernelY { get; }

@ -63,96 +63,48 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution

            var interest = Rectangle.Intersect(this.SourceRectangle, source.Bounds());

-            // Horizontal convolution
-            var horizontalOperation = new RowOperation(interest, firstPassPixels, source.PixelBuffer, this.KernelX, this.Configuration, this.PreserveAlpha);
-            ParallelRowIterator.IterateRows<RowOperation, Vector4>(
-                this.Configuration,
-                interest,
-                in horizontalOperation);
+            // We use a rectangle 2x the interest width to allocate a buffer big enough
+            // for source and target bulk pixel conversion.
+            var operationBounds = new Rectangle(interest.X, interest.Y, interest.Width * 2, interest.Height);

-            // Vertical convolution
-            var verticalOperation = new RowOperation(interest, source.PixelBuffer, firstPassPixels, this.KernelY, this.Configuration, this.PreserveAlpha);
-            ParallelRowIterator.IterateRows<RowOperation, Vector4>(
-                this.Configuration,
-                interest,
-                in verticalOperation);
-        }
-
-        /// <summary>
-        /// A <see langword="struct"/> implementing the convolution logic for <see cref="Convolution2PassProcessor{T}"/>.
-        /// </summary>
-        private readonly struct RowOperation : IRowOperation<Vector4>
-        {
-            private readonly Rectangle bounds;
-            private readonly Buffer2D<TPixel> targetPixels;
-            private readonly Buffer2D<TPixel> sourcePixels;
-            private readonly DenseMatrix<float> kernel;
-            private readonly Configuration configuration;
-            private readonly bool preserveAlpha;
-
-            [MethodImpl(InliningOptions.ShortMethod)]
-            public RowOperation(
-                Rectangle bounds,
-                Buffer2D<TPixel> targetPixels,
-                Buffer2D<TPixel> sourcePixels,
-                DenseMatrix<float> kernel,
-                Configuration configuration,
-                bool preserveAlpha)
+            using (var mapX = new KernelSamplingMap(this.Configuration.MemoryAllocator))
            {
-                this.bounds = bounds;
-                this.targetPixels = targetPixels;
-                this.sourcePixels = sourcePixels;
-                this.kernel = kernel;
-                this.configuration = configuration;
-                this.preserveAlpha = preserveAlpha;
+                mapX.BuildSamplingOffsetMap(this.KernelX, interest);
+
+                // Horizontal convolution
+                var horizontalOperation = new ConvolutionRowOperation<TPixel>(
+                    interest,
+                    firstPassPixels,
+                    source.PixelBuffer,
+                    mapX,
+                    this.KernelX,
+                    this.Configuration,
+                    this.PreserveAlpha);
+
+                ParallelRowIterator.IterateRows<ConvolutionRowOperation<TPixel>, Vector4>(
+                    this.Configuration,
+                    operationBounds,
+                    in horizontalOperation);
            }

-            /// <inheritdoc/>
-            [MethodImpl(InliningOptions.ShortMethod)]
-            public void Invoke(int y, Span<Vector4> span)
+            using (var mapY = new KernelSamplingMap(this.Configuration.MemoryAllocator))
            {
-                ref Vector4 spanRef = ref MemoryMarshal.GetReference(span);
-
-                int maxY = this.bounds.Bottom - 1;
-                int maxX = this.bounds.Right - 1;
-
-                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
-                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span);
-
-                if (this.preserveAlpha)
-                {
-                    for (int x = 0; x < this.bounds.Width; x++)
-                    {
-                        DenseMatrixUtils.Convolve3(
-                            in this.kernel,
-                            this.sourcePixels,
-                            ref spanRef,
-                            y,
-                            x,
-                            this.bounds.Y,
-                            maxY,
-                            this.bounds.X,
-                            maxX);
-                    }
-                }
-                else
-                {
-                    for (int x = 0; x < this.bounds.Width; x++)
-                    {
-                        DenseMatrixUtils.Convolve4(
-                            in this.kernel,
-                            this.sourcePixels,
-                            ref spanRef,
-                            y,
-                            x,
-                            this.bounds.Y,
-                            maxY,
-                            this.bounds.X,
-                            maxX);
-                    }
-                }
-
-                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+                mapY.BuildSamplingOffsetMap(this.KernelY, interest);
+
+                // Vertical convolution
+                var verticalOperation = new ConvolutionRowOperation<TPixel>(
+                    interest,
+                    source.PixelBuffer,
+                    firstPassPixels,
+                    mapY,
+                    this.KernelY,
+                    this.Configuration,
+                    this.PreserveAlpha);
+
+                ParallelRowIterator.IterateRows<ConvolutionRowOperation<TPixel>, Vector4>(
+                    this.Configuration,
+                    operationBounds,
+                    in verticalOperation);
            }
        }
    }
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs
@ -39,7 +39,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        }

        /// <summary>
-        /// Gets the 2d gradient operator.
+        /// Gets the 2d convolution kernel.
        /// </summary>
        public DenseMatrix<float> KernelXY { get; }

@ -51,16 +51,26 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        /// <inheritdoc/>
        protected override void OnFrameApply(ImageFrame<TPixel> source)
        {
-            using Buffer2D<TPixel> targetPixels = this.Configuration.MemoryAllocator.Allocate2D<TPixel>(source.Size());
+            MemoryAllocator allocator = this.Configuration.MemoryAllocator;
+            using Buffer2D<TPixel> targetPixels = allocator.Allocate2D<TPixel>(source.Size());

            source.CopyTo(targetPixels);

            var interest = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
-            var operation = new RowOperation(interest, targetPixels, source.PixelBuffer, this.KernelXY, this.Configuration, this.PreserveAlpha);
-            ParallelRowIterator.IterateRows<RowOperation, Vector4>(
-               this.Configuration,
-               interest,
-               in operation);
+
+            // We use a rectangle 2x the interest width to allocate a buffer big enough
+            // for source and target bulk pixel conversion.
+            var operationBounds = new Rectangle(interest.X, interest.Y, interest.Width * 2, interest.Height);
+            using (var map = new KernelSamplingMap(allocator))
+            {
+                map.BuildSamplingOffsetMap(this.KernelXY, interest);
+
+                var operation = new RowOperation(interest, targetPixels, source.PixelBuffer, map, this.KernelXY, this.Configuration, this.PreserveAlpha);
+                ParallelRowIterator.IterateRows<RowOperation, Vector4>(
+                   this.Configuration,
+                   operationBounds,
+                   in operation);
+            }

            Buffer2D<TPixel>.SwapOrCopyContent(source.PixelBuffer, targetPixels);
        }
@ -71,10 +81,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
        private readonly struct RowOperation : IRowOperation<Vector4>
        {
            private readonly Rectangle bounds;
-            private readonly int maxY;
-            private readonly int maxX;
            private readonly Buffer2D<TPixel> targetPixels;
            private readonly Buffer2D<TPixel> sourcePixels;
+            private readonly KernelSamplingMap map;
            private readonly DenseMatrix<float> kernel;
            private readonly Configuration configuration;
            private readonly bool preserveAlpha;
@ -84,15 +93,15 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
                Rectangle bounds,
                Buffer2D<TPixel> targetPixels,
                Buffer2D<TPixel> sourcePixels,
+                KernelSamplingMap map,
                DenseMatrix<float> kernel,
                Configuration configuration,
                bool preserveAlpha)
            {
                this.bounds = bounds;
-                this.maxY = this.bounds.Bottom - 1;
-                this.maxX = this.bounds.Right - 1;
                this.targetPixels = targetPixels;
                this.sourcePixels = sourcePixels;
+                this.map = map;
                this.kernel = kernel;
                this.configuration = configuration;
                this.preserveAlpha = preserveAlpha;
@ -102,45 +111,93 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
            [MethodImpl(InliningOptions.ShortMethod)]
            public void Invoke(int y, Span<Vector4> span)
            {
-                ref Vector4 spanRef = ref MemoryMarshal.GetReference(span);
+                // Span is 2x bounds.
+                int boundsX = this.bounds.X;
+                int boundsWidth = this.bounds.Width;
+                Span<Vector4> sourceBuffer = span.Slice(0, this.bounds.Width);
+                Span<Vector4> targetBuffer = span.Slice(this.bounds.Width);
+
+                ref Vector4 targetRowRef = ref MemoryMarshal.GetReference(span);
+                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth);

-                Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
-                PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span);
+                var state = new ConvolutionState(in this.kernel, this.map);
+                int row = y - this.bounds.Y;
+                ref int sampleRowBase = ref state.GetSampleRow(row);

                if (this.preserveAlpha)
                {
-                    for (int x = 0; x < this.bounds.Width; x++)
+                    // Clear the target buffer for each row run.
+                    targetBuffer.Clear();
+                    ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+
+                    Span<TPixel> sourceRow;
+                    for (int kY = 0; kY < state.Kernel.Rows; kY++)
+                    {
+                        // Get the precalculated source sample row for this kernel row and copy to our buffer.
+                        int offsetY = Unsafe.Add(ref sampleRowBase, kY);
+                        sourceRow = this.sourcePixels.GetRowSpan(offsetY).Slice(boundsX, boundsWidth);
+                        PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                        ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+
+                        for (int x = 0; x < sourceBuffer.Length; x++)
+                        {
+                            ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                            ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
+
+                            for (int kX = 0; kX < state.Kernel.Columns; kX++)
+                            {
+                                int offsetX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+                                Vector4 sample = Unsafe.Add(ref sourceBase, offsetX);
+                                target += state.Kernel[kY, kX] * sample;
+                            }
+                        }
+                    }
+
+                    // Now we need to copy the original alpha values from the source row.
+                    sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+                    PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                    for (int x = 0; x < sourceRow.Length; x++)
                    {
-                        DenseMatrixUtils.Convolve3(
-                            in this.kernel,
-                            this.sourcePixels,
-                            ref spanRef,
-                            y,
-                            x,
-                            this.bounds.Y,
-                            this.maxY,
-                            this.bounds.X,
-                            this.maxX);
+                        ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
+                        target.W = Unsafe.Add(ref MemoryMarshal.GetReference(sourceBuffer), x).W;
                    }
                }
                else
                {
-                    for (int x = 0; x < this.bounds.Width; x++)
+                    // Clear the target buffer for each row run.
+                    targetBuffer.Clear();
+                    ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+
+                    for (int kY = 0; kY < state.Kernel.Rows; kY++)
                    {
-                        DenseMatrixUtils.Convolve4(
-                            in this.kernel,
-                            this.sourcePixels,
-                            ref spanRef,
-                            y,
-                            x,
-                            this.bounds.Y,
-                            this.maxY,
-                            this.bounds.X,
-                            this.maxX);
+                        // Get the precalculated source sample row for this kernel row and copy to our buffer.
+                        int offsetY = Unsafe.Add(ref sampleRowBase, kY);
+                        Span<TPixel> sourceRow = this.sourcePixels.GetRowSpan(offsetY).Slice(boundsX, boundsWidth);
+                        PixelOperations<TPixel>.Instance.ToVector4(this.configuration, sourceRow, sourceBuffer);
+
+                        Numerics.Premultiply(sourceBuffer);
+                        ref Vector4 sourceBase = ref MemoryMarshal.GetReference(sourceBuffer);
+
+                        for (int x = 0; x < sourceBuffer.Length; x++)
+                        {
+                            ref int sampleColumnBase = ref state.GetSampleColumn(x);
+                            ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
+
+                            for (int kX = 0; kX < state.Kernel.Columns; kX++)
+                            {
+                                int offsetX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+                                Vector4 sample = Unsafe.Add(ref sourceBase, offsetX);
+                                target += state.Kernel[kY, kX] * sample;
+                            }
+                        }
                    }
+
+                    Numerics.UnPremultiply(targetBuffer);
                }

-                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+                PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, targetBuffer, targetRowSpan);
            }
        }
    }
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs
@ -0,0 +1,163 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Advanced;
+using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.PixelFormats;
+
+namespace SixLabors.ImageSharp.Processing.Processors.Convolution
+{
+    /// <summary>
+    /// A <see langword="struct"/> that convolves one row of pixels with a single kernel,
+    /// reading its sample positions from a precalculated <see cref="KernelSamplingMap"/>.
+    /// </summary>
+    /// <typeparam name="TPixel">The pixel format.</typeparam>
+    internal readonly struct ConvolutionRowOperation<TPixel> : IRowOperation<Vector4>
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        private readonly Rectangle bounds;
+        private readonly Buffer2D<TPixel> targetPixels;
+        private readonly Buffer2D<TPixel> sourcePixels;
+        private readonly KernelSamplingMap map;
+        private readonly DenseMatrix<float> kernelMatrix;
+        private readonly Configuration configuration;
+        private readonly bool preserveAlpha;
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="ConvolutionRowOperation{TPixel}"/> struct.
+        /// </summary>
+        /// <param name="bounds">The rectangle of pixels to process.</param>
+        /// <param name="targetPixels">The buffer the convolved rows are written to.</param>
+        /// <param name="sourcePixels">The buffer the samples are read from.</param>
+        /// <param name="map">The sampling offsets built for <paramref name="kernelMatrix"/>.</param>
+        /// <param name="kernelMatrix">The convolution kernel.</param>
+        /// <param name="configuration">The configuration used for bulk pixel conversion.</param>
+        /// <param name="preserveAlpha">Whether the alpha of the row being processed is kept as-is instead of being convolved.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ConvolutionRowOperation(
+            Rectangle bounds,
+            Buffer2D<TPixel> targetPixels,
+            Buffer2D<TPixel> sourcePixels,
+            KernelSamplingMap map,
+            DenseMatrix<float> kernelMatrix,
+            Configuration configuration,
+            bool preserveAlpha)
+        {
+            this.bounds = bounds;
+            this.targetPixels = targetPixels;
+            this.sourcePixels = sourcePixels;
+            this.map = map;
+            this.kernelMatrix = kernelMatrix;
+            this.configuration = configuration;
+            this.preserveAlpha = preserveAlpha;
+        }
+
+        /// <inheritdoc/>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public void Invoke(int y, Span<Vector4> span)
+        {
+            if (!this.preserveAlpha)
+            {
+                this.Convolve4(y, span);
+                return;
+            }
+
+            this.Convolve3(y, span);
+        }
+
+        /// <summary>
+        /// Convolves row <paramref name="y"/> and then restores the original alpha of that row.
+        /// </summary>
+        /// <param name="y">The row to process.</param>
+        /// <param name="span">The working buffer; expected to be twice the bounds width.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void Convolve3(int y, Span<Vector4> span)
+        {
+            // The first bounds-width elements hold converted source samples,
+            // everything after that accumulates the weighted sums.
+            int left = this.bounds.X;
+            int width = this.bounds.Width;
+            Span<Vector4> samples = span.Slice(0, width);
+            Span<Vector4> accumulator = span.Slice(width);
+
+            var state = new ConvolutionState(in this.kernelMatrix, this.map);
+            ref int rowOffsets = ref state.GetSampleRow(y - this.bounds.Y);
+
+            // The working buffer is cleared before accumulating each row.
+            accumulator.Clear();
+            ref Vector4 accumulatorBase = ref MemoryMarshal.GetReference(accumulator);
+            ref Vector4 samplesBase = ref MemoryMarshal.GetReference(samples);
+
+            PixelOperations<TPixel> operations = PixelOperations<TPixel>.Instance;
+            ReadOnlyKernel kernel = state.Kernel;
+            int kernelRows = kernel.Rows;
+            int kernelColumns = kernel.Columns;
+
+            for (int kY = 0; kY < kernelRows; kY++)
+            {
+                // Look up the precalculated source row for this kernel row and convert it in bulk.
+                int sampleY = Unsafe.Add(ref rowOffsets, kY);
+                Span<TPixel> sampledRow = this.sourcePixels.GetRowSpan(sampleY).Slice(left, width);
+                operations.ToVector4(this.configuration, sampledRow, samples);
+
+                for (int x = 0; x < width; x++)
+                {
+                    ref int columnOffsets = ref state.GetSampleColumn(x);
+                    ref Vector4 sum = ref Unsafe.Add(ref accumulatorBase, x);
+
+                    for (int kX = 0; kX < kernelColumns; kX++)
+                    {
+                        // Map offsets are absolute columns; rebase them onto the sample buffer.
+                        int sampleX = Unsafe.Add(ref columnOffsets, kX) - left;
+                        sum += kernel[kY, kX] * Unsafe.Add(ref samplesBase, sampleX);
+                    }
+                }
+            }
+
+            // Overwrite the accumulated alpha with the untouched alpha of the row being processed.
+            Span<TPixel> currentRow = this.sourcePixels.GetRowSpan(y).Slice(left, width);
+            operations.ToVector4(this.configuration, currentRow, samples);
+
+            for (int x = 0; x < width; x++)
+            {
+                Unsafe.Add(ref accumulatorBase, x).W = Unsafe.Add(ref samplesBase, x).W;
+            }
+
+            Span<TPixel> destination = this.targetPixels.GetRowSpan(y).Slice(left, width);
+            operations.FromVector4Destructive(this.configuration, accumulator, destination);
+        }
+
+        /// <summary>
+        /// Convolves all four channels of row <paramref name="y"/>, premultiplying each sampled row
+        /// before accumulation and reversing the premultiplication on the result.
+        /// </summary>
+        /// <param name="y">The row to process.</param>
+        /// <param name="span">The working buffer; expected to be twice the bounds width.</param>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        private void Convolve4(int y, Span<Vector4> span)
+        {
+            // The first bounds-width elements hold converted source samples,
+            // everything after that accumulates the weighted sums.
+            int left = this.bounds.X;
+            int width = this.bounds.Width;
+            Span<Vector4> samples = span.Slice(0, width);
+            Span<Vector4> accumulator = span.Slice(width);
+
+            var state = new ConvolutionState(in this.kernelMatrix, this.map);
+            ref int rowOffsets = ref state.GetSampleRow(y - this.bounds.Y);
+
+            // The working buffer is cleared before accumulating each row.
+            accumulator.Clear();
+            ref Vector4 accumulatorBase = ref MemoryMarshal.GetReference(accumulator);
+            ref Vector4 samplesBase = ref MemoryMarshal.GetReference(samples);
+
+            PixelOperations<TPixel> operations = PixelOperations<TPixel>.Instance;
+            ReadOnlyKernel kernel = state.Kernel;
+            int kernelRows = kernel.Rows;
+            int kernelColumns = kernel.Columns;
+
+            for (int kY = 0; kY < kernelRows; kY++)
+            {
+                // Look up the precalculated source row for this kernel row and convert it in bulk.
+                int sampleY = Unsafe.Add(ref rowOffsets, kY);
+                Span<TPixel> sampledRow = this.sourcePixels.GetRowSpan(sampleY).Slice(left, width);
+                operations.ToVector4(this.configuration, sampledRow, samples);
+
+                Numerics.Premultiply(samples);
+
+                for (int x = 0; x < width; x++)
+                {
+                    ref int columnOffsets = ref state.GetSampleColumn(x);
+                    ref Vector4 sum = ref Unsafe.Add(ref accumulatorBase, x);
+
+                    for (int kX = 0; kX < kernelColumns; kX++)
+                    {
+                        // Map offsets are absolute columns; rebase them onto the sample buffer.
+                        int sampleX = Unsafe.Add(ref columnOffsets, kX) - left;
+                        sum += kernel[kY, kX] * Unsafe.Add(ref samplesBase, sampleX);
+                    }
+                }
+            }
+
+            Numerics.UnPremultiply(accumulator);
+
+            Span<TPixel> destination = this.targetPixels.GetRowSpan(y).Slice(left, width);
+            operations.FromVector4Destructive(this.configuration, accumulator, destination);
+        }
+    }
+}
--- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs
@ -0,0 +1,45 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp.Processing.Processors.Convolution
+{
+    /// <summary>
+    /// A stack only struct that bundles a convolution kernel together with the precalculated
+    /// sampling offsets, so the inner loops can avoid repeated reference indirection.
+    /// </summary>
+    internal readonly ref struct ConvolutionState
+    {
+        private readonly Span<int> rowOffsets;
+        private readonly Span<int> columnOffsets;
+        private readonly int rowStride;
+        private readonly int columnStride;
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="ConvolutionState"/> struct.
+        /// </summary>
+        /// <param name="kernel">The convolution kernel.</param>
+        /// <param name="map">The map supplying the row and column sampling offsets.</param>
+        public ConvolutionState(
+            in DenseMatrix<float> kernel,
+            KernelSamplingMap map)
+        {
+            this.Kernel = new ReadOnlyKernel(kernel);
+
+            // Each row/column owns one run of offsets per kernel row/column,
+            // so the kernel dimensions double as strides into the maps.
+            this.rowStride = kernel.Rows;
+            this.columnStride = kernel.Columns;
+
+            this.rowOffsets = map.GetRowOffsetSpan();
+            this.columnOffsets = map.GetColumnOffsetSpan();
+        }
+
+        /// <summary>
+        /// Gets the convolution kernel.
+        /// </summary>
+        public ReadOnlyKernel Kernel
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get;
+        }
+
+        /// <summary>
+        /// Gets a reference to the first row sampling offset stored for the given row.
+        /// No bounds checking is performed.
+        /// </summary>
+        /// <param name="row">The row index; callers in this file pass it relative to the top of the processed bounds.</param>
+        /// <returns>A reference to the element at <c>row * kernel height</c> in the row offset map.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ref int GetSampleRow(int row)
+        {
+            ref int origin = ref MemoryMarshal.GetReference(this.rowOffsets);
+            return ref Unsafe.Add(ref origin, row * this.rowStride);
+        }
+
+        /// <summary>
+        /// Gets a reference to the first column sampling offset stored for the given column.
+        /// No bounds checking is performed.
+        /// </summary>
+        /// <param name="column">The column index; callers in this file pass it relative to the left of the processed bounds.</param>
+        /// <returns>A reference to the element at <c>column * kernel width</c> in the column offset map.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public ref int GetSampleColumn(int column)
+        {
+            ref int origin = ref MemoryMarshal.GetReference(this.columnOffsets);
+            return ref Unsafe.Add(ref origin, column * this.columnStride);
+        }
+    }
+}
--- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
@ -0,0 +1,102 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Buffers;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Memory;
+
+namespace SixLabors.ImageSharp.Processing.Processors.Convolution
+{
+    /// <summary>
+    /// Provides a map of the convolution kernel sampling offsets.
+    /// </summary>
+    /// <remarks>
+    /// The map owns two buffers obtained from the supplied allocator: one run of
+    /// <c>kernel.Rows</c> row offsets per row of the bounds, and one run of
+    /// <c>kernel.Columns</c> column offsets per column of the bounds.
+    /// Both are released by <see cref="Dispose"/>.
+    /// </remarks>
+    internal sealed class KernelSamplingMap : IDisposable
+    {
+        private readonly MemoryAllocator allocator;
+        private bool isDisposed;
+        private IMemoryOwner<int> yOffsets;
+        private IMemoryOwner<int> xOffsets;
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="KernelSamplingMap"/> class.
+        /// </summary>
+        /// <param name="allocator">The memory allocator.</param>
+        public KernelSamplingMap(MemoryAllocator allocator) => this.allocator = allocator;
+
+        /// <summary>
+        /// Builds a map of the sampling offsets for the kernel clamped by the given bounds.
+        /// Any offsets built by an earlier call are released first.
+        /// </summary>
+        /// <param name="kernel">The convolution kernel.</param>
+        /// <param name="bounds">The source bounds.</param>
+        public void BuildSamplingOffsetMap(DenseMatrix<float> kernel, Rectangle bounds)
+        {
+            int kernelHeight = kernel.Rows;
+            int kernelWidth = kernel.Columns;
+
+            // Release buffers left over from a previous build so that calling
+            // this method more than once does not leak them.
+            this.ReleaseOffsets();
+
+            this.yOffsets = this.allocator.Allocate<int>(bounds.Height * kernelHeight);
+            this.xOffsets = this.allocator.Allocate<int>(bounds.Width * kernelWidth);
+
+            // Calculate the y and x sampling offsets clamped to the given rectangle.
+            FillOffsets(this.yOffsets.GetSpan(), bounds.Height, kernelHeight, bounds.Y, bounds.Bottom - 1);
+            FillOffsets(this.xOffsets.GetSpan(), bounds.Width, kernelWidth, bounds.X, bounds.Right - 1);
+        }
+
+        /// <summary>
+        /// Gets the row sampling offsets. <see cref="BuildSamplingOffsetMap"/> must have been called first.
+        /// </summary>
+        /// <returns>The span of row offsets.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public Span<int> GetRowOffsetSpan() => this.yOffsets.GetSpan();
+
+        /// <summary>
+        /// Gets the column sampling offsets. <see cref="BuildSamplingOffsetMap"/> must have been called first.
+        /// </summary>
+        /// <returns>The span of column offsets.</returns>
+        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        public Span<int> GetColumnOffsetSpan() => this.xOffsets.GetSpan();
+
+        /// <inheritdoc/>
+        public void Dispose()
+        {
+            if (this.isDisposed)
+            {
+                return;
+            }
+
+            // The buffers only exist once a map has been built, so disposing
+            // an unused instance must not dereference them.
+            this.ReleaseOffsets();
+            this.isDisposed = true;
+        }
+
+        /// <summary>
+        /// Writes, for each of the <paramref name="length"/> positions, the run of
+        /// <paramref name="kernelSize"/> sample coordinates centred on that position
+        /// and clamps them to the inclusive range [<paramref name="min"/>, <paramref name="max"/>].
+        /// </summary>
+        private static void FillOffsets(Span<int> offsets, int length, int kernelSize, int min, int max)
+        {
+            int radius = kernelSize >> 1;
+
+            // While this isn't a hotpath we still dip into unsafe to avoid the span bounds
+            // checks as they can potentially be looping over large arrays.
+            ref int offsetsBase = ref MemoryMarshal.GetReference(offsets);
+            for (int position = 0; position < length; position++)
+            {
+                int runBase = position * kernelSize;
+                for (int k = 0; k < kernelSize; k++)
+                {
+                    Unsafe.Add(ref offsetsBase, runBase + k) = position + k + min - radius;
+                }
+            }
+
+            // A kernel of size one only ever samples the position itself,
+            // which already lies inside the bounds, so clamping is skipped.
+            if (kernelSize > 1)
+            {
+                Numerics.Clamp(offsets, min, max);
+            }
+        }
+
+        /// <summary>
+        /// Disposes the offset buffers, if any, and forgets them.
+        /// </summary>
+        private void ReleaseOffsets()
+        {
+            this.yOffsets?.Dispose();
+            this.yOffsets = null;
+
+            this.xOffsets?.Dispose();
+            this.xOffsets = null;
+        }
+    }
+}
--- a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs
@ -0,0 +1,63 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp.Processing.Processors.Convolution
+{
+    /// <summary>
+    /// A stack-only, read-only view over the values of a kernel matrix.
+    /// Element access skips span bounds checks; coordinate validation is
+    /// only compiled into DEBUG builds.
+    /// </summary>
+    internal readonly ref struct ReadOnlyKernel
+    {
+        private readonly ReadOnlySpan<float> values;
+
+        /// <summary>
+        /// Initializes a new instance of the <see cref="ReadOnlyKernel"/> struct
+        /// from the span and dimensions of the given matrix.
+        /// </summary>
+        /// <param name="matrix">The matrix supplying the values, row count and column count.</param>
+        public ReadOnlyKernel(DenseMatrix<float> matrix)
+        {
+            this.values = matrix.Span;
+            this.Rows = matrix.Rows;
+            this.Columns = matrix.Columns;
+        }
+
+        /// <summary>
+        /// Gets the number of columns in the kernel.
+        /// </summary>
+        public int Columns
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get;
+        }
+
+        /// <summary>
+        /// Gets the number of rows in the kernel.
+        /// </summary>
+        public int Rows
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get;
+        }
+
+        /// <summary>
+        /// Gets the value stored at offset <c>(row * Columns) + column</c> of the underlying span.
+        /// </summary>
+        /// <param name="row">The zero-based row index.</param>
+        /// <param name="column">The zero-based column index.</param>
+        /// <returns>The kernel value at the given coordinates.</returns>
+        public float this[int row, int column]
+        {
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            get
+            {
+                // Compiled out unless DEBUG is defined.
+                this.CheckCoordinates(row, column);
+
+                int offset = (row * this.Columns) + column;
+                return Unsafe.Add(ref MemoryMarshal.GetReference(this.values), offset);
+            }
+        }
+
+        /// <summary>
+        /// Validates that the coordinates lie inside the kernel. Calls to this
+        /// method are only emitted when the DEBUG symbol is defined.
+        /// </summary>
+        /// <param name="row">The row index to validate.</param>
+        /// <param name="column">The column index to validate.</param>
+        /// <exception cref="ArgumentOutOfRangeException">
+        /// Thrown when <paramref name="row"/> or <paramref name="column"/> is negative
+        /// or not less than the corresponding dimension.
+        /// </exception>
+        [Conditional("DEBUG")]
+        private void CheckCoordinates(int row, int column)
+        {
+            bool rowInRange = row >= 0 && row < this.Rows;
+            if (!rowInRange)
+            {
+                throw new ArgumentOutOfRangeException(nameof(row), row, $"{row} is outwith the matrix bounds.");
+            }
+
+            bool columnInRange = column >= 0 && column < this.Columns;
+            if (!columnInRange)
+            {
+                throw new ArgumentOutOfRangeException(nameof(column), column, $"{column} is outwith the matrix bounds.");
+            }
+        }
+    }
+}
--- a/tests/ImageSharp.Benchmarks/Config.cs
+++ b/tests/ImageSharp.Benchmarks/Config.cs
@ -27,6 +27,14 @@ namespace SixLabors.ImageSharp.Benchmarks

        }

+        /// <summary>
+        /// Benchmark configuration that registers one default job per runtime:
+        /// <c>ClrRuntime.Net472</c>, <c>CoreRuntime.Core21</c> and <c>CoreRuntime.Core31</c>.
+        /// </summary>
+        public class MultiFramework : Config
+        {
+            public MultiFramework()
+            {
+                this.AddJob(
+                    Job.Default.WithRuntime(ClrRuntime.Net472),
+                    Job.Default.WithRuntime(CoreRuntime.Core21),
+                    Job.Default.WithRuntime(CoreRuntime.Core31));
+            }
+        }
+
        public class ShortClr : Config
        {
            public ShortClr() => this.AddJob(
--- a/tests/ImageSharp.Benchmarks/Samplers/GaussianBlur.cs
+++ b/tests/ImageSharp.Benchmarks/Samplers/GaussianBlur.cs
@ -7,7 +7,7 @@ using SixLabors.ImageSharp.Processing;

 namespace SixLabors.ImageSharp.Benchmarks.Samplers
 {
-    [Config(typeof(Config.ShortClr))]
+    [Config(typeof(Config.MultiFramework))]
    public class GaussianBlur
    {
        [Benchmark]