From 01c3fab408d1ab354947889c21066e1559021d32 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 15:40:14 +0100 Subject: [PATCH 01/22] Add BokehBlur benchmark --- .../Samplers/BokehBlur.cs | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs diff --git a/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs new file mode 100644 index 000000000..1c3b1a7b2 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs @@ -0,0 +1,22 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.PixelFormats; +using SixLabors.ImageSharp.Processing; + +namespace SixLabors.ImageSharp.Benchmarks.Samplers +{ + [Config(typeof(Config.MultiFramework))] + public class BokehBlur + { + [Benchmark] + public void Blur() + { + using (var image = new Image(Configuration.Default, 400, 400, Color.White)) + { + image.Mutate(c => c.BokehBlur()); + } + } + } +} From bd6e555312bae10bd211a6bcdabc1ee1bcb5b87a Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 18:02:31 +0100 Subject: [PATCH 02/22] Minor code refactoring to improve flexibility --- .../Convolution2DRowOperation{TPixel}.cs | 8 +++---- .../Convolution/Convolution2DState.cs | 8 +++---- .../ConvolutionProcessor{TPixel}.cs | 2 +- .../ConvolutionRowOperation{TPixel}.cs | 12 +++++----- .../Convolution/ConvolutionState.cs | 23 +++++++++++++++---- .../Convolution/KernelSamplingMap.cs | 11 +++++++-- ...ReadOnlyKernel.cs => ReadOnlyKernel{T}.cs} | 19 +++++++++++---- 7 files changed, 57 insertions(+), 26 deletions(-) rename src/ImageSharp/Processing/Processors/Convolution/{ReadOnlyKernel.cs => ReadOnlyKernel{T}.cs} (73%) diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs 
b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs index 802d1809f..dd3e98609 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs @@ -80,8 +80,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer); ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer); - ReadOnlyKernel kernelY = state.KernelY; - ReadOnlyKernel kernelX = state.KernelX; + ReadOnlyKernel kernelY = state.KernelY; + ReadOnlyKernel kernelX = state.KernelX; Span sourceRow; for (int kY = 0; kY < kernelY.Rows; kY++) { @@ -146,8 +146,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer); ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer); - ReadOnlyKernel kernelY = state.KernelY; - ReadOnlyKernel kernelX = state.KernelX; + ReadOnlyKernel kernelY = state.KernelY; + ReadOnlyKernel kernelX = state.KernelX; for (int kY = 0; kY < kernelY.Rows; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer. diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs index 218093ac4..6f9b11857 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs @@ -23,21 +23,21 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution KernelSamplingMap map) { // We check the kernels are the same size upstream. 
- this.KernelY = new ReadOnlyKernel(kernelY); - this.KernelX = new ReadOnlyKernel(kernelX); + this.KernelY = new ReadOnlyKernel(kernelY); + this.KernelX = new ReadOnlyKernel(kernelX); this.kernelHeight = kernelY.Rows; this.kernelWidth = kernelY.Columns; this.rowOffsetMap = map.GetRowOffsetSpan(); this.columnOffsetMap = map.GetColumnOffsetSpan(); } - public readonly ReadOnlyKernel KernelY + public readonly ReadOnlyKernel KernelY { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; } - public readonly ReadOnlyKernel KernelX + public readonly ReadOnlyKernel KernelX { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs index 924a1125b..b0254bc91 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs @@ -120,7 +120,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 targetRowRef = ref MemoryMarshal.GetReference(span); Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); - var state = new ConvolutionState(in this.kernel, this.map); + var state = new ConvolutionState(in this.kernel, this.map); int row = y - this.bounds.Y; ref int sampleRowBase = ref state.GetSampleRow(row); diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs index 9876b2885..beccfff01 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs @@ -67,14 +67,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution Span sourceBuffer = span.Slice(0, this.bounds.Width); Span 
targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); + var state = new ConvolutionState(in this.kernelMatrix, this.map); ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // Clear the target buffer for each row run. targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ReadOnlyKernel kernel = state.Kernel; + ReadOnlyKernel kernel = state.Kernel; Span sourceRow; for (int kY = 0; kY < kernel.Rows; kY++) { @@ -119,17 +119,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - Span sourceBuffer = span.Slice(0, this.bounds.Width); - Span targetBuffer = span.Slice(this.bounds.Width); + Span sourceBuffer = span.Slice(0, boundsWidth); + Span targetBuffer = span.Slice(boundsWidth); - var state = new ConvolutionState(in this.kernelMatrix, this.map); + var state = new ConvolutionState(in this.kernelMatrix, this.map); ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // Clear the target buffer for each row run. targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ReadOnlyKernel kernel = state.Kernel; + ReadOnlyKernel kernel = state.Kernel; for (int kY = 0; kY < kernel.Rows; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer. diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs index 3f296c67d..0b3dbc2d1 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs @@ -10,7 +10,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// /// A stack only struct used for reducing reference indirection during convolution operations. 
/// - internal readonly ref struct ConvolutionState + /// The type of values for the kernel in use. + internal readonly ref struct ConvolutionState + where T : unmanaged, IEquatable { private readonly Span rowOffsetMap; private readonly Span columnOffsetMap; @@ -18,17 +20,30 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution private readonly int kernelWidth; public ConvolutionState( - in DenseMatrix kernel, + in DenseMatrix kernel, KernelSamplingMap map) { - this.Kernel = new ReadOnlyKernel(kernel); + this.Kernel = new ReadOnlyKernel(kernel); this.kernelHeight = kernel.Rows; this.kernelWidth = kernel.Columns; this.rowOffsetMap = map.GetRowOffsetSpan(); this.columnOffsetMap = map.GetColumnOffsetSpan(); } - public readonly ReadOnlyKernel Kernel + public ConvolutionState( + T[] kernel, + int height, + int width, + KernelSamplingMap map) + { + this.Kernel = new ReadOnlyKernel(kernel, height, width); + this.kernelHeight = height; + this.kernelWidth = width; + this.rowOffsetMap = map.GetRowOffsetSpan(); + this.columnOffsetMap = map.GetColumnOffsetSpan(); + } + + public readonly ReadOnlyKernel Kernel { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; diff --git a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs index e4b7dbea0..f912b9562 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs @@ -31,9 +31,16 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// The convolution kernel. /// The source bounds. public void BuildSamplingOffsetMap(DenseMatrix kernel, Rectangle bounds) + => this.BuildSamplingOffsetMap(kernel.Rows, kernel.Columns, bounds); + + /// + /// Builds a map of the sampling offsets for the kernel clamped by the given bounds. + /// + /// The height (number of rows) of the convolution kernel to use. 
+ /// The width (number of columns) of the convolution kernel to use. + /// The source bounds. + public void BuildSamplingOffsetMap(int kernelHeight, int kernelWidth, Rectangle bounds) { - int kernelHeight = kernel.Rows; - int kernelWidth = kernel.Columns; this.yOffsets = this.allocator.Allocate(bounds.Height * kernelHeight); this.xOffsets = this.allocator.Allocate(bounds.Width * kernelWidth); diff --git a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs similarity index 73% rename from src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs rename to src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs index 37e006005..f95c3dc0a 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs @@ -12,17 +12,26 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// A stack only, readonly, kernel matrix that can be indexed without /// bounds checks when compiled in release mode. /// - internal readonly ref struct ReadOnlyKernel + /// The type of items in the kernel. 
+ internal readonly ref struct ReadOnlyKernel + where T : unmanaged, IEquatable { - private readonly ReadOnlySpan values; + private readonly ReadOnlySpan values; - public ReadOnlyKernel(DenseMatrix matrix) + public ReadOnlyKernel(DenseMatrix matrix) { this.Columns = matrix.Columns; this.Rows = matrix.Rows; this.values = matrix.Span; } + public ReadOnlyKernel(T[] kernel, int height, int width) + { + this.Columns = width; + this.Rows = height; + this.values = kernel; + } + public int Columns { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -35,13 +44,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution get; } - public float this[int row, int column] + public T this[int row, int column] { [MethodImpl(MethodImplOptions.AggressiveInlining)] get { this.CheckCoordinates(row, column); - ref float vBase = ref MemoryMarshal.GetReference(this.values); + ref T vBase = ref MemoryMarshal.GetReference(this.values); return Unsafe.Add(ref vBase, (row * this.Columns) + column); } } From b3f4befe5ecd75aad288bb1e315e5597ba5341b9 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 20:02:10 +0100 Subject: [PATCH 03/22] Switched bokeh blur to optimized pipeline --- .../Convolution/BokehBlurProcessor.cs | 49 ++++++-- .../Convolution/BokehBlurProcessor{TPixel}.cs | 115 ++++++++++++++---- .../Convolution2PassProcessor{TPixel}.cs | 3 - 3 files changed, 132 insertions(+), 35 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs index 352960f41..e8f7351fa 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -4,6 +4,7 @@ using System; using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.Memory; using 
SixLabors.ImageSharp.PixelFormats; @@ -91,31 +92,30 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only /// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type. /// - internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation + internal readonly struct SecondPassConvolutionRowOperation : IRowOperation { private readonly Rectangle bounds; private readonly Buffer2D targetValues; private readonly Buffer2D sourceValues; + private readonly KernelSamplingMap map; private readonly Complex64[] kernel; private readonly float z; private readonly float w; - private readonly int maxY; - private readonly int maxX; [MethodImpl(InliningOptions.ShortMethod)] - public ApplyHorizontalConvolutionRowOperation( + public SecondPassConvolutionRowOperation( Rectangle bounds, Buffer2D targetValues, Buffer2D sourceValues, + KernelSamplingMap map, Complex64[] kernel, float z, float w) { this.bounds = bounds; - this.maxY = this.bounds.Bottom - 1; - this.maxX = this.bounds.Right - 1; this.targetValues = targetValues; this.sourceValues = sourceValues; + this.map = map; this.kernel = kernel; this.z = z; this.w = w; @@ -125,11 +125,42 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution [MethodImpl(InliningOptions.ShortMethod)] public void Invoke(int y) { - Span targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X); + int boundsX = this.bounds.X; + int boundsWidth = this.bounds.Width; + Span targetBuffer = this.targetValues.GetRowSpan(y); - for (int x = 0; x < this.bounds.Width; x++) + var state = new ConvolutionState(this.kernel, this.kernel.Length, 1, this.map); + ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + + // The target buffer is zeroed initially and then it accumulates the 
results + // of each partial convolution, so we don't have to clear it here as well. + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + + ReadOnlyKernel kernel = state.Kernel; + + for (int kY = 0; kY < kernel.Rows; kY++) { - Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w); + // Get the precalculated source sample row for this kernel row and copy to our buffer. + int sampleY = Unsafe.Add(ref sampleRowBase, kY); + Span sourceRow = this.sourceValues.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + ref ComplexVector4 sourceBase = ref MemoryMarshal.GetReference(sourceRow); + + for (int x = 0; x < boundsWidth; x++) + { + ref int sampleColumnBase = ref state.GetSampleColumn(x); + ref Vector4 target = ref Unsafe.Add(ref targetBase, x); + ComplexVector4 pixel4 = default; + + for (int kX = 0; kX < kernel.Columns; kX++) + { + int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; + ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX); + + pixel4.Sum(kernel[kY, kX] * sample); + } + + target += pixel4.WeightedSum(this.z, this.w); + } } } } diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index dfe54bf2e..aa6160799 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -26,6 +26,11 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// private readonly float gamma; + /// + /// The size of each complex convolution kernel. 
+ /// + private readonly int kernelSize; + /// /// The kernel parameters to use for the current instance (a: X, b: Y, A: Z, B: W) /// @@ -47,11 +52,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution : base(configuration, source, sourceRectangle) { this.gamma = definition.Gamma; + this.kernelSize = (definition.Radius * 2) + 1; // Get the bokeh blur data BokehBlurKernelData data = BokehBlurKernelDataProvider.GetBokehBlurKernelData( definition.Radius, - (definition.Radius * 2) + 1, + this.kernelSize, definition.Components); this.kernelParameters = data.Parameters; @@ -108,69 +114,132 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution Buffer2D processingBuffer) { // Allocate the buffer with the intermediate convolution results - using Buffer2D firstPassBuffer = this.Configuration.MemoryAllocator.Allocate2D(source.Size()); + using Buffer2D firstPassBuffer = configuration.MemoryAllocator.Allocate2D(source.Size()); + + var interest = Rectangle.Intersect(sourceRectangle, source.Bounds()); + + // Unlike in the standard 2 pass convolution processor, we use a rectangle of 1x the interest width + // to speedup the actual convolution, by applying bulk pixel conversion and clamping calculation. + // The second half of the buffer will just target the temporary buffer of complex pixel values. + // This is needed because the bokeh blur operates as TPixel -> complex -> TPixel, so we cannot + // convert back to standard pixels after each separate 1D convolution pass. Like in the gaussian + // blur though, we preallocate and compute the kernel sampling maps before processing each complex + // component, to avoid recomputing the same sampling map once per convolution pass. 
+ using var mapX = new KernelSamplingMap(configuration.MemoryAllocator); + using var mapY = new KernelSamplingMap(configuration.MemoryAllocator); + + mapX.BuildSamplingOffsetMap(1, this.kernelSize, interest); + mapY.BuildSamplingOffsetMap(this.kernelSize, 1, interest); - // Perform two 1D convolutions for each component in the current instance ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan()); ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan()); + + // Perform two 1D convolutions for each component in the current instance for (int i = 0; i < this.kernels.Length; i++) { // Compute the resulting complex buffer for the current component Complex64[] kernel = Unsafe.Add(ref baseRef, i); Vector4 parameters = Unsafe.Add(ref paramsRef, i); - // Compute the vertical 1D convolution - var verticalOperation = new ApplyVerticalConvolutionRowOperation(sourceRectangle, firstPassBuffer, source.PixelBuffer, kernel); - ParallelRowIterator.IterateRows( + // Horizontal convolution + var horizontalOperation = new FirstPassConvolutionRowOperation( + interest, + firstPassBuffer, + source.PixelBuffer, + mapX, + kernel, + configuration); + + ParallelRowIterator.IterateRows( configuration, - sourceRectangle, - in verticalOperation); + interest, + in horizontalOperation); + + // Vertical 1D convolutions to accumulate the partial results on the target buffer + var verticalOperation = new BokehBlurProcessor.SecondPassConvolutionRowOperation( + interest, + processingBuffer, + firstPassBuffer, + mapY, + kernel, + parameters.Z, + parameters.W); - // Compute the horizontal 1D convolutions and accumulate the partial results on the target buffer - var horizontalOperation = new BokehBlurProcessor.ApplyHorizontalConvolutionRowOperation(sourceRectangle, processingBuffer, firstPassBuffer, kernel, parameters.Z, parameters.W); ParallelRowIterator.IterateRows( configuration, - sourceRectangle, - in horizontalOperation); + interest, + in 
verticalOperation); } } /// /// A implementing the vertical convolution logic for . /// - private readonly struct ApplyVerticalConvolutionRowOperation : IRowOperation + private readonly struct FirstPassConvolutionRowOperation : IRowOperation { private readonly Rectangle bounds; private readonly Buffer2D targetValues; private readonly Buffer2D sourcePixels; + private readonly KernelSamplingMap map; private readonly Complex64[] kernel; - private readonly int maxY; - private readonly int maxX; + private readonly Configuration configuration; [MethodImpl(InliningOptions.ShortMethod)] - public ApplyVerticalConvolutionRowOperation( + public FirstPassConvolutionRowOperation( Rectangle bounds, Buffer2D targetValues, Buffer2D sourcePixels, - Complex64[] kernel) + KernelSamplingMap map, + Complex64[] kernel, + Configuration configuration) { this.bounds = bounds; - this.maxY = this.bounds.Bottom - 1; - this.maxX = this.bounds.Right - 1; this.targetValues = targetValues; this.sourcePixels = sourcePixels; + this.map = map; this.kernel = kernel; + this.configuration = configuration; } /// [MethodImpl(InliningOptions.ShortMethod)] - public void Invoke(int y) + public void Invoke(int y, Span span) { - Span targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X); + int boundsX = this.bounds.X; + int boundsWidth = this.bounds.Width; - for (int x = 0; x < this.bounds.Width; x++) + var state = new ConvolutionState(this.kernel, 1, this.kernel.Length, this.map); + ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + + Span targetBuffer = this.targetValues.GetRowSpan(y); + + // Clear the target buffer + targetBuffer.Clear(); + ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + + ReadOnlyKernel kernel = state.Kernel; + + for (int kY = 0; kY < kernel.Rows; kY++) { - Buffer2DUtils.Convolve4(this.kernel, this.sourcePixels, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX); + // Get the precalculated source sample 
row for this kernel row and copy to our buffer. + int sampleY = Unsafe.Add(ref sampleRowBase, kY); + Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, span); + + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span); + + for (int x = 0; x < span.Length; x++) + { + ref int sampleColumnBase = ref state.GetSampleColumn(x); + ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x); + + for (int kX = 0; kX < kernel.Columns; kX++) + { + int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; + Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + target.Sum(kernel[kY, kX] * sample); + } + } } } } diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs index 151b0ffcc..16ce0fdd7 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs @@ -1,10 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
-using System; using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; From ca1a67a36cacd3c95c8a1fd91fdc994d57460b08 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 22:10:36 +0100 Subject: [PATCH 04/22] Specialize bokeh blur operations for 1D kernels --- .../Convolution/BokehBlurProcessor.cs | 34 +++++++------- .../Convolution/BokehBlurProcessor{TPixel}.cs | 46 ++++++++++--------- 2 files changed, 40 insertions(+), 40 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs index e8f7351fa..edaac45b6 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -127,39 +127,37 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution { int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - Span targetBuffer = this.targetValues.GetRowSpan(y); + int kernelSize = this.kernel.Length; - var state = new ConvolutionState(this.kernel, this.kernel.Length, 1, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + Span rowOffsets = this.map.GetRowOffsetSpan(); + Span columnOffsets = this.map.GetColumnOffsetSpan(); + ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize); + ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets); // The target buffer is zeroed initially and then it accumulates the results - // of each partial convolution, so we don't have to clear it here as well. 
- ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + // of each partial convolution, so we don't have to clear it here as well + Span targetBuffer = this.targetValues.GetRowSpan(y); - ReadOnlyKernel kernel = state.Kernel; + ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + ref Complex64 kernelBase = ref this.kernel[0]; - for (int kY = 0; kY < kernel.Rows; kY++) + for (int kY = 0; kY < kernelSize; kY++) { - // Get the precalculated source sample row for this kernel row and copy to our buffer. + // Get the precalculated source sample row for this kernel row and copy to our buffer int sampleY = Unsafe.Add(ref sampleRowBase, kY); Span sourceRow = this.sourceValues.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); ref ComplexVector4 sourceBase = ref MemoryMarshal.GetReference(sourceRow); + Complex64 factor = Unsafe.Add(ref kernelBase, kY); for (int x = 0; x < boundsWidth; x++) { - ref int sampleColumnBase = ref state.GetSampleColumn(x); + int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX; ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - ComplexVector4 pixel4 = default; - - for (int kX = 0; kX < kernel.Columns; kX++) - { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX); - pixel4.Sum(kernel[kY, kX] * sample); - } + ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX); + ComplexVector4 partial = factor * sample; - target += pixel4.WeightedSum(this.z, this.w); + target += partial.WeightedSum(this.z, this.w); } } } diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index aa6160799..cdadd4dee 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -207,39 +207,41 @@ namespace 
SixLabors.ImageSharp.Processing.Processors.Convolution { int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; + int kernelSize = this.kernel.Length; - var state = new ConvolutionState(this.kernel, 1, this.kernel.Length, this.map); - ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); + Span rowOffsets = this.map.GetRowOffsetSpan(); + Span columnOffsets = this.map.GetColumnOffsetSpan(); + int sampleY = Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), y - this.bounds.Y); + ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets); + // Clear the target buffer for each row run Span targetBuffer = this.targetValues.GetRowSpan(y); - - // Clear the target buffer targetBuffer.Clear(); ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ReadOnlyKernel kernel = state.Kernel; + // Execute the bulk pixel format conversion for the current row + Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + PixelOperations.Instance.ToVector4(this.configuration, sourceRow, span); - for (int kY = 0; kY < kernel.Rows; kY++) - { - // Get the precalculated source sample row for this kernel row and copy to our buffer. 
- int sampleY = Unsafe.Add(ref sampleRowBase, kY); - Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); - PixelOperations.Instance.ToVector4(this.configuration, sourceRow, span); + ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span); + ref Complex64 kernelBase = ref this.kernel[0]; - ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span); + for (int x = 0; x < span.Length; x++) + { + ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x); - for (int x = 0; x < span.Length; x++) + for (int kX = 0; kX < kernelSize; kX++) { - ref int sampleColumnBase = ref state.GetSampleColumn(x); - ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x); - - for (int kX = 0; kX < kernel.Columns; kX++) - { - int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; - Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); - target.Sum(kernel[kY, kX] * sample); - } + int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX; + Vector4 sample = Unsafe.Add(ref sourceBase, sampleX); + Complex64 factor = Unsafe.Add(ref kernelBase, kX); + + target.Sum(factor * sample); } + + // Shift the base column sampling reference by one row at the end of each outer + // iteration so that the inner tight loop indexing can skip the multiplication + sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize); } } } From 22f151286928f582dc522dcb3e3609cb737428d6 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 22:15:00 +0100 Subject: [PATCH 05/22] Minor code tweaks --- .../Processors/Convolution/BokehBlurProcessor.cs | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs index edaac45b6..243bc46cb 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ 
b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -136,24 +136,20 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // The target buffer is zeroed initially and then it accumulates the results // of each partial convolution, so we don't have to clear it here as well - Span targetBuffer = this.targetValues.GetRowSpan(y); - - ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); + ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(0, y); ref Complex64 kernelBase = ref this.kernel[0]; for (int kY = 0; kY < kernelSize; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer int sampleY = Unsafe.Add(ref sampleRowBase, kY); - Span sourceRow = this.sourceValues.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); - ref ComplexVector4 sourceBase = ref MemoryMarshal.GetReference(sourceRow); + ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(boundsX, sampleY); Complex64 factor = Unsafe.Add(ref kernelBase, kY); for (int x = 0; x < boundsWidth; x++) { int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX; ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX); ComplexVector4 partial = factor * sample; From 68eeca928295ef1ace6a2e034e11073c54ab278b Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 22:19:24 +0100 Subject: [PATCH 06/22] Restore temporary changes --- .../Convolution2DRowOperation{TPixel}.cs | 8 +++---- .../Convolution/Convolution2DState.cs | 8 +++---- .../ConvolutionProcessor{TPixel}.cs | 2 +- .../ConvolutionRowOperation{TPixel}.cs | 12 +++++----- .../Convolution/ConvolutionState.cs | 23 ++++--------------- ...ReadOnlyKernel{T}.cs => ReadOnlyKernel.cs} | 19 ++++----------- 6 files changed, 24 insertions(+), 48 deletions(-) rename src/ImageSharp/Processing/Processors/Convolution/{ReadOnlyKernel{T}.cs => ReadOnlyKernel.cs} (73%) diff --git 
a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs index dd3e98609..802d1809f 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DRowOperation{TPixel}.cs @@ -80,8 +80,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer); ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer); - ReadOnlyKernel kernelY = state.KernelY; - ReadOnlyKernel kernelX = state.KernelX; + ReadOnlyKernel kernelY = state.KernelY; + ReadOnlyKernel kernelX = state.KernelX; Span sourceRow; for (int kY = 0; kY < kernelY.Rows; kY++) { @@ -146,8 +146,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 targetBaseY = ref MemoryMarshal.GetReference(targetYBuffer); ref Vector4 targetBaseX = ref MemoryMarshal.GetReference(targetXBuffer); - ReadOnlyKernel kernelY = state.KernelY; - ReadOnlyKernel kernelX = state.KernelX; + ReadOnlyKernel kernelY = state.KernelY; + ReadOnlyKernel kernelX = state.KernelX; for (int kY = 0; kY < kernelY.Rows; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer. diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs index 6f9b11857..218093ac4 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2DState.cs @@ -23,21 +23,21 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution KernelSamplingMap map) { // We check the kernels are the same size upstream. 
- this.KernelY = new ReadOnlyKernel(kernelY); - this.KernelX = new ReadOnlyKernel(kernelX); + this.KernelY = new ReadOnlyKernel(kernelY); + this.KernelX = new ReadOnlyKernel(kernelX); this.kernelHeight = kernelY.Rows; this.kernelWidth = kernelY.Columns; this.rowOffsetMap = map.GetRowOffsetSpan(); this.columnOffsetMap = map.GetColumnOffsetSpan(); } - public readonly ReadOnlyKernel KernelY + public readonly ReadOnlyKernel KernelY { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; } - public readonly ReadOnlyKernel KernelX + public readonly ReadOnlyKernel KernelX { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs index b0254bc91..924a1125b 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionProcessor{TPixel}.cs @@ -120,7 +120,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 targetRowRef = ref MemoryMarshal.GetReference(span); Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(boundsX, boundsWidth); - var state = new ConvolutionState(in this.kernel, this.map); + var state = new ConvolutionState(in this.kernel, this.map); int row = y - this.bounds.Y; ref int sampleRowBase = ref state.GetSampleRow(row); diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs index beccfff01..9876b2885 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionRowOperation{TPixel}.cs @@ -67,14 +67,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution Span sourceBuffer = span.Slice(0, this.bounds.Width); Span 
targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); + var state = new ConvolutionState(in this.kernelMatrix, this.map); ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // Clear the target buffer for each row run. targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ReadOnlyKernel kernel = state.Kernel; + ReadOnlyKernel kernel = state.Kernel; Span sourceRow; for (int kY = 0; kY < kernel.Rows; kY++) { @@ -119,17 +119,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // Span is 2x bounds. int boundsX = this.bounds.X; int boundsWidth = this.bounds.Width; - Span sourceBuffer = span.Slice(0, boundsWidth); - Span targetBuffer = span.Slice(boundsWidth); + Span sourceBuffer = span.Slice(0, this.bounds.Width); + Span targetBuffer = span.Slice(this.bounds.Width); - var state = new ConvolutionState(in this.kernelMatrix, this.map); + var state = new ConvolutionState(in this.kernelMatrix, this.map); ref int sampleRowBase = ref state.GetSampleRow(y - this.bounds.Y); // Clear the target buffer for each row run. targetBuffer.Clear(); ref Vector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); - ReadOnlyKernel kernel = state.Kernel; + ReadOnlyKernel kernel = state.Kernel; for (int kY = 0; kY < kernel.Rows; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer. diff --git a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs index 0b3dbc2d1..3f296c67d 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ConvolutionState.cs @@ -10,9 +10,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// /// A stack only struct used for reducing reference indirection during convolution operations. 
/// - /// The type of values for the kernel in use. - internal readonly ref struct ConvolutionState - where T : unmanaged, IEquatable + internal readonly ref struct ConvolutionState { private readonly Span rowOffsetMap; private readonly Span columnOffsetMap; @@ -20,30 +18,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution private readonly int kernelWidth; public ConvolutionState( - in DenseMatrix kernel, + in DenseMatrix kernel, KernelSamplingMap map) { - this.Kernel = new ReadOnlyKernel(kernel); + this.Kernel = new ReadOnlyKernel(kernel); this.kernelHeight = kernel.Rows; this.kernelWidth = kernel.Columns; this.rowOffsetMap = map.GetRowOffsetSpan(); this.columnOffsetMap = map.GetColumnOffsetSpan(); } - public ConvolutionState( - T[] kernel, - int height, - int width, - KernelSamplingMap map) - { - this.Kernel = new ReadOnlyKernel(kernel, height, width); - this.kernelHeight = height; - this.kernelWidth = width; - this.rowOffsetMap = map.GetRowOffsetSpan(); - this.columnOffsetMap = map.GetColumnOffsetSpan(); - } - - public readonly ReadOnlyKernel Kernel + public readonly ReadOnlyKernel Kernel { [MethodImpl(MethodImplOptions.AggressiveInlining)] get; diff --git a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs similarity index 73% rename from src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs rename to src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs index f95c3dc0a..37e006005 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel{T}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/ReadOnlyKernel.cs @@ -12,26 +12,17 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// A stack only, readonly, kernel matrix that can be indexed without /// bounds checks when compiled in release mode. /// - /// The type of items in the kernel. 
- internal readonly ref struct ReadOnlyKernel - where T : unmanaged, IEquatable + internal readonly ref struct ReadOnlyKernel { - private readonly ReadOnlySpan values; + private readonly ReadOnlySpan values; - public ReadOnlyKernel(DenseMatrix matrix) + public ReadOnlyKernel(DenseMatrix matrix) { this.Columns = matrix.Columns; this.Rows = matrix.Rows; this.values = matrix.Span; } - public ReadOnlyKernel(T[] kernel, int height, int width) - { - this.Columns = width; - this.Rows = height; - this.values = kernel; - } - public int Columns { [MethodImpl(MethodImplOptions.AggressiveInlining)] @@ -44,13 +35,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution get; } - public T this[int row, int column] + public float this[int row, int column] { [MethodImpl(MethodImplOptions.AggressiveInlining)] get { this.CheckCoordinates(row, column); - ref T vBase = ref MemoryMarshal.GetReference(this.values); + ref float vBase = ref MemoryMarshal.GetReference(this.values); return Unsafe.Add(ref vBase, (row * this.Columns) + column); } } From f8f3eaa321faede2d9e66d2c4042636eef735963 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 22:19:57 +0100 Subject: [PATCH 07/22] Remove unnecessary code --- .../Common/Helpers/Buffer2DUtils.cs | 109 ------------------ 1 file changed, 109 deletions(-) delete mode 100644 src/ImageSharp/Common/Helpers/Buffer2DUtils.cs diff --git a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs b/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs deleted file mode 100644 index 02a5afff7..000000000 --- a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Apache License, Version 2.0. - -using System; -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - -using SixLabors.ImageSharp.Memory; -using SixLabors.ImageSharp.PixelFormats; - -namespace SixLabors.ImageSharp -{ - /// - /// Extension methods for . 
- /// TODO: One day rewrite all this to use SIMD intrinsics. There's a lot of scope for improvement. - /// - internal static class Buffer2DUtils - { - /// - /// Computes the sum of vectors in weighted by the kernel weight values. - /// - /// The pixel format. - /// The 1D convolution kernel. - /// The source frame. - /// The target row. - /// The current row. - /// The current column. - /// The minimum working area row. - /// The maximum working area row. - /// The minimum working area column. - /// The maximum working area column. - public static void Convolve4( - Span kernel, - Buffer2D sourcePixels, - Span targetRow, - int row, - int column, - int minRow, - int maxRow, - int minColumn, - int maxColumn) - where TPixel : unmanaged, IPixel - { - ComplexVector4 vector = default; - int kernelLength = kernel.Length; - int radiusY = kernelLength >> 1; - int sourceOffsetColumnBase = column + minColumn; - ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel); - - for (int i = 0; i < kernelLength; i++) - { - int offsetY = Numerics.Clamp(row + i - radiusY, minRow, maxRow); - int offsetX = Numerics.Clamp(sourceOffsetColumnBase, minColumn, maxColumn); - Span sourceRowSpan = sourcePixels.GetRowSpan(offsetY); - var currentColor = sourceRowSpan[offsetX].ToVector4(); - - vector.Sum(Unsafe.Add(ref baseRef, i) * currentColor); - } - - targetRow[column] = vector; - } - - /// - /// Computes the sum of vectors in weighted by the kernel weight values and accumulates the partial results. - /// - /// The 1D convolution kernel. - /// The source frame. - /// The target row. - /// The current row. - /// The current column. - /// The minimum working area row. - /// The maximum working area row. - /// The minimum working area column. - /// The maximum working area column. - /// The weight factor for the real component of the complex pixel values. - /// The weight factor for the imaginary component of the complex pixel values. 
- public static void Convolve4AndAccumulatePartials( - Span kernel, - Buffer2D sourceValues, - Span targetRow, - int row, - int column, - int minRow, - int maxRow, - int minColumn, - int maxColumn, - float z, - float w) - { - ComplexVector4 vector = default; - int kernelLength = kernel.Length; - int radiusX = kernelLength >> 1; - int sourceOffsetColumnBase = column + minColumn; - - int offsetY = Numerics.Clamp(row, minRow, maxRow); - ref ComplexVector4 sourceRef = ref MemoryMarshal.GetReference(sourceValues.GetRowSpan(offsetY)); - ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel); - - for (int x = 0; x < kernelLength; x++) - { - int offsetX = Numerics.Clamp(sourceOffsetColumnBase + x - radiusX, minColumn, maxColumn); - vector.Sum(Unsafe.Add(ref baseRef, x) * Unsafe.Add(ref sourceRef, offsetX)); - } - - targetRow[column] += vector.WeightedSum(z, w); - } - } -} From 16f4842f64bdb77c251eed0a3a4636a9b7ed604a Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sat, 12 Dec 2020 23:07:11 +0100 Subject: [PATCH 08/22] Fix gamma processing out of image bounds --- .../Convolution/BokehBlurProcessor{TPixel}.cs | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index cdadd4dee..4b1d7f8f1 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -77,26 +77,28 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution /// protected override void OnFrameApply(ImageFrame source) { + var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds()); + // Preliminary gamma highlight pass - var gammaOperation = new ApplyGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); + var 
gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); ParallelRowIterator.IterateRows( this.Configuration, - this.SourceRectangle, + sourceRectangle, in gammaOperation); // Create a 0-filled buffer to use to store the result of the component convolutions using Buffer2D processingBuffer = this.Configuration.MemoryAllocator.Allocate2D(source.Size(), AllocationOptions.Clean); // Perform the 1D convolutions on all the kernel components and accumulate the results - this.OnFrameApplyCore(source, this.SourceRectangle, this.Configuration, processingBuffer); + this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer); float inverseGamma = 1 / this.gamma; // Apply the inverse gamma exposure pass, and write the final pixel data - var operation = new ApplyInverseGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma); + var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma); ParallelRowIterator.IterateRows( this.Configuration, - this.SourceRectangle, + sourceRectangle, in operation); } @@ -116,8 +118,6 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // Allocate the buffer with the intermediate convolution results using Buffer2D firstPassBuffer = configuration.MemoryAllocator.Allocate2D(source.Size()); - var interest = Rectangle.Intersect(sourceRectangle, source.Bounds()); - // Unlike in the standard 2 pass convolution processor, we use a rectangle of 1x the interest width // to speedup the actual convolution, by applying bulk pixel conversion and clamping calculation. // The second half of the buffer will just target the temporary buffer of complex pixel values. 
@@ -128,8 +128,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution using var mapX = new KernelSamplingMap(configuration.MemoryAllocator); using var mapY = new KernelSamplingMap(configuration.MemoryAllocator); - mapX.BuildSamplingOffsetMap(1, this.kernelSize, interest); - mapY.BuildSamplingOffsetMap(this.kernelSize, 1, interest); + mapX.BuildSamplingOffsetMap(1, this.kernelSize, sourceRectangle); + mapY.BuildSamplingOffsetMap(this.kernelSize, 1, sourceRectangle); ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan()); ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan()); @@ -143,7 +143,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // Horizontal convolution var horizontalOperation = new FirstPassConvolutionRowOperation( - interest, + sourceRectangle, firstPassBuffer, source.PixelBuffer, mapX, @@ -152,12 +152,12 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ParallelRowIterator.IterateRows( configuration, - interest, + sourceRectangle, in horizontalOperation); // Vertical 1D convolutions to accumulate the partial results on the target buffer var verticalOperation = new BokehBlurProcessor.SecondPassConvolutionRowOperation( - interest, + sourceRectangle, processingBuffer, firstPassBuffer, mapY, @@ -167,7 +167,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ParallelRowIterator.IterateRows( configuration, - interest, + sourceRectangle, in verticalOperation); } } From 6187fb55e0aeb90c18d21f5f685baacac0172364 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sun, 13 Dec 2020 00:18:09 +0100 Subject: [PATCH 09/22] Fix blur processing when constrained to region --- .../Processing/Processors/Convolution/BokehBlurProcessor.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs 
index 243bc46cb..b3844ded8 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -136,14 +136,14 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // The target buffer is zeroed initially and then it accumulates the results // of each partial convolution, so we don't have to clear it here as well - ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(0, y); + ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y); ref Complex64 kernelBase = ref this.kernel[0]; for (int kY = 0; kY < kernelSize; kY++) { // Get the precalculated source sample row for this kernel row and copy to our buffer int sampleY = Unsafe.Add(ref sampleRowBase, kY); - ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(boundsX, sampleY); + ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY); Complex64 factor = Unsafe.Add(ref kernelBase, kY); for (int x = 0; x < boundsWidth; x++) From 0a6f7baa719fa69ca4c56d0fcefc405b1a2051ff Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sun, 13 Dec 2020 00:58:26 +0100 Subject: [PATCH 10/22] Fix NullReferenceException in KernelSamplingMap.Dispose --- .../Processing/Processors/Convolution/KernelSamplingMap.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs index f912b9562..904b599f7 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs @@ -99,8 +99,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution { if (!this.isDisposed) { - this.yOffsets.Dispose(); - this.xOffsets.Dispose(); + this.yOffsets?.Dispose(); + this.xOffsets?.Dispose(); this.isDisposed = true; } From 
f62e2f9748e149394ccedd8595f212be80ca87f7 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sun, 13 Dec 2020 01:08:01 +0100 Subject: [PATCH 11/22] Remove allocation constrained test for bokeh blur --- .../Processing/Processors/Convolution/BokehBlurTest.cs | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs index 6c48cf843..666fbdd93 100644 --- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs @@ -173,13 +173,5 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution testOutputDetails: value.ToString(), appendPixelTypeToFileName: false); } - - [Theory] - [WithTestPatternImages(100, 300, PixelTypes.Bgr24)] - public void WorksWithDiscoBuffers(TestImageProvider provider) - where TPixel : unmanaged, IPixel - { - provider.RunBufferCapacityLimitProcessorTest(41, c => c.BokehBlur()); - } } } From 3356225bb163ea28ce6f013ada157b4beba323c1 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Sun, 13 Dec 2020 23:08:55 +0100 Subject: [PATCH 12/22] Remove unnecessary offset indirections --- .../Convolution/BokehBlurProcessor.cs | 5 +--- .../Convolution/BokehBlurProcessor{TPixel}.cs | 23 ++++++++----------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs index b3844ded8..d4fb27a57 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs @@ -130,9 +130,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution int kernelSize = this.kernel.Length; Span rowOffsets = this.map.GetRowOffsetSpan(); - Span columnOffsets = 
this.map.GetColumnOffsetSpan(); ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize); - ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets); // The target buffer is zeroed initially and then it accumulates the results // of each partial convolution, so we don't have to clear it here as well @@ -148,9 +146,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution for (int x = 0; x < boundsWidth; x++) { - int sampleX = Unsafe.Add(ref sampleColumnBase, x) - boundsX; ref Vector4 target = ref Unsafe.Add(ref targetBase, x); - ComplexVector4 sample = Unsafe.Add(ref sourceBase, sampleX); + ComplexVector4 sample = Unsafe.Add(ref sourceBase, x); ComplexVector4 partial = factor * sample; target += partial.WeightedSum(this.z, this.w); diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index 4b1d7f8f1..dda384390 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -124,12 +124,13 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // This is needed because the bokeh blur operates as TPixel -> complex -> TPixel, so we cannot // convert back to standard pixels after each separate 1D convolution pass. Like in the gaussian // blur though, we preallocate and compute the kernel sampling maps before processing each complex - // component, to avoid recomputing the same sampling map once per convolution pass. - using var mapX = new KernelSamplingMap(configuration.MemoryAllocator); - using var mapY = new KernelSamplingMap(configuration.MemoryAllocator); + // component, to avoid recomputing the same sampling map once per convolution pass. 
Since we are + // doing two 1D convolutions with the same kernel, we can use a single kernel sampling map as if + // we were using a 2D kernel with each dimension being the same as the length of our kernel, and + // use the two sampling offset spans resulting from this same map. This saves some extra work. + using var mapXY = new KernelSamplingMap(configuration.MemoryAllocator); - mapX.BuildSamplingOffsetMap(1, this.kernelSize, sourceRectangle); - mapY.BuildSamplingOffsetMap(this.kernelSize, 1, sourceRectangle); + mapXY.BuildSamplingOffsetMap(this.kernelSize, this.kernelSize, sourceRectangle); ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan()); ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan()); @@ -146,7 +147,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution sourceRectangle, firstPassBuffer, source.PixelBuffer, - mapX, + mapXY, kernel, configuration); @@ -160,7 +161,7 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution sourceRectangle, processingBuffer, firstPassBuffer, - mapY, + mapXY, kernel, parameters.Z, parameters.W); @@ -209,22 +210,18 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution int boundsWidth = this.bounds.Width; int kernelSize = this.kernel.Length; - Span rowOffsets = this.map.GetRowOffsetSpan(); - Span columnOffsets = this.map.GetColumnOffsetSpan(); - int sampleY = Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), y - this.bounds.Y); - ref int sampleColumnBase = ref MemoryMarshal.GetReference(columnOffsets); - // Clear the target buffer for each row run Span targetBuffer = this.targetValues.GetRowSpan(y); targetBuffer.Clear(); ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer); // Execute the bulk pixel format conversion for the current row - Span sourceRow = this.sourcePixels.GetRowSpan(sampleY).Slice(boundsX, boundsWidth); + Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, 
boundsWidth); PixelOperations.Instance.ToVector4(this.configuration, sourceRow, span); ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span); ref Complex64 kernelBase = ref this.kernel[0]; + ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan()); for (int x = 0; x < span.Length; x++) { From 8292407ae2258f413e13754d443e69ba21a92b8b Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 01:51:24 +0100 Subject: [PATCH 13/22] Add optimized paths for default gamma exposure --- .../Convolution/BokehBlurProcessor{TPixel}.cs | 171 ++++++++++++++++-- 1 file changed, 159 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index dda384390..c01fc3ba1 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -80,11 +80,22 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds()); // Preliminary gamma highlight pass - var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); - ParallelRowIterator.IterateRows( - this.Configuration, - sourceRectangle, - in gammaOperation); + if (this.gamma == 3F) + { + var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in gammaOperation); + } + else + { + var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in gammaOperation); + } // Create a 0-filled buffer to use to store the 
result of the component convolutions using Buffer2D processingBuffer = this.Configuration.MemoryAllocator.Allocate2D(source.Size(), AllocationOptions.Clean); @@ -92,14 +103,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // Perform the 1D convolutions on all the kernel components and accumulate the results this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer); - float inverseGamma = 1 / this.gamma; - // Apply the inverse gamma exposure pass, and write the final pixel data - var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma); - ParallelRowIterator.IterateRows( - this.Configuration, - sourceRectangle, - in operation); + if (this.gamma == 3F) + { + var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in operation); + } + else + { + var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in operation); + } } /// @@ -286,6 +306,56 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution } } + /// + /// A implementing the 3F gamma exposure logic for . 
+ /// + private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation + { + private readonly Rectangle bounds; + private readonly Buffer2D targetPixels; + private readonly Configuration configuration; + + [MethodImpl(InliningOptions.ShortMethod)] + public ApplyGamma3ExposureRowOperation( + Rectangle bounds, + Buffer2D targetPixels, + Configuration configuration) + { + this.bounds = bounds; + this.targetPixels = targetPixels; + this.configuration = configuration; + } + + /// + [MethodImpl(InliningOptions.ShortMethod)] + public void Invoke(int y, Span span) + { + Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); + PixelOperations.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply); + ref Vector4 baseRef = ref MemoryMarshal.GetReference(span); + + for (int x = 0; x < this.bounds.Width; x++) + { + ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); + Vector4 v = pixel4; + float a = v.W; + + // Fast path for the default gamma exposure, which is 3. In this case we can skip + // calling Math.Pow 3 times (one per component), as the method is an internal call and + // introduces quite a bit of overhead. Instead, we can just manually multiply the whole + // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it + // back to the target index in the temporary span. The whole iteration will get completely + // inlined and translated into vectorized instructions, with much better performance. + v = v * v * v; + v.W = a; + + pixel4 = v; + } + + PixelOperations.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan); + } + } + /// /// A implementing the inverse gamma exposure logic for . 
/// @@ -335,5 +405,82 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); } } + + /// + /// A implementing the inverse 3F gamma exposure logic for . + /// + private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation + { + private readonly Rectangle bounds; + private readonly Buffer2D targetPixels; + private readonly Buffer2D sourceValues; + private readonly Configuration configuration; + + [MethodImpl(InliningOptions.ShortMethod)] + public ApplyInverseGamma3ExposureRowOperation( + Rectangle bounds, + Buffer2D targetPixels, + Buffer2D sourceValues, + Configuration configuration) + { + this.bounds = bounds; + this.targetPixels = targetPixels; + this.sourceValues = sourceValues; + this.configuration = configuration; + } + + /// + [MethodImpl(InliningOptions.ShortMethod)] + public unsafe void Invoke(int y) + { + Vector4 low = Vector4.Zero; + var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity); + + Span targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); + Span sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X); + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan); + + for (int x = 0; x < this.bounds.Width; x++) + { + ref Vector4 v = ref Unsafe.Add(ref sourceRef, x); + Vector4 clamp = Numerics.Clamp(v, low, high); + + double + x64 = clamp.X, + y64 = clamp.Y, + z64 = clamp.Z; + float a = clamp.W; + + ulong + xl = *(ulong*)&x64, + yl = *(ulong*)&y64, + zl = *(ulong*)&z64; + + // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root + // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. 
For more info on the + // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform + // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set + // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD. + // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values. + xl = 0x2a9f8a7be393b600 + (xl / 3); + yl = 0x2a9f8a7be393b600 + (yl / 3); + zl = 0x2a9f8a7be393b600 + (zl / 3); + + Vector4 y4; + y4.X = (float)*(double*)&xl; + y4.Y = (float)*(double*)&yl; + y4.Z = (float)*(double*)&zl; + y4.W = 0; + + y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); + y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); + y4.W = a; + + v = y4; + } + + PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); + } + } } } From 0903a58e588c6a8c32cfeba88349d26bb8e28558 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 02:54:18 +0100 Subject: [PATCH 14/22] Switch to vectorized clamping --- .../Convolution/BokehBlurProcessor{TPixel}.cs | 27 ++++++++++--------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index c01fc3ba1..02308d3fb 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -333,8 +333,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution Span targetRowSpan = 
this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); PixelOperations.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply); ref Vector4 baseRef = ref MemoryMarshal.GetReference(span); + int length = this.bounds.Width; - for (int x = 0; x < this.bounds.Width; x++) + for (int x = 0; x < length; x++) { ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); Vector4 v = pixel4; @@ -433,23 +434,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution [MethodImpl(InliningOptions.ShortMethod)] public unsafe void Invoke(int y) { - Vector4 low = Vector4.Zero; - var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity); + Span sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X, this.bounds.Width); + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan); + + Numerics.Clamp(MemoryMarshal.Cast(sourceRowSpan), 0, float.PositiveInfinity); Span targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); - Span sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X); - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan); + int length = this.bounds.Width; - for (int x = 0; x < this.bounds.Width; x++) + for (int x = 0; x < length; x++) { ref Vector4 v = ref Unsafe.Add(ref sourceRef, x); - Vector4 clamp = Numerics.Clamp(v, low, high); double - x64 = clamp.X, - y64 = clamp.Y, - z64 = clamp.Z; - float a = clamp.W; + x64 = v.X, + y64 = v.Y, + z64 = v.Z; + float a = v.W; ulong xl = *(ulong*)&x64, @@ -472,8 +473,8 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution y4.Z = (float)*(double*)&zl; y4.W = 0; - y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); - y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); + y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); + y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); y4.W = a; v = y4; From 
3bba7deda18fcdabdc7f87c1c762e60398850579 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 16:14:41 +0100 Subject: [PATCH 15/22] Initial vectorized cube root implementation --- src/ImageSharp/Common/Helpers/Numerics.cs | 126 ++++++++++++++++++ .../Convolution/BokehBlurProcessor{TPixel}.cs | 61 +-------- 2 files changed, 129 insertions(+), 58 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index b2bedb87b..f09530d6b 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -547,5 +547,131 @@ namespace SixLabors.ImageSharp } } } + + /// + /// Calculates the cube pow of all the XYZ channels of the input vectors. + /// + /// The span of vectors + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void CubePowOnXYZ(Span vectors) + { + ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors); + int length = vectors.Length; + + for (int x = 0; x < length; x++) + { + ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); + Vector4 v = pixel4; + float a = v.W; + + // Fast path for the default gamma exposure, which is 3. In this case we can skip + // calling Math.Pow 3 times (one per component), as the method is an internal call and + // introduces quite a bit of overhead. Instead, we can just manually multiply the whole + // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it + // back to the target index in the temporary span. The whole iteration will get completely + // inlined and traslated into vectorized instructions, with much better performance. + v = v * v * v; + v.W = a; + + pixel4 = v; + } + } + + /// + /// Calculates the cube root of all the XYZ channels of the input vectors. 
+ /// + /// The span of vectors + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static unsafe void CubeRootOnXYZ(Span vectors) + { + ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors); + int length = vectors.Length; + +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse41.IsSupported) + { + var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF); + var v128_0x3F8000000 = Vector128.Create(0x3F800000); + var v128_341 = Vector128.Create(341); + var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000)); + var v4_23rds = new Vector4(2 / 3f); + var v4_13rds = new Vector4(1 / 3f); + + for (int x = 0; x < length; x++) + { + ref Vector4 v4 = ref Unsafe.Add(ref vectorsRef, x); + + Vector4 vx = v4; + float a = vx.W; + Vector128 veax = Unsafe.As>(ref vx); + Vector128 vecx = veax; + + // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it + // directly on 32 bit floating point values. What follows is a vectorized implementation of this method: + // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html. + // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here + // using a different succession (the same used below), which should be less unstable due to not having cube pow. 
+ veax = Sse2.And(veax, v128_0x7FFFFFFF); + veax = Sse2.Subtract(veax, v128_0x3F8000000); + veax = Sse2.ShiftRightArithmetic(veax, 10); + veax = Sse41.MultiplyLow(veax, v128_341); + veax = Sse2.Add(veax, v128_0x3F8000000); + veax = Sse2.And(veax, v128_0x7FFFFFFF); + vecx = Sse2.And(vecx, v128_0x80000000); + veax = Sse2.Or(veax, vecx); + + Vector4 y4 = *(Vector4*)&veax; + + y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); + y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); + y4.W = a; + + v4 = y4; + } + + return; + } +#else + for (int x = 0; x < length; x++) + { + ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x); + + double + x64 = v.X, + y64 = v.Y, + z64 = v.Z; + float a = v.W; + + ulong + xl = *(ulong*)&x64, + yl = *(ulong*)&y64, + zl = *(ulong*)&z64; + + // Here we use a trick to compute the starting value x0 for the cube root. This is because doing + // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamme is 3 in this case, + // this means what we actually want is to find the cube root of our clamped values. + // For more info on the constant below, see: + // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. + // Here we perform the same trick on all RGB channels separately to help the CPU execute them in paralle, and + // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit + // register, and use it to accelerate two steps of the Newton approximation using SIMD. 
+ xl = 0x2a9f8a7be393b600 + (xl / 3); + yl = 0x2a9f8a7be393b600 + (yl / 3); + zl = 0x2a9f8a7be393b600 + (zl / 3); + + Vector4 y4; + y4.X = (float)*(double*)&xl; + y4.Y = (float)*(double*)&yl; + y4.Z = (float)*(double*)&zl; + y4.W = 0; + + y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); + y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); + y4.W = a; + + v = y4; + } +#endif + } } } diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index 02308d3fb..a21155e10 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -331,27 +331,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution public void Invoke(int y, Span span) { Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); + PixelOperations.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply); - ref Vector4 baseRef = ref MemoryMarshal.GetReference(span); - int length = this.bounds.Width; - for (int x = 0; x < length; x++) - { - ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); - Vector4 v = pixel4; - float a = v.W; - - // Fast path for the default gamma exposure, which is 3. In this case we can skip - // calling Math.Pow 3 times (one per component), as the method is an internal call and - // introduces quite a bit of overhead. Instead, we can just manually multiply the whole - // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it - // back to the target index in the temporary span. The whole iteration will get completely - // inlined and traslated into vectorized instructions, with much better performance. 
- v = v * v * v; - v.W = a; - - pixel4 = v; - } + Numerics.CubePowOnXYZ(span); PixelOperations.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan); } @@ -438,47 +421,9 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan); Numerics.Clamp(MemoryMarshal.Cast(sourceRowSpan), 0, float.PositiveInfinity); + Numerics.CubeRootOnXYZ(sourceRowSpan); Span targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); - int length = this.bounds.Width; - - for (int x = 0; x < length; x++) - { - ref Vector4 v = ref Unsafe.Add(ref sourceRef, x); - - double - x64 = v.X, - y64 = v.Y, - z64 = v.Z; - float a = v.W; - - ulong - xl = *(ulong*)&x64, - yl = *(ulong*)&y64, - zl = *(ulong*)&z64; - - // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root - // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the - // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform - // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set - // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD. - // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values. 
- xl = 0x2a9f8a7be393b600 + (xl / 3); - yl = 0x2a9f8a7be393b600 + (yl / 3); - zl = 0x2a9f8a7be393b600 + (zl / 3); - - Vector4 y4; - y4.X = (float)*(double*)&xl; - y4.Y = (float)*(double*)&yl; - y4.Z = (float)*(double*)&zl; - y4.W = 0; - - y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); - y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); - y4.W = a; - - v = y4; - } PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); } From 392afeadeff989c9c2ae40a4a380f8797413e030 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 16:30:14 +0100 Subject: [PATCH 16/22] Fix vectorized cube root on x86-64 with no SSE41 --- src/ImageSharp/Common/Helpers/Numerics.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index f09530d6b..115cebef2 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -631,7 +631,9 @@ namespace SixLabors.ImageSharp return; } -#else +#endif + + // Fallback with scalar preprocessing and vectorized approximation steps for (int x = 0; x < length; x++) { ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x); @@ -671,7 +673,6 @@ namespace SixLabors.ImageSharp v = y4; } -#endif } } } From 76e704d4616fe51e47e3eefd19f38a97159c50d4 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 16:49:18 +0100 Subject: [PATCH 17/22] Minor codegen tweaks --- src/ImageSharp/Common/Helpers/Numerics.cs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 115cebef2..55718e724 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -584,12 +584,12 @@ namespace SixLabors.ImageSharp 
[MethodImpl(MethodImplOptions.AggressiveInlining)] public static unsafe void CubeRootOnXYZ(Span vectors) { - ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors); - int length = vectors.Length; - #if SUPPORTS_RUNTIME_INTRINSICS if (Sse41.IsSupported) { + ref Vector4 vectors4Ref = ref MemoryMarshal.GetReference(vectors); + ref Vector4 vectors4End = ref Unsafe.Add(ref vectors4Ref, vectors.Length); + var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF); var v128_0x3F8000000 = Vector128.Create(0x3F800000); var v128_341 = Vector128.Create(341); @@ -597,11 +597,9 @@ namespace SixLabors.ImageSharp var v4_23rds = new Vector4(2 / 3f); var v4_13rds = new Vector4(1 / 3f); - for (int x = 0; x < length; x++) + while (Unsafe.IsAddressLessThan(ref vectors4Ref, ref vectors4End)) { - ref Vector4 v4 = ref Unsafe.Add(ref vectorsRef, x); - - Vector4 vx = v4; + Vector4 vx = vectors4Ref; float a = vx.W; Vector128 veax = Unsafe.As>(ref vx); Vector128 vecx = veax; @@ -626,12 +624,15 @@ namespace SixLabors.ImageSharp y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); y4.W = a; - v4 = y4; + vectors4Ref = y4; + vectors4Ref = ref Unsafe.Add(ref vectors4Ref, 1); } return; } #endif + ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors); + int length = vectors.Length; // Fallback with scalar preprocessing and vectorized approximation steps for (int x = 0; x < length; x++) From b383dd4496ade8d1b0d6c90b00590d228c8642f4 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 14 Dec 2020 18:21:55 +0000 Subject: [PATCH 18/22] Add discontigous buffers and intrinsics tests --- .../Processors/Convolution/BokehBlurTest.cs | 44 +++++++++++------ .../FeatureTesting/FeatureTestRunner.cs | 47 +++++++++++++++++++ 2 files changed, 76 insertions(+), 15 deletions(-) diff --git a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs index 666fbdd93..dbf59a29b 100644 --- 
a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs +++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs @@ -6,7 +6,6 @@ using System.Collections.Generic; using System.Globalization; using System.Linq; using System.Text.RegularExpressions; -using Microsoft.DotNet.RemoteExecutor; using SixLabors.ImageSharp.Advanced; using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.Processing; @@ -44,9 +43,8 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution [InlineData(20, 4, -10f)] [InlineData(20, 4, 0f)] public void VerifyBokehBlurProcessorArguments_Fail(int radius, int components, float gamma) - { - Assert.Throws(() => new BokehBlurProcessor(radius, components, gamma)); - } + => Assert.Throws( + () => new BokehBlurProcessor(radius, components, gamma)); [Fact] public void VerifyComplexComponents() @@ -137,12 +135,10 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution [WithTestPatternImages(nameof(BokehBlurValues), 30, 20, PixelTypes.Rgba32)] public void BokehBlurFilterProcessor(TestImageProvider provider, BokehBlurInfo value) where TPixel : unmanaged, IPixel - { - provider.RunValidatingProcessorTest( + => provider.RunValidatingProcessorTest( x => x.BokehBlur(value.Radius, value.Components, value.Gamma), testOutputDetails: value.ToString(), appendPixelTypeToFileName: false); - } [Theory] /* @@ -152,18 +148,23 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution [WithTestPatternImages(200, 200, PixelTypes.Bgr24 | PixelTypes.Bgra32)] public void BokehBlurFilterProcessor_WorksWithAllPixelTypes(TestImageProvider provider) where TPixel : unmanaged, IPixel - { - provider.RunValidatingProcessorTest( - x => x.BokehBlur(8, 2, 3), - appendSourceFileOrDescription: false); - } + => provider.RunValidatingProcessorTest( + x => x.BokehBlur(8, 2, 3), + appendSourceFileOrDescription: false); [Theory] [WithFileCollection(nameof(TestFiles), nameof(BokehBlurValues), 
PixelTypes.Rgba32)] - public void BokehBlurFilterProcessor_Bounded(TestImageProvider provider, BokehBlurInfo value) - where TPixel : unmanaged, IPixel + public void BokehBlurFilterProcessor_Bounded(TestImageProvider provider, BokehBlurInfo value) { - provider.RunValidatingProcessorTest( + static void RunTest(string arg1, string arg2) + { + TestImageProvider provider = + FeatureTestRunner.DeserializeForXunit>(arg1); + + BokehBlurInfo value = + FeatureTestRunner.DeserializeForXunit(arg2); + + provider.RunValidatingProcessorTest( x => { Size size = x.GetCurrentSize(); @@ -172,6 +173,19 @@ namespace SixLabors.ImageSharp.Tests.Processing.Processors.Convolution }, testOutputDetails: value.ToString(), appendPixelTypeToFileName: false); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + HwIntrinsics.DisableSSE41, + provider, + value); } + + [Theory] + [WithTestPatternImages(100, 300, PixelTypes.Bgr24)] + public void WorksWithDiscoBuffers(TestImageProvider provider) + where TPixel : unmanaged, IPixel + => provider.RunBufferCapacityLimitProcessorTest(260, c => c.BokehBlur()); } } diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index 4720ea78a..8c78cc2b3 100644 --- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -211,6 +211,53 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities } } + /// + /// Runs the given test within an environment + /// where the given features. + /// + /// The test action to run. + /// The intrinsics features. + /// The value to pass as a parameter to the test action. + /// The second value to pass as a parameter to the test action. 
+ public static void RunWithHwIntrinsicsFeature( + Action action, + HwIntrinsics intrinsics, + T arg1, + T2 arg2) + where T : IXunitSerializable + where T2 : IXunitSerializable + { + if (!RemoteExecutor.IsSupported) + { + return; + } + + foreach (KeyValuePair intrinsic in intrinsics.ToFeatureKeyValueCollection()) + { + var processStartInfo = new ProcessStartInfo(); + if (intrinsic.Key != HwIntrinsics.AllowAll) + { + processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0"; + + RemoteExecutor.Invoke( + action, + BasicSerializer.Serialize(arg1), + BasicSerializer.Serialize(arg1), + new RemoteInvokeOptions + { + StartInfo = processStartInfo + }) + .Dispose(); + } + else + { + // Since we are running using the default architecture there is no + // point creating the overhead of running the action in a separate process. + action(BasicSerializer.Serialize(arg1), BasicSerializer.Serialize(arg2)); + } + } + } + /// /// Runs the given test within an environment /// where the given features. 
From c9b07964cc17ebd271b9a9b3f9400cb80acba83b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 14 Dec 2020 18:38:02 +0000 Subject: [PATCH 19/22] Fix feature test runner --- .../TestUtilities/FeatureTesting/FeatureTestRunner.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index 8c78cc2b3..fa0f02ca1 100644 --- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -242,7 +242,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities RemoteExecutor.Invoke( action, BasicSerializer.Serialize(arg1), - BasicSerializer.Serialize(arg1), + BasicSerializer.Serialize(arg2), new RemoteInvokeOptions { StartInfo = processStartInfo From df18c4e4c87ef20e2c75c2242d3076d4287f0ed4 Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 20:11:47 +0100 Subject: [PATCH 20/22] Switch to explicit SSE Newton approximations --- src/ImageSharp/Common/Helpers/Numerics.cs | 31 ++++++++++------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 55718e724..e7ae71210 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -587,22 +587,20 @@ namespace SixLabors.ImageSharp #if SUPPORTS_RUNTIME_INTRINSICS if (Sse41.IsSupported) { - ref Vector4 vectors4Ref = ref MemoryMarshal.GetReference(vectors); - ref Vector4 vectors4End = ref Unsafe.Add(ref vectors4Ref, vectors.Length); + ref Vector128 vectors128Ref = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); + ref Vector128 vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length); var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF); var v128_0x3F8000000 = Vector128.Create(0x3F800000); var 
v128_341 = Vector128.Create(341); var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000)); - var v4_23rds = new Vector4(2 / 3f); - var v4_13rds = new Vector4(1 / 3f); + var v4_23rds = Vector128.Create(2 / 3f); + var v4_13rds = Vector128.Create(1 / 3f); - while (Unsafe.IsAddressLessThan(ref vectors4Ref, ref vectors4End)) + while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End)) { - Vector4 vx = vectors4Ref; - float a = vx.W; - Vector128 veax = Unsafe.As>(ref vx); - Vector128 vecx = veax; + Vector128 vecx = vectors128Ref; + Vector128 veax = vecx.AsInt32(); // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it // directly on 32 bit floating point values. What follows is a vectorized implementation of this method: @@ -615,17 +613,16 @@ namespace SixLabors.ImageSharp veax = Sse41.MultiplyLow(veax, v128_341); veax = Sse2.Add(veax, v128_0x3F8000000); veax = Sse2.And(veax, v128_0x7FFFFFFF); - vecx = Sse2.And(vecx, v128_0x80000000); - veax = Sse2.Or(veax, vecx); + veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_0x80000000)); - Vector4 y4 = *(Vector4*)&veax; + Vector128 y4 = veax.AsSingle(); - y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); - y4 = (v4_23rds * y4) + (v4_13rds * (vx / (y4 * y4))); - y4.W = a; + y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Sse41.Insert(y4, vecx, 0xF0); - vectors4Ref = y4; - vectors4Ref = ref Unsafe.Add(ref vectors4Ref, 1); + vectors128Ref = y4; + vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1); } return; From e7cdb0aaab2d1c0c122f8e0cb7618bc5b624befa Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 20:19:47 +0100 Subject: [PATCH 21/22] Add FMA support, more SSE optimizations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Special thanks to @tannergooding for the help 🚀 --- src/ImageSharp/Common/Helpers/Numerics.cs | 33 ++++++++++++++--------- 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index e7ae71210..88b6d83ee 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -590,12 +590,12 @@ namespace SixLabors.ImageSharp ref Vector128 vectors128Ref = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors)); ref Vector128 vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length); - var v128_0x7FFFFFFF = Vector128.Create(0x7FFFFFFF); - var v128_0x3F8000000 = Vector128.Create(0x3F800000); var v128_341 = Vector128.Create(341); - var v128_0x80000000 = Vector128.Create(unchecked((int)0x80000000)); - var v4_23rds = Vector128.Create(2 / 3f); - var v4_13rds = Vector128.Create(1 / 3f); + Vector128 v128_negativeZero = Vector128.Create(-0.0f).AsInt32(); + Vector128 v128_one = Vector128.Create(1.0f).AsInt32(); + + var v128_13rd = Vector128.Create(1 / 3f); + var v128_23rds = Vector128.Create(2 / 3f); while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End)) { @@ -607,18 +607,27 @@ namespace SixLabors.ImageSharp // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html. // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here // using a different succession (the same used below), which should be less unstable due to not having cube pow. 
- veax = Sse2.And(veax, v128_0x7FFFFFFF); - veax = Sse2.Subtract(veax, v128_0x3F8000000); + veax = Sse2.AndNot(v128_negativeZero, veax); + veax = Sse2.Subtract(veax, v128_one); veax = Sse2.ShiftRightArithmetic(veax, 10); veax = Sse41.MultiplyLow(veax, v128_341); - veax = Sse2.Add(veax, v128_0x3F8000000); - veax = Sse2.And(veax, v128_0x7FFFFFFF); - veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_0x80000000)); + veax = Sse2.Add(veax, v128_one); + veax = Sse2.AndNot(v128_negativeZero, veax); + veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero)); Vector128 y4 = veax.AsSingle(); - y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); - y4 = Sse.Add(Sse.Multiply(v4_23rds, y4), Sse.Multiply(v4_13rds, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + if (Fma.IsSupported) + { + y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + } + else + { + y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4)))); + } + y4 = Sse41.Insert(y4, vecx, 0xF0); vectors128Ref = y4; From 80617a060c5647048ff3fe8bd3ad352c067fb4ba Mon Sep 17 00:00:00 2001 From: Sergio Pedri Date: Mon, 14 Dec 2020 20:44:30 +0100 Subject: [PATCH 22/22] Add more codegen improvements --- src/ImageSharp/Common/Helpers/Numerics.cs | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 88b6d83ee..56ab46c68 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -556,12 +556,11 @@ namespace SixLabors.ImageSharp public static unsafe void CubePowOnXYZ(Span vectors) { ref Vector4 
baseRef = ref MemoryMarshal.GetReference(vectors); - int length = vectors.Length; + ref Vector4 endRef = ref Unsafe.Add(ref baseRef, vectors.Length); - for (int x = 0; x < length; x++) + while (Unsafe.IsAddressLessThan(ref baseRef, ref endRef)) { - ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); - Vector4 v = pixel4; + Vector4 v = baseRef; float a = v.W; // Fast path for the default gamma exposure, which is 3. In this case we can skip @@ -573,7 +572,8 @@ namespace SixLabors.ImageSharp v = v * v * v; v.W = a; - pixel4 = v; + baseRef = v; + baseRef = ref Unsafe.Add(ref baseRef, 1); } } @@ -638,12 +638,12 @@ namespace SixLabors.ImageSharp } #endif ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors); - int length = vectors.Length; + ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length); // Fallback with scalar preprocessing and vectorized approximation steps - for (int x = 0; x < length; x++) + while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd)) { - ref Vector4 v = ref Unsafe.Add(ref vectorsRef, x); + Vector4 v = vectorsRef; double x64 = v.X, @@ -678,7 +678,8 @@ namespace SixLabors.ImageSharp y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4))); y4.W = a; - v = y4; + vectorsRef = y4; + vectorsRef = ref Unsafe.Add(ref vectorsRef, 1); } } }