diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs index dda384390f..c01fc3ba18 100644 --- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs +++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs @@ -80,11 +80,22 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds()); // Preliminary gamma highlight pass - var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); - ParallelRowIterator.IterateRows( - this.Configuration, - sourceRectangle, - in gammaOperation); + if (this.gamma == 3F) + { + var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in gammaOperation); + } + else + { + var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in gammaOperation); + } // Create a 0-filled buffer to use to store the result of the component convolutions using Buffer2D processingBuffer = this.Configuration.MemoryAllocator.Allocate2D(source.Size(), AllocationOptions.Clean); @@ -92,14 +103,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution // Perform the 1D convolutions on all the kernel components and accumulate the results this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer); - float inverseGamma = 1 / this.gamma; - // Apply the inverse gamma exposure pass, and write the final pixel data - var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma); - ParallelRowIterator.IterateRows( - this.Configuration, - sourceRectangle, - in operation); + if (this.gamma == 3F) + { + var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in operation); + } + else + { + var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma); + ParallelRowIterator.IterateRows( + this.Configuration, + sourceRectangle, + in operation); + } } /// @@ -286,6 +306,56 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution } } + /// + /// A implementing the 3F gamma exposure logic for . + /// + private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation + { + private readonly Rectangle bounds; + private readonly Buffer2D targetPixels; + private readonly Configuration configuration; + + [MethodImpl(InliningOptions.ShortMethod)] + public ApplyGamma3ExposureRowOperation( + Rectangle bounds, + Buffer2D targetPixels, + Configuration configuration) + { + this.bounds = bounds; + this.targetPixels = targetPixels; + this.configuration = configuration; + } + + /// + [MethodImpl(InliningOptions.ShortMethod)] + public void Invoke(int y, Span span) + { + Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); + PixelOperations.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply); + ref Vector4 baseRef = ref MemoryMarshal.GetReference(span); + + for (int x = 0; x < this.bounds.Width; x++) + { + ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); + Vector4 v = pixel4; + float a = v.W; + + // Fast path for the default gamma exposure, which is 3. In this case we can skip + // calling Math.Pow 3 times (one per component), as the method is an internal call and + // introduces quite a bit of overhead. Instead, we can just manually multiply the whole + // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it + // back to the target index in the temporary span. The whole iteration will get completely + // inlined and traslated into vectorized instructions, with much better performance. + v = v * v * v; + v.W = a; + + pixel4 = v; + } + + PixelOperations.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan); + } + } + /// /// A implementing the inverse gamma exposure logic for . /// @@ -335,5 +405,82 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); } } + + /// + /// A implementing the inverse 3F gamma exposure logic for . + /// + private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation + { + private readonly Rectangle bounds; + private readonly Buffer2D targetPixels; + private readonly Buffer2D sourceValues; + private readonly Configuration configuration; + + [MethodImpl(InliningOptions.ShortMethod)] + public ApplyInverseGamma3ExposureRowOperation( + Rectangle bounds, + Buffer2D targetPixels, + Buffer2D sourceValues, + Configuration configuration) + { + this.bounds = bounds; + this.targetPixels = targetPixels; + this.sourceValues = sourceValues; + this.configuration = configuration; + } + + /// + [MethodImpl(InliningOptions.ShortMethod)] + public unsafe void Invoke(int y) + { + Vector4 low = Vector4.Zero; + var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity); + + Span targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); + Span sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X); + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan); + + for (int x = 0; x < this.bounds.Width; x++) + { + ref Vector4 v = ref Unsafe.Add(ref sourceRef, x); + Vector4 clamp = Numerics.Clamp(v, low, high); + + double + x64 = clamp.X, + y64 = clamp.Y, + z64 = clamp.Z; + float a = clamp.W; + + ulong + xl = *(ulong*)&x64, + yl = *(ulong*)&y64, + zl = *(ulong*)&z64; + + // Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root + // of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the + // constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform + // the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set + // these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD. + // As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values. + xl = 0x2a9f8a7be393b600 + (xl / 3); + yl = 0x2a9f8a7be393b600 + (yl / 3); + zl = 0x2a9f8a7be393b600 + (zl / 3); + + Vector4 y4; + y4.X = (float)*(double*)&xl; + y4.Y = (float)*(double*)&yl; + y4.Z = (float)*(double*)&zl; + y4.W = 0; + + y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); + y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); + y4.W = a; + + v = y4; + } + + PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); + } + } } }