|
|
|
@ -80,11 +80,22 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution |
|
|
|
var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds()); |
|
|
|
|
|
|
|
// Preliminary gamma highlight pass
|
|
|
|
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); |
|
|
|
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>( |
|
|
|
this.Configuration, |
|
|
|
sourceRectangle, |
|
|
|
in gammaOperation); |
|
|
|
if (this.gamma == 3F) |
|
|
|
{ |
|
|
|
var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration); |
|
|
|
ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>( |
|
|
|
this.Configuration, |
|
|
|
sourceRectangle, |
|
|
|
in gammaOperation); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma); |
|
|
|
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>( |
|
|
|
this.Configuration, |
|
|
|
sourceRectangle, |
|
|
|
in gammaOperation); |
|
|
|
} |
|
|
|
|
|
|
|
// Create a 0-filled buffer to use to store the result of the component convolutions
|
|
|
|
using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean); |
|
|
|
@ -92,14 +103,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution |
|
|
|
// Perform the 1D convolutions on all the kernel components and accumulate the results
|
|
|
|
this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer); |
|
|
|
|
|
|
|
float inverseGamma = 1 / this.gamma; |
|
|
|
|
|
|
|
// Apply the inverse gamma exposure pass, and write the final pixel data
|
|
|
|
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma); |
|
|
|
ParallelRowIterator.IterateRows( |
|
|
|
this.Configuration, |
|
|
|
sourceRectangle, |
|
|
|
in operation); |
|
|
|
if (this.gamma == 3F) |
|
|
|
{ |
|
|
|
var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration); |
|
|
|
ParallelRowIterator.IterateRows( |
|
|
|
this.Configuration, |
|
|
|
sourceRectangle, |
|
|
|
in operation); |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma); |
|
|
|
ParallelRowIterator.IterateRows( |
|
|
|
this.Configuration, |
|
|
|
sourceRectangle, |
|
|
|
in operation); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
@ -286,6 +306,56 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
|
|
|
|
/// </summary>
|
|
|
|
private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4> |
|
|
|
{ |
|
|
|
private readonly Rectangle bounds; |
|
|
|
private readonly Buffer2D<TPixel> targetPixels; |
|
|
|
private readonly Configuration configuration; |
|
|
|
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public ApplyGamma3ExposureRowOperation( |
|
|
|
Rectangle bounds, |
|
|
|
Buffer2D<TPixel> targetPixels, |
|
|
|
Configuration configuration) |
|
|
|
{ |
|
|
|
this.bounds = bounds; |
|
|
|
this.targetPixels = targetPixels; |
|
|
|
this.configuration = configuration; |
|
|
|
} |
|
|
|
|
|
|
|
/// <inheritdoc/>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public void Invoke(int y, Span<Vector4> span) |
|
|
|
{ |
|
|
|
Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); |
|
|
|
PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply); |
|
|
|
ref Vector4 baseRef = ref MemoryMarshal.GetReference(span); |
|
|
|
|
|
|
|
for (int x = 0; x < this.bounds.Width; x++) |
|
|
|
{ |
|
|
|
ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x); |
|
|
|
Vector4 v = pixel4; |
|
|
|
float a = v.W; |
|
|
|
|
|
|
|
// Fast path for the default gamma exposure, which is 3. In this case we can skip
|
|
|
|
// calling Math.Pow 3 times (one per component), as the method is an internal call and
|
|
|
|
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
|
|
|
|
// pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
|
|
|
|
// back to the target index in the temporary span. The whole iteration will get completely
|
|
|
|
// inlined and traslated into vectorized instructions, with much better performance.
|
|
|
|
v = v * v * v; |
|
|
|
v.W = a; |
|
|
|
|
|
|
|
pixel4 = v; |
|
|
|
} |
|
|
|
|
|
|
|
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
|
|
|
|
/// </summary>
|
|
|
|
@ -335,5 +405,82 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution |
|
|
|
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
|
|
|
|
/// </summary>
|
|
|
|
private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation |
|
|
|
{ |
|
|
|
private readonly Rectangle bounds; |
|
|
|
private readonly Buffer2D<TPixel> targetPixels; |
|
|
|
private readonly Buffer2D<Vector4> sourceValues; |
|
|
|
private readonly Configuration configuration; |
|
|
|
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public ApplyInverseGamma3ExposureRowOperation( |
|
|
|
Rectangle bounds, |
|
|
|
Buffer2D<TPixel> targetPixels, |
|
|
|
Buffer2D<Vector4> sourceValues, |
|
|
|
Configuration configuration) |
|
|
|
{ |
|
|
|
this.bounds = bounds; |
|
|
|
this.targetPixels = targetPixels; |
|
|
|
this.sourceValues = sourceValues; |
|
|
|
this.configuration = configuration; |
|
|
|
} |
|
|
|
|
|
|
|
/// <inheritdoc/>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public unsafe void Invoke(int y) |
|
|
|
{ |
|
|
|
Vector4 low = Vector4.Zero; |
|
|
|
var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity); |
|
|
|
|
|
|
|
Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X); |
|
|
|
Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X); |
|
|
|
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan); |
|
|
|
|
|
|
|
for (int x = 0; x < this.bounds.Width; x++) |
|
|
|
{ |
|
|
|
ref Vector4 v = ref Unsafe.Add(ref sourceRef, x); |
|
|
|
Vector4 clamp = Numerics.Clamp(v, low, high); |
|
|
|
|
|
|
|
double |
|
|
|
x64 = clamp.X, |
|
|
|
y64 = clamp.Y, |
|
|
|
z64 = clamp.Z; |
|
|
|
float a = clamp.W; |
|
|
|
|
|
|
|
ulong |
|
|
|
xl = *(ulong*)&x64, |
|
|
|
yl = *(ulong*)&y64, |
|
|
|
zl = *(ulong*)&z64; |
|
|
|
|
|
|
|
// Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
|
|
|
|
// of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
|
|
|
|
// constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
|
|
|
|
// the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
|
|
|
|
// these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
|
|
|
|
// As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
|
|
|
|
xl = 0x2a9f8a7be393b600 + (xl / 3); |
|
|
|
yl = 0x2a9f8a7be393b600 + (yl / 3); |
|
|
|
zl = 0x2a9f8a7be393b600 + (zl / 3); |
|
|
|
|
|
|
|
Vector4 y4; |
|
|
|
y4.X = (float)*(double*)&xl; |
|
|
|
y4.Y = (float)*(double*)&yl; |
|
|
|
y4.Z = (float)*(double*)&zl; |
|
|
|
y4.W = 0; |
|
|
|
|
|
|
|
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); |
|
|
|
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4))); |
|
|
|
y4.W = a; |
|
|
|
|
|
|
|
v = y4; |
|
|
|
} |
|
|
|
|
|
|
|
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|