Browse Source

Add optimized paths for default gamma exposure

js/color-alpha-handling
Sergio Pedri 5 years ago
parent
commit
8292407ae2
  1. 171
      src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs

171
src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs

@ -80,11 +80,22 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
// Preliminary gamma highlight pass
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
this.Configuration,
sourceRectangle,
in gammaOperation);
if (this.gamma == 3F)
{
var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
ParallelRowIterator.IterateRows<ApplyGamma3ExposureRowOperation, Vector4>(
this.Configuration,
sourceRectangle,
in gammaOperation);
}
else
{
var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
ParallelRowIterator.IterateRows<ApplyGammaExposureRowOperation, Vector4>(
this.Configuration,
sourceRectangle,
in gammaOperation);
}
// Create a 0-filled buffer to use to store the result of the component convolutions
using Buffer2D<Vector4> processingBuffer = this.Configuration.MemoryAllocator.Allocate2D<Vector4>(source.Size(), AllocationOptions.Clean);
@ -92,14 +103,23 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
// Perform the 1D convolutions on all the kernel components and accumulate the results
this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);
float inverseGamma = 1 / this.gamma;
// Apply the inverse gamma exposure pass, and write the final pixel data
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
ParallelRowIterator.IterateRows(
this.Configuration,
sourceRectangle,
in operation);
if (this.gamma == 3F)
{
var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
ParallelRowIterator.IterateRows(
this.Configuration,
sourceRectangle,
in operation);
}
else
{
var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
ParallelRowIterator.IterateRows(
this.Configuration,
sourceRectangle,
in operation);
}
}
/// <summary>
@ -286,6 +306,56 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
}
}
/// <summary>
/// A <see langword="struct"/> implementing the 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation<Vector4>
{
private readonly Rectangle bounds;
private readonly Buffer2D<TPixel> targetPixels;
private readonly Configuration configuration;
[MethodImpl(InliningOptions.ShortMethod)]
public ApplyGamma3ExposureRowOperation(
Rectangle bounds,
Buffer2D<TPixel> targetPixels,
Configuration configuration)
{
this.bounds = bounds;
this.targetPixels = targetPixels;
this.configuration = configuration;
}
/// <inheritdoc/>
[MethodImpl(InliningOptions.ShortMethod)]
public void Invoke(int y, Span<Vector4> span)
{
Span<TPixel> targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
PixelOperations<TPixel>.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
ref Vector4 baseRef = ref MemoryMarshal.GetReference(span);
for (int x = 0; x < this.bounds.Width; x++)
{
ref Vector4 pixel4 = ref Unsafe.Add(ref baseRef, x);
Vector4 v = pixel4;
float a = v.W;
// Fast path for the default gamma exposure, which is 3. In this case we can skip
// calling Math.Pow 3 times (one per component), as the method is an internal call and
// introduces quite a bit of overhead. Instead, we can just manually multiply the whole
// pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
// back to the target index in the temporary span. The whole iteration will get completely
// inlined and traslated into vectorized instructions, with much better performance.
v = v * v * v;
v.W = a;
pixel4 = v;
}
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
}
}
/// <summary>
/// A <see langword="struct"/> implementing the inverse gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
@ -335,5 +405,82 @@ namespace SixLabors.ImageSharp.Processing.Processors.Convolution
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
}
}
/// <summary>
/// A <see langword="struct"/> implementing the inverse 3F gamma exposure logic for <see cref="BokehBlurProcessor{T}"/>.
/// </summary>
private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
{
private readonly Rectangle bounds;
private readonly Buffer2D<TPixel> targetPixels;
private readonly Buffer2D<Vector4> sourceValues;
private readonly Configuration configuration;
[MethodImpl(InliningOptions.ShortMethod)]
public ApplyInverseGamma3ExposureRowOperation(
Rectangle bounds,
Buffer2D<TPixel> targetPixels,
Buffer2D<Vector4> sourceValues,
Configuration configuration)
{
this.bounds = bounds;
this.targetPixels = targetPixels;
this.sourceValues = sourceValues;
this.configuration = configuration;
}
/// <inheritdoc/>
[MethodImpl(InliningOptions.ShortMethod)]
public unsafe void Invoke(int y)
{
Vector4 low = Vector4.Zero;
var high = new Vector4(float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity, float.PositiveInfinity);
Span<TPixel> targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
Span<Vector4> sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X);
ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
for (int x = 0; x < this.bounds.Width; x++)
{
ref Vector4 v = ref Unsafe.Add(ref sourceRef, x);
Vector4 clamp = Numerics.Clamp(v, low, high);
double
x64 = clamp.X,
y64 = clamp.Y,
z64 = clamp.Z;
float a = clamp.W;
ulong
xl = *(ulong*)&x64,
yl = *(ulong*)&y64,
zl = *(ulong*)&z64;
// Here we use a trick to compute the starting value x0 for the cube root. This is because doing pow(x, 1 / gamma) is the same as the gamma-th root
// of x, and since gamme is 3 in this case, this means what we actually want is to find the cube root of our clamped values. For more info on the
// constant below, see https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543. Here we perform
// the same trick on all RGB channels separately to help the CPU execute them in paralle, and store the alpha channel to preserve it. Then we set
// these values to the fields of a temporary 128-bit register, and use it to accelerate two steps of the Newton approximation using SIMD.
// As a note for possible future improvements, we should come up with a good bitmask to perform the x0 approximation directly on float values.
xl = 0x2a9f8a7be393b600 + (xl / 3);
yl = 0x2a9f8a7be393b600 + (yl / 3);
zl = 0x2a9f8a7be393b600 + (zl / 3);
Vector4 y4;
y4.X = (float)*(double*)&xl;
y4.Y = (float)*(double*)&yl;
y4.Z = (float)*(double*)&zl;
y4.W = 0;
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
y4 = (2 / 3f * y4) + (1 / 3f * (clamp / (y4 * y4)));
y4.W = a;
v = y4;
}
PixelOperations<TPixel>.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
}
}
}
}

Loading…
Cancel
Save