Browse Source

Reimplement @Sergio0694 work.

pull/2793/head
James Jackson-South 2 years ago
parent
commit
cd1b77a88f
  1. 47
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 38
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  3. 38
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  4. 107
      src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs
  5. 2
      src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs
  6. 74
      src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs
  7. 36
      tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs
  8. 26
      tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs

47
src/ImageSharp/Common/Helpers/Numerics.cs

@ -1097,4 +1097,51 @@ internal static class Numerics
public static nuint Vector512Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector512<TVector>.Count;
/// <summary>
/// Divides every element of <paramref name="span"/> by <paramref name="sum"/>, in place.
/// </summary>
/// <param name="span">The <see cref="float"/> values to scale. The contents are overwritten.</param>
/// <param name="sum">The divisor applied to each element of <paramref name="span"/>.</param>
/// <remarks>
/// When <see cref="Vector256.IsHardwareAccelerated"/> is true the bulk of the data is processed
/// eight lanes at a time, followed by at most one four-lane step; any remaining elements
/// (and the whole span when acceleration is unavailable) are divided one by one.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Normalize(Span<float> span, float sum)
{
    ref float origin = ref MemoryMarshal.GetReference(span);
    int count = span.Length;
    nuint offset = 0;

    if (Vector256.IsHardwareAccelerated)
    {
        Vector256<float> divisor = Vector256.Create(sum);

        // Largest multiple of 8 that fits in the span.
        nuint wideEnd = (nuint)(uint)(count & ~7);
        for (; offset < wideEnd; offset += (nuint)8)
        {
            ref Vector256<float> lanes = ref Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref origin, offset));
            lanes /= divisor;
        }

        // Between 4 and 7 elements are left over: take four of them in a single 128-bit step.
        if ((count & 7) >= 4)
        {
            ref Vector128<float> lanes = ref Unsafe.As<float, Vector128<float>>(ref Unsafe.Add(ref origin, offset));
            lanes /= divisor.GetLower();
            offset += (nuint)4;
        }
    }

    // Scalar tail: 0-3 elements after the vector steps, or every element when not accelerated.
    nuint total = (nuint)(uint)count;
    for (; offset < total; offset++)
    {
        Unsafe.Add(ref origin, offset) /= sum;
    }
}
}

38
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -245,6 +245,44 @@ internal static class Vector128Utilities
return default;
}
/// <summary>
/// Computes <c>(a * b) + c</c> lane by lane over three vectors of single-precision values.
/// The fused hardware instruction is used when <see cref="Fma"/> is supported; otherwise
/// a separate multiplication and addition are performed.
/// </summary>
/// <param name="a">The first multiplicand.</param>
/// <param name="b">The second multiplicand.</param>
/// <param name="c">The addend combined with the product of <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector128{Single}"/> whose elements are the products of the matching elements of
/// <paramref name="a"/> and <paramref name="b"/> plus the matching element of <paramref name="c"/>.
/// </returns>
/// <remarks>
/// The fused path calls <see cref="Fma.MultiplyAdd(Vector128{float}, Vector128{float}, Vector128{float})"/>.
/// Because floating-point rounding is handled differently from a separate multiply followed by an add,
/// the two paths can produce slightly different results.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> MultiplyAdd(Vector128<float> a, Vector128<float> b, Vector128<float> c)
    => Fma.IsSupported
        ? Fma.MultiplyAdd(a, b, c)
        : (a * b) + c;
/// <summary>
/// Throws an <see cref="UnreachableException"/>. This method never returns.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

38
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -110,6 +110,44 @@ internal static class Vector256Utilities
return Vector256.ConvertToInt32(val_2p23_f32 | sign);
}
/// <summary>
/// Computes <c>(a * b) + c</c> lane by lane over three vectors of single-precision values.
/// The fused hardware instruction is used when <see cref="Fma"/> is supported; otherwise
/// a separate multiplication and addition are performed.
/// </summary>
/// <param name="a">The first multiplicand.</param>
/// <param name="b">The second multiplicand.</param>
/// <param name="c">The addend combined with the product of <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector256{Single}"/> whose elements are the products of the matching elements of
/// <paramref name="a"/> and <paramref name="b"/> plus the matching element of <paramref name="c"/>.
/// </returns>
/// <remarks>
/// The fused path calls <see cref="Fma.MultiplyAdd(Vector256{float}, Vector256{float}, Vector256{float})"/>.
/// Because floating-point rounding is handled differently from a separate multiply followed by an add,
/// the two paths can produce slightly different results.
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> MultiplyAdd(Vector256<float> a, Vector256<float> b, Vector256<float> c)
    => Fma.IsSupported
        ? Fma.MultiplyAdd(a, b, c)
        : (a * b) + c;
/// <summary>
/// Throws an <see cref="UnreachableException"/>. This method never returns.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

107
src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

@ -5,7 +5,7 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
@ -14,6 +14,10 @@ namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
/// </summary>
internal readonly unsafe struct ResizeKernel
{
/// <summary>
/// The buffer with the convolution factors.
/// Note that when <see cref="Vector256.IsHardwareAccelerated"/> is true, this is of size 4x that reported in <see cref="Length"/>.
/// </summary>
private readonly float* bufferPtr;
/// <summary>
@ -53,7 +57,15 @@ internal readonly unsafe struct ResizeKernel
public Span<float> Values
{
[MethodImpl(InliningOptions.ShortMethod)]
get => new(this.bufferPtr, this.Length);
get
{
if (Vector256.IsHardwareAccelerated)
{
return new(this.bufferPtr, this.Length * 4);
}
return new(this.bufferPtr, this.Length);
}
}
/// <summary>
@ -68,70 +80,42 @@ internal readonly unsafe struct ResizeKernel
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
if (Avx2.IsSupported && Fma.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
float* bufferStart = this.bufferPtr;
float* bufferEnd = bufferStart + (this.Length & ~3);
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
Vector256<float> result256_0 = Vector256<float>.Zero;
Vector256<float> result256_1 = Vector256<float>.Zero;
ReadOnlySpan<byte> maskBytes = new byte[]
{
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
1, 0, 0, 0, 1, 0, 0, 0,
};
Vector256<int> mask = Unsafe.ReadUnaligned<Vector256<int>>(ref MemoryMarshal.GetReference(maskBytes));
while (bufferStart < bufferEnd)
while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
{
// It is important to use a single expression here so that the JIT will correctly use vfmadd231ps
// for the FMA operation, and execute it directly on the target register and reading directly from
// memory for the first parameter. This skips initializing a SIMD register, and an extra copy.
// The code below should compile in the following assembly on .NET 5 x64:
//
// vmovsd xmm2, [rax] ; load *(double*)bufferStart into xmm2 as [ab, _]
// vpermps ymm2, ymm1, ymm2 ; permute as a float YMM register to [a, a, a, a, b, b, b, b]
// vfmadd231ps ymm0, ymm2, [r8] ; result256_0 = FMA(pixels, factors) + result256_0
//
// For tracking the codegen issue with FMA, see: https://github.com/dotnet/runtime/issues/12212.
// Additionally, we're also unrolling two computations per each loop iterations to leverage the
// fact that most CPUs have two ports to schedule multiply operations for FMA instructions.
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);
result256_1 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, 2)),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)(bufferStart + 2)).AsSingle(), mask),
result256_1);
bufferStart += 4;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 4);
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));
result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0);
result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);
bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
}
result256_0 = Avx.Add(result256_0, result256_1);
result256_0 += result256_1;
if ((this.Length & 3) >= 2)
{
result256_0 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef),
Avx2.PermuteVar8x32(Vector256.CreateScalarUnsafe(*(double*)bufferStart).AsSingle(), mask),
result256_0);
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0);
bufferStart += 2;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 2);
bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
}
Vector128<float> result128 = Sse.Add(result256_0.GetLower(), result256_0.GetUpper());
Vector128<float> result128 = result256_0.GetLower() + result256_0.GetUpper();
if ((this.Length & 1) != 0)
{
result128 = Fma.MultiplyAdd(
Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef),
Vector128.Create(*bufferStart),
result128);
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128);
}
return *(Vector4*)&result128;
@ -149,7 +133,7 @@ internal readonly unsafe struct ResizeKernel
result += rowStartRef * *bufferStart;
bufferStart++;
rowStartRef = ref Unsafe.Add(ref rowStartRef, 1);
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)1);
}
return result;
@ -164,13 +148,30 @@ internal readonly unsafe struct ResizeKernel
internal ResizeKernel AlterLeftValue(int left)
=> new(left, this.bufferPtr, this.Length);
internal void Fill(Span<double> values)
internal void FillOrCopyAndExpand(Span<float> values)
{
DebugGuard.IsTrue(values.Length == this.Length, nameof(values), "ResizeKernel.Fill: values.Length != this.Length!");
for (int i = 0; i < this.Length; i++)
if (Vector256.IsHardwareAccelerated)
{
this.Values[i] = (float)values[i];
Vector4* bufferStart = (Vector4*)this.bufferPtr;
ref float valuesStart = ref MemoryMarshal.GetReference(values);
ref float valuesEnd = ref Unsafe.Add(ref valuesStart, values.Length);
while (Unsafe.IsAddressLessThan(ref valuesStart, ref valuesEnd))
{
*bufferStart = new Vector4(valuesStart);
bufferStart++;
valuesStart = ref Unsafe.Add(ref valuesStart, (nuint)1);
}
}
else
{
for (int i = 0; i < this.Length; i++)
{
this.Values[i] = (float)values[i];
}
}
}
}

2
src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.PeriodicKernelMap.cs

@ -54,7 +54,7 @@ internal partial class ResizeKernelMap
int bottomStartDest = this.DestinationLength - this.cornerInterval;
for (int i = startOfFirstRepeatedMosaic; i < bottomStartDest; i++)
{
double center = ((i + .5) * this.ratio) - .5;
float center = (float)(((i + .5) * this.ratio) - .5);
int left = (int)TolerantMath.Ceiling(center - this.radius);
ResizeKernel kernel = this.kernels[i - this.period];
this.kernels[i] = kernel.AlterLeftValue(left);

74
src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernelMap.cs

@ -5,6 +5,7 @@ using System.Buffers;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Memory;
namespace SixLabors.ImageSharp.Processing.Processors.Transforms;
@ -33,7 +34,7 @@ internal partial class ResizeKernelMap : IDisposable
private bool isDisposed;
// To avoid both GC allocations, and MemoryAllocator ceremony:
private readonly double[] tempValues;
private readonly float[] tempValues;
private ResizeKernelMap(
MemoryAllocator memoryAllocator,
@ -50,10 +51,19 @@ internal partial class ResizeKernelMap : IDisposable
this.sourceLength = sourceLength;
this.DestinationLength = destinationLength;
this.MaxDiameter = (radius * 2) + 1;
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true, AllocationOptions.Clean);
if (Vector256.IsHardwareAccelerated)
{
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter * 4, bufferHeight, preferContiguosImageBuffers: true);
}
else
{
this.data = memoryAllocator.Allocate2D<float>(this.MaxDiameter, bufferHeight, preferContiguosImageBuffers: true);
}
this.pinHandle = this.data.DangerousGetSingleMemory().Pin();
this.kernels = new ResizeKernel[destinationLength];
this.tempValues = new double[this.MaxDiameter];
this.tempValues = new float[this.MaxDiameter];
}
/// <summary>
@ -155,23 +165,23 @@ internal partial class ResizeKernelMap : IDisposable
bool hasAtLeast2Periods = 2 * (cornerInterval + period) < destinationSize;
ResizeKernelMap result = hasAtLeast2Periods
? new PeriodicKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
ratio,
scale,
radius,
period,
cornerInterval)
: new ResizeKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
destinationSize,
ratio,
scale,
radius);
? new PeriodicKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
ratio,
scale,
radius,
period,
cornerInterval)
: new ResizeKernelMap(
memoryAllocator,
sourceSize,
destinationSize,
destinationSize,
ratio,
scale,
radius);
result.Initialize(in sampler);
@ -198,7 +208,8 @@ internal partial class ResizeKernelMap : IDisposable
private ResizeKernel BuildKernel<TResampler>(in TResampler sampler, int destRowIndex, int dataRowIndex)
where TResampler : struct, IResampler
{
double center = ((destRowIndex + .5) * this.ratio) - .5;
float center = (float)(((destRowIndex + .5) * this.ratio) - .5);
float scale = (float)this.scale;
// Keep inside bounds.
int left = (int)TolerantMath.Ceiling(center - this.radius);
@ -214,30 +225,25 @@ internal partial class ResizeKernelMap : IDisposable
}
ResizeKernel kernel = this.CreateKernel(dataRowIndex, left, right);
Span<double> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
double sum = 0;
Span<float> kernelValues = this.tempValues.AsSpan(0, kernel.Length);
ref float kernelStart = ref MemoryMarshal.GetReference(kernelValues);
float sum = 0;
for (int j = left; j <= right; j++)
{
double value = sampler.GetValue((float)((j - center) / this.scale));
float value = sampler.GetValue((j - center) / scale);
sum += value;
kernelValues[j - left] = value;
kernelStart = value;
kernelStart = ref Unsafe.Add(ref kernelStart, 1);
}
// Normalize, best to do it here rather than in the pixel loop later on.
if (sum > 0)
{
for (int j = 0; j < kernel.Length; j++)
{
// weights[w] = weights[w] / sum:
ref double kRef = ref kernelValues[j];
kRef /= sum;
}
Numerics.Normalize(kernelValues, sum);
}
kernel.Fill(kernelValues);
kernel.FillOrCopyAndExpand(kernelValues);
return kernel;
}

36
tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.ReferenceKernelMap.cs

@ -16,9 +16,7 @@ public partial class ResizeKernelMapTests
private readonly ReferenceKernel[] kernels;
public ReferenceKernelMap(ReferenceKernel[] kernels)
{
this.kernels = kernels;
}
=> this.kernels = kernels;
public int DestinationSize => this.kernels.Length;
@ -28,22 +26,23 @@ public partial class ResizeKernelMapTests
where TResampler : struct, IResampler
{
double ratio = (double)sourceSize / destinationSize;
double scale = ratio;
double scaleD = ratio;
if (scale < 1F)
if (scaleD < 1)
{
scale = 1F;
scaleD = 1;
}
TolerantMath tolerantMath = TolerantMath.Default;
double radius = tolerantMath.Ceiling(scale * sampler.Radius);
double radius = tolerantMath.Ceiling(scaleD * sampler.Radius);
var result = new List<ReferenceKernel>();
List<ReferenceKernel> result = [];
float scale = (float)scaleD;
for (int i = 0; i < destinationSize; i++)
{
double center = ((i + .5) * ratio) - .5;
float center = (float)(((i + .5) * ratio) - .5);
// Keep inside bounds.
int left = (int)tolerantMath.Ceiling(center - radius);
@ -58,15 +57,14 @@ public partial class ResizeKernelMapTests
right = sourceSize - 1;
}
double sum = 0;
float sum = 0;
double[] values = new double[right - left + 1];
float[] values = new float[right - left + 1];
for (int j = left; j <= right; j++)
{
double weight = sampler.GetValue((float)((j - center) / scale));
float weight = sampler.GetValue((j - center) / scale);
sum += weight;
values[j - left] = weight;
}
@ -78,16 +76,14 @@ public partial class ResizeKernelMapTests
}
}
float[] floatVals = values.Select(v => (float)v).ToArray();
result.Add(new ReferenceKernel(left, floatVals));
result.Add(new ReferenceKernel(left, values));
}
return new ReferenceKernelMap(result.ToArray());
return new ReferenceKernelMap([.. result]);
}
}
internal struct ReferenceKernel
internal readonly struct ReferenceKernel
{
public ReferenceKernel(int left, float[] values)
{
@ -102,8 +98,6 @@ public partial class ResizeKernelMapTests
public int Length => this.Values.Length;
public static implicit operator ReferenceKernel(ResizeKernel orig)
{
return new ReferenceKernel(orig.StartIndex, orig.Values.ToArray());
}
=> new(orig.StartIndex, orig.Values.ToArray());
}
}

26
tests/ImageSharp.Tests/Processing/Processors/Transforms/ResizeKernelMapTests.cs

@ -1,6 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.Intrinsics;
using System.Text;
using SixLabors.ImageSharp.Processing;
using SixLabors.ImageSharp.Processing.Processors.Transforms;
@ -124,7 +125,6 @@ public partial class ResizeKernelMapTests
this.Output.WriteLine($"Expected KernelMap:\n{PrintKernelMap(referenceMap)}\n");
this.Output.WriteLine($"Actual KernelMap:\n{PrintKernelMap(kernelMap)}\n");
#endif
var comparer = new ApproximateFloatComparer(1e-6f);
for (int i = 0; i < kernelMap.DestinationLength; i++)
{
@ -139,7 +139,29 @@ public partial class ResizeKernelMapTests
referenceKernel.Left == kernel.StartIndex,
$"referenceKernel.Left != kernel.Left: {referenceKernel.Left} != {kernel.StartIndex}");
float[] expectedValues = referenceKernel.Values;
Span<float> actualValues = kernel.Values;
Span<float> actualValues;
ApproximateFloatComparer comparer;
if (Vector256.IsHardwareAccelerated)
{
comparer = new ApproximateFloatComparer(1e-4f);
Assert.Equal(expectedValues.Length, kernel.Values.Length / 4);
int actualLength = referenceKernel.Length / 4;
actualValues = new float[expectedValues.Length];
for (int j = 0; j < expectedValues.Length; j++)
{
actualValues[j] = kernel.Values[j * 4];
}
}
else
{
comparer = new ApproximateFloatComparer(1e-6f);
actualValues = kernel.Values;
}
Assert.Equal(expectedValues.Length, actualValues.Length);

Loading…
Cancel
Save