Browse Source

Add Vector512 support

pull/2793/head
James Jackson-South 2 years ago
parent
commit
36fefc6059
  1. 34
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 2
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  3. 2
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  4. 32
      src/ImageSharp/Common/Helpers/Vector512Utilities.cs
  5. 63
      src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

34
src/ImageSharp/Common/Helpers/Numerics.cs

@ -1106,7 +1106,39 @@ internal static class Numerics
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static void Normalize(Span<float> span, float sum)
{
if (Vector256.IsHardwareAccelerated)
if (Vector512.IsHardwareAccelerated)
{
ref float startRef = ref MemoryMarshal.GetReference(span);
ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15);
Vector512<float> sum512 = Vector512.Create(sum);
while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
Unsafe.As<float, Vector512<float>>(ref startRef) /= sum512;
startRef = ref Unsafe.Add(ref startRef, (nuint)16);
}
if ((span.Length & 15) >= 8)
{
Unsafe.As<float, Vector256<float>>(ref startRef) /= sum512.GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)8);
}
if ((span.Length & 7) >= 4)
{
Unsafe.As<float, Vector128<float>>(ref startRef) /= sum512.GetLower().GetLower();
startRef = ref Unsafe.Add(ref startRef, (nuint)4);
}
endRef = ref Unsafe.Add(ref startRef, span.Length & 3);
while (Unsafe.IsAddressLessThan(ref startRef, ref endRef))
{
startRef /= sum;
startRef = ref Unsafe.Add(ref startRef, (nuint)1);
}
}
else if (Vector256.IsHardwareAccelerated)
{
ref float startRef = ref MemoryMarshal.GetReference(span);
ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7);

2
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -273,7 +273,7 @@ internal static class Vector128Utilities
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> MultiplyAdd(Vector128<float> a, Vector128<float> b, Vector128<float> c)
public static Vector128<float> MultiplyAddEstimate(Vector128<float> a, Vector128<float> b, Vector128<float> c)
{
if (Fma.IsSupported)
{

2
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -138,7 +138,7 @@ internal static class Vector256Utilities
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> MultiplyAdd(Vector256<float> a, Vector256<float> b, Vector256<float> c)
public static Vector256<float> MultiplyAddEstimate(Vector256<float> a, Vector256<float> b, Vector256<float> c)
{
if (Fma.IsSupported)
{

32
src/ImageSharp/Common/Helpers/Vector512Utilities.cs

@ -110,6 +110,38 @@ internal static class Vector512Utilities
return Vector512.ConvertToInt32(val_2p23_f32 | sign);
}
/// <summary>
/// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the
/// product of corresponding elements in <paramref name="a"/> and <paramref name="b"/> added to the
/// corresponding element in <paramref name="c"/>.
/// If the CPU supports AVX-512F the operation is performed as a single 512-bit fused operation;
/// otherwise the work is delegated to the 256-bit helper for each half of the vectors.
/// </summary>
/// <param name="a">The first vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="b">The second vector of single-precision floating-point numbers to be multiplied.</param>
/// <param name="c">The vector of single-precision floating-point numbers to be added to the product of
/// <paramref name="a"/> and <paramref name="b"/>.</param>
/// <returns>
/// A <see cref="Vector512{Single}"/> where each element is the result of multiplying the corresponding elements
/// of <paramref name="a"/> and <paramref name="b"/>, and then adding the corresponding element from <paramref name="c"/>.
/// </returns>
/// <remarks>
/// If the AVX-512F instruction set is supported by the CPU, the operation is performed using
/// <see cref="Avx512F.FusedMultiplyAdd(Vector512{float}, Vector512{float}, Vector512{float})"/> on the full
/// 512 bits. A fused operation can produce slightly different results compared to performing the multiplication
/// and addition separately due to differences in how floating-point rounding is handled.
/// <para>
/// If AVX-512F is not supported, the lower and upper 256 bits are each passed to
/// <see cref="Vector256Utilities.MultiplyAddEstimate(Vector256{float}, Vector256{float}, Vector256{float})"/>
/// and the two results are recombined. That helper performs its own FMA support check, so the result may or
/// may not be fused depending on the hardware; hence the "Estimate" suffix.
/// </para>
/// </remarks>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> MultiplyAddEstimate(Vector512<float> a, Vector512<float> b, Vector512<float> c)
{
    if (Avx512F.IsSupported)
    {
        // Native 512-bit fused multiply-add: computes (a * b) + c in one instruction and avoids
        // splitting into halves and re-assembling the result.
        return Avx512F.FusedMultiplyAdd(a, b, c);
    }

    // Fallback: process each 256-bit half independently and stitch the halves back together.
    return Vector512.Create(
        Vector256Utilities.MultiplyAddEstimate(a.GetLower(), b.GetLower(), c.GetLower()),
        Vector256Utilities.MultiplyAddEstimate(a.GetUpper(), b.GetUpper(), c.GetUpper()));
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>. This method never returns normally.
/// </summary>
/// <exception cref="UnreachableException">Always thrown.</exception>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

63
src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs

@ -80,7 +80,58 @@ internal readonly unsafe struct ResizeKernel
[MethodImpl(InliningOptions.ShortMethod)]
public Vector4 ConvolveCore(ref Vector4 rowStartRef)
{
if (Vector256.IsHardwareAccelerated)
if (Vector512.IsHardwareAccelerated)
{
float* bufferStart = this.bufferPtr;
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7);
Vector512<float> result512_0 = Vector512<float>.Zero;
Vector512<float> result512_1 = Vector512<float>.Zero;
while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef))
{
Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
Vector512<float> pixels512_1 = Unsafe.As<Vector4, Vector512<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)4));
result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);
result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1);
bufferStart += 32;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8);
}
result512_0 += result512_1;
if ((this.Length & 7) >= 4)
{
Vector512<float> pixels512_0 = Unsafe.As<Vector4, Vector512<float>>(ref rowStartRef);
result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0);
bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
}
Vector256<float> result256 = result512_0.GetLower() + result512_0.GetUpper();
if ((this.Length & 3) >= 2)
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256);
bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
}
Vector128<float> result128 = result256.GetLower() + result256.GetUpper();
if ((this.Length & 1) != 0)
{
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
}
return *(Vector4*)&result128;
}
else if (Vector256.IsHardwareAccelerated)
{
float* bufferStart = this.bufferPtr;
ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3);
@ -92,8 +143,8 @@ internal readonly unsafe struct ResizeKernel
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
Vector256<float> pixels256_1 = Unsafe.As<Vector4, Vector256<float>>(ref Unsafe.Add(ref rowStartRef, (nuint)2));
result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0);
result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);
result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);
result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1);
bufferStart += 16;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4);
@ -104,7 +155,7 @@ internal readonly unsafe struct ResizeKernel
if ((this.Length & 3) >= 2)
{
Vector256<float> pixels256_0 = Unsafe.As<Vector4, Vector256<float>>(ref rowStartRef);
result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0);
result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0);
bufferStart += 8;
rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2);
@ -115,7 +166,7 @@ internal readonly unsafe struct ResizeKernel
if ((this.Length & 1) != 0)
{
Vector128<float> pixels128 = Unsafe.As<Vector4, Vector128<float>>(ref rowStartRef);
result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128);
result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128);
}
return *(Vector4*)&result128;
@ -170,7 +221,7 @@ internal readonly unsafe struct ResizeKernel
{
for (int i = 0; i < this.Length; i++)
{
this.Values[i] = (float)values[i];
this.Values[i] = values[i];
}
}
}

Loading…
Cancel
Save