diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ced2be2e0c..e8f50b3eeb 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1106,7 +1106,39 @@ internal static class Numerics [MethodImpl(MethodImplOptions.AggressiveInlining)] public static void Normalize(Span span, float sum) { - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + ref float startRef = ref MemoryMarshal.GetReference(span); + ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~15); + Vector512 sum512 = Vector512.Create(sum); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + Unsafe.As>(ref startRef) /= sum512; + startRef = ref Unsafe.Add(ref startRef, (nuint)16); + } + + if ((span.Length & 15) >= 8) + { + Unsafe.As>(ref startRef) /= sum512.GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)8); + } + + if ((span.Length & 7) >= 4) + { + Unsafe.As>(ref startRef) /= sum512.GetLower().GetLower(); + startRef = ref Unsafe.Add(ref startRef, (nuint)4); + } + + endRef = ref Unsafe.Add(ref startRef, span.Length & 3); + + while (Unsafe.IsAddressLessThan(ref startRef, ref endRef)) + { + startRef /= sum; + startRef = ref Unsafe.Add(ref startRef, (nuint)1); + } + } + else if (Vector256.IsHardwareAccelerated) { ref float startRef = ref MemoryMarshal.GetReference(span); ref float endRef = ref Unsafe.Add(ref startRef, span.Length & ~7); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 009c6e9581..07cfe02850 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -273,7 +273,7 @@ internal static class Vector128Utilities /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 MultiplyAdd(Vector128 a, Vector128 b, Vector128 c) + public static Vector128 
MultiplyAddEstimate(Vector128 a, Vector128 b, Vector128 c) { if (Fma.IsSupported) { diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 754d6dcb8b..082e4683b0 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -138,7 +138,7 @@ internal static class Vector256Utilities /// /// [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 MultiplyAdd(Vector256 a, Vector256 b, Vector256 c) + public static Vector256 MultiplyAddEstimate(Vector256 a, Vector256 b, Vector256 c) { if (Fma.IsSupported) { diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 0165af90ef..3325ad1aeb 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -110,6 +110,38 @@ internal static class Vector512Utilities return Vector512.ConvertToInt32(val_2p23_f32 | sign); } + /// + /// Performs a multiply-add operation on three vectors, where each element of the resulting vector is the + /// product of corresponding elements in and added to the + /// corresponding element in . + /// If the CPU supports FMA (Fused Multiply-Add) instructions, the operation is performed as a single + /// fused operation for better performance and precision. + /// + /// The first vector of single-precision floating-point numbers to be multiplied. + /// The second vector of single-precision floating-point numbers to be multiplied. + /// The vector of single-precision floating-point numbers to be added to the product of + /// and . + /// + /// A where each element is the result of multiplying the corresponding elements + /// of and , and then adding the corresponding element from . 
+ /// + /// + /// If the FMA (Fused Multiply-Add) instruction set is supported by the CPU, the operation is performed using + /// against the upper and lower + /// halves. This approach can result in slightly different results compared to performing the multiplication and + /// addition separately due to differences in how floating-point rounding is handled. + /// + /// If FMA is not supported, the operation is performed as a separate multiplication and addition. This might lead + /// to a minor difference in precision compared to the fused operation, particularly in cases where numerical accuracy + /// is critical. + /// + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 MultiplyAddEstimate(Vector512 a, Vector512 b, Vector512 c) + => Vector512.Create( + Vector256Utilities.MultiplyAddEstimate(a.GetLower(), b.GetLower(), c.GetLower()), + Vector256Utilities.MultiplyAddEstimate(a.GetUpper(), b.GetUpper(), c.GetUpper())); + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs index 3545bae3f7..41afec892c 100644 --- a/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs +++ b/src/ImageSharp/Processing/Processors/Transforms/Resize/ResizeKernel.cs @@ -80,7 +80,58 @@ internal readonly unsafe struct ResizeKernel [MethodImpl(InliningOptions.ShortMethod)] public Vector4 ConvolveCore(ref Vector4 rowStartRef) { - if (Vector256.IsHardwareAccelerated) + if (Vector512.IsHardwareAccelerated) + { + float* bufferStart = this.bufferPtr; + ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~7); + Vector512 result512_0 = Vector512.Zero; + Vector512 result512_1 = Vector512.Zero; + + while (Unsafe.IsAddressLessThan(ref rowStartRef, ref rowEndRef)) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + Vector512 
pixels512_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)4)); + + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + result512_1 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart + 16), pixels512_1, result512_1); + + bufferStart += 32; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)8); + } + + result512_0 += result512_1; + + if ((this.Length & 7) >= 4) + { + Vector512 pixels512_0 = Unsafe.As>(ref rowStartRef); + result512_0 = Vector512Utilities.MultiplyAddEstimate(Vector512.Load(bufferStart), pixels512_0, result512_0); + + bufferStart += 16; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); + } + + Vector256 result256 = result512_0.GetLower() + result512_0.GetUpper(); + + if ((this.Length & 3) >= 2) + { + Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); + result256 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256); + + bufferStart += 8; + rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); + } + + Vector128 result128 = result256.GetLower() + result256.GetUpper(); + + if ((this.Length & 1) != 0) + { + Vector128 pixels128 = Unsafe.As>(ref rowStartRef); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); + } + + return *(Vector4*)&result128; + } + else if (Vector256.IsHardwareAccelerated) { float* bufferStart = this.bufferPtr; ref Vector4 rowEndRef = ref Unsafe.Add(ref rowStartRef, this.Length & ~3); @@ -92,8 +143,8 @@ internal readonly unsafe struct ResizeKernel Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); Vector256 pixels256_1 = Unsafe.As>(ref Unsafe.Add(ref rowStartRef, (nuint)2)); - result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); - result256_1 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); + result256_0 = 
Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_1 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart + 8), pixels256_1, result256_1); bufferStart += 16; rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)4); @@ -104,7 +155,7 @@ internal readonly unsafe struct ResizeKernel if ((this.Length & 3) >= 2) { Vector256 pixels256_0 = Unsafe.As>(ref rowStartRef); - result256_0 = Vector256Utilities.MultiplyAdd(Vector256.Load(bufferStart), pixels256_0, result256_0); + result256_0 = Vector256Utilities.MultiplyAddEstimate(Vector256.Load(bufferStart), pixels256_0, result256_0); bufferStart += 8; rowStartRef = ref Unsafe.Add(ref rowStartRef, (nuint)2); @@ -115,7 +166,7 @@ internal readonly unsafe struct ResizeKernel if ((this.Length & 1) != 0) { Vector128 pixels128 = Unsafe.As>(ref rowStartRef); - result128 = Vector128Utilities.MultiplyAdd(Vector128.Load(bufferStart), pixels128, result128); + result128 = Vector128Utilities.MultiplyAddEstimate(Vector128.Load(bufferStart), pixels128, result128); } return *(Vector4*)&result128; @@ -170,7 +221,7 @@ internal readonly unsafe struct ResizeKernel { for (int i = 0; i < this.Length; i++) { - this.Values[i] = (float)values[i]; + this.Values[i] = values[i]; } } }