From 980347e96fae9642cbd89bccb5a732d61865558c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 2 Feb 2024 15:51:22 +1000 Subject: [PATCH] Enhance NormalizedFloatToByteSaturate --- src/ImageSharp/Common/Helpers/Numerics.cs | 20 +++ .../Helpers/SimdUtils.ExtendedIntrinsics.cs | 4 +- .../SimdUtils.FallbackIntrinsics128.cs | 4 +- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 150 +++++++++++------- src/ImageSharp/Common/Helpers/SimdUtils.cs | 49 +++--- .../Common/Helpers/Vector128Utilities.cs | 87 +++++++++- .../Common/Helpers/Vector256Utilities.cs | 39 ++++- .../Common/Helpers/Vector512Utilities.cs | 37 ++++- .../PixelOperations/Rgba32.PixelOperations.cs | 16 +- .../ImageSharp.Benchmarks/Bulk/FromVector4.cs | 92 ++++++----- .../Bulk/FromVector4_Rgb24.cs | 66 +++----- .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 5 +- .../TestUtilities/BasicSerializer.cs | 22 +-- .../FeatureTesting/FeatureTestRunner.cs | 81 ++++++---- 14 files changed, 443 insertions(+), 229 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ca28a7aab..5f85734e8 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1010,6 +1010,26 @@ internal static class Numerics where TVector : struct => (uint)span.Length / (uint)Vector256.Count; + /// + /// Gets the count of vectors that safely fit into the given span. + /// + /// The type of the vector. + /// The given span. + /// Count of vectors that safely fit into the span. + public static nuint Vector512Count(this Span span) + where TVector : struct + => (uint)span.Length / (uint)Vector512.Count; + + /// + /// Gets the count of vectors that safely fit into the given span. + /// + /// The type of the vector. + /// The given span. + /// Count of vectors that safely fit into the span. 
+ public static nuint Vector512Count(this ReadOnlySpan span) + where TVector : struct + => (uint)span.Length / (uint)Vector512.Count; + /// /// Gets the count of vectors that safely fit into the given span. /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index ac122fc7d..7d07dcaae 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -95,7 +95,7 @@ internal static partial class SimdUtils /// internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - VerifySpanInput(source, dest, Vector.Count); + DebugVerifySpanInput(source, dest, Vector.Count); nuint n = dest.VectorCount(); @@ -130,7 +130,7 @@ internal static partial class SimdUtils ReadOnlySpan source, Span dest) { - VerifySpanInput(source, dest, Vector.Count); + DebugVerifySpanInput(source, dest, Vector.Count); nuint n = dest.VectorCount(); diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs index a551cebd0..90b313fb9 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs @@ -69,7 +69,7 @@ internal static partial class SimdUtils [MethodImpl(InliningOptions.ColdPath)] internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest) { - VerifySpanInput(source, dest, 4); + DebugVerifySpanInput(source, dest, 4); uint count = (uint)dest.Length / 4; if (count == 0) @@ -103,7 +103,7 @@ internal static partial class SimdUtils ReadOnlySpan source, Span dest) { - VerifySpanInput(source, dest, 4); + DebugVerifySpanInput(source, dest, 4); uint count = (uint)source.Length / 4; if (count == 0) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 
f27852a82..feb55ebe5 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -17,8 +17,13 @@ internal static partial class SimdUtils { public static class HwIntrinsics { +#pragma warning disable SA1117 // Parameters should be on same line or separate lines +#pragma warning disable SA1137 // Elements should have the same indentation [MethodImpl(MethodImplOptions.AggressiveInlining)] // too much IL for JIT to inline, so give a hint - public static Vector256 PermuteMaskDeinterleave8x32() => Vector256.Create(0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsInt32(); + public static Vector256 PermuteMaskDeinterleave8x32() => Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 PermuteMaskDeinterleave16x32() => Vector512.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 PermuteMaskEvenOdd8x32() => Vector256.Create(0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); @@ -38,17 +43,18 @@ internal static partial class SimdUtils [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector128 ShuffleMaskSlice4Nx16() => Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80); -#pragma warning disable SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 ShuffleMaskShiftAlpha() => Vector256.Create((byte) - 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, - 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); + private static Vector256 ShuffleMaskShiftAlpha() => Vector256.Create( + (byte)0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, + 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); 
[MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector256 PermuteMaskShiftAlpha8x32() => Vector256.Create( - 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, - 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); -#pragma warning restore SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines + public static Vector256 PermuteMaskShiftAlpha8x32() + => Vector256.Create( + 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, + 5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); +#pragma warning restore SA1137 // Elements should have the same indentation +#pragma warning restore SA1117 // Parameters should be on same line or separate lines /// /// Shuffle single-precision (32-bit) floating-point elements in @@ -795,7 +801,7 @@ internal static partial class SimdUtils { if (Avx2.IsSupported) { - VerifySpanInput(source, dest, Vector256.Count); + DebugVerifySpanInput(source, dest, Vector256.Count); nuint n = dest.Vector256Count(); @@ -828,7 +834,7 @@ internal static partial class SimdUtils else { // Sse - VerifySpanInput(source, dest, Vector128.Count); + DebugVerifySpanInput(source, dest, Vector128.Count); nuint n = dest.Vector128Count(); @@ -881,17 +887,24 @@ internal static partial class SimdUtils /// /// as many elements as possible, slicing them down (keeping the remainder). /// + /// The source buffer. + /// The destination buffer. 
[MethodImpl(InliningOptions.ShortMethod)] internal static void NormalizedFloatToByteSaturateReduce( ref ReadOnlySpan source, - ref Span dest) + ref Span destination) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); - if (Avx2.IsSupported || Sse2.IsSupported) + if (Avx512BW.IsSupported || Avx2.IsSupported || Sse2.IsSupported || AdvSimd.IsSupported) { int remainder; - if (Avx2.IsSupported) + + if (Avx512BW.IsSupported) + { + remainder = Numerics.ModuloP2(source.Length, Vector512.Count); + } + else if (Avx2.IsSupported) { remainder = Numerics.ModuloP2(source.Length, Vector256.Count); } @@ -906,10 +919,10 @@ internal static partial class SimdUtils { NormalizedFloatToByteSaturate( source[..adjustedCount], - dest[..adjustedCount]); + destination[..adjustedCount]); source = source[adjustedCount..]; - dest = dest[adjustedCount..]; + destination = destination[adjustedCount..]; } } } @@ -917,25 +930,59 @@ internal static partial class SimdUtils /// /// Implementation of , which is faster on new .NET runtime. /// + /// The source buffer. + /// The destination buffer. 
/// /// Implementation is based on MagicScaler code: /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622 /// internal static void NormalizedFloatToByteSaturate( ReadOnlySpan source, - Span dest) + Span destination) { - if (Avx2.IsSupported) + if (Avx512BW.IsSupported) { - VerifySpanInput(source, dest, Vector256.Count); + DebugVerifySpanInput(source, destination, Vector512.Count); + + nuint n = destination.Vector512Count(); - nuint n = dest.Vector256Count(); + ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); - ref Vector256 sourceBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + Vector512 scale = Vector512.Create((float)byte.MaxValue); + Vector512 mask = PermuteMaskDeinterleave16x32(); - ref Vector256 destBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + for (nuint i = 0; i < n; i++) + { + ref Vector512 s = ref Unsafe.Add(ref sourceBase, i * 4); + + Vector512 f0 = scale * s; + Vector512 f1 = scale * Unsafe.Add(ref s, 1); + Vector512 f2 = scale * Unsafe.Add(ref s, 2); + Vector512 f3 = scale * Unsafe.Add(ref s, 3); + + Vector512 w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0); + Vector512 w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1); + Vector512 w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2); + Vector512 w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3); + + Vector512 u0 = Avx512BW.PackSignedSaturate(w0, w1); + Vector512 u1 = Avx512BW.PackSignedSaturate(w2, w3); + Vector512 b = Avx512BW.PackUnsignedSaturate(u0, u1); + b = Avx512F.PermuteVar16x32(b.AsInt32(), mask).AsByte(); + + Unsafe.Add(ref destinationBase, i) = b; + } + } + else + if (Avx2.IsSupported) + { + DebugVerifySpanInput(source, destination, Vector256.Count); + + nuint n = destination.Vector256Count(); + + ref Vector256 
sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); Vector256 scale = Vector256.Create((float)byte.MaxValue); Vector256 mask = PermuteMaskDeinterleave8x32(); @@ -944,36 +991,33 @@ internal static partial class SimdUtils { ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); - Vector256 f0 = Avx.Multiply(scale, s); - Vector256 f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1)); - Vector256 f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2)); - Vector256 f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3)); + Vector256 f0 = scale * s; + Vector256 f1 = scale * Unsafe.Add(ref s, 1); + Vector256 f2 = scale * Unsafe.Add(ref s, 2); + Vector256 f3 = scale * Unsafe.Add(ref s, 3); - Vector256 w0 = Avx.ConvertToVector256Int32(f0); - Vector256 w1 = Avx.ConvertToVector256Int32(f1); - Vector256 w2 = Avx.ConvertToVector256Int32(f2); - Vector256 w3 = Avx.ConvertToVector256Int32(f3); + Vector256 w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0); + Vector256 w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1); + Vector256 w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2); + Vector256 w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3); Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); - Unsafe.Add(ref destBase, i) = b; + Unsafe.Add(ref destinationBase, i) = b; } } else { - // Sse - VerifySpanInput(source, dest, Vector128.Count); - - nuint n = dest.Vector128Count(); + // Sse, AdvSimd + DebugVerifySpanInput(source, destination, Vector128.Count); - ref Vector128 sourceBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + nuint n = destination.Vector128Count(); - ref Vector128 destBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + ref Vector128 sourceBase = ref Unsafe.As>(ref 
MemoryMarshal.GetReference(source)); + ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); Vector128 scale = Vector128.Create((float)byte.MaxValue); @@ -981,20 +1025,20 @@ internal static partial class SimdUtils { ref Vector128 s = ref Unsafe.Add(ref sourceBase, i * 4); - Vector128 f0 = Sse.Multiply(scale, s); - Vector128 f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1)); - Vector128 f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2)); - Vector128 f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3)); + Vector128 f0 = scale * s; + Vector128 f1 = scale * Unsafe.Add(ref s, 1); + Vector128 f2 = scale * Unsafe.Add(ref s, 2); + Vector128 f3 = scale * Unsafe.Add(ref s, 3); - Vector128 w0 = Sse2.ConvertToVector128Int32(f0); - Vector128 w1 = Sse2.ConvertToVector128Int32(f1); - Vector128 w2 = Sse2.ConvertToVector128Int32(f2); - Vector128 w3 = Sse2.ConvertToVector128Int32(f3); + Vector128 w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0); + Vector128 w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1); + Vector128 w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2); + Vector128 w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3); - Vector128 u0 = Sse2.PackSignedSaturate(w0, w1); - Vector128 u1 = Sse2.PackSignedSaturate(w2, w3); + Vector128 u0 = Vector128Utilities.PackSignedSaturate(w0, w1); + Vector128 u1 = Vector128Utilities.PackSignedSaturate(w2, w3); - Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1); + Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1); } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 497e3cc6a..002c1f8da 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -94,35 +94,31 @@ internal static partial class SimdUtils } /// - /// Convert all values normalized into [0..1] from 'source' into 'dest' buffer of . 
+ /// Convert all values normalized into [0..1] from 'source' into 'destination' buffer of . /// The values are scaled up into [0-255] and rounded, overflows are clamped. - /// should be the of the same size as , + /// should be the of the same size as , /// but there are no restrictions on the span's length. /// /// The source span of floats - /// The destination span of bytes + /// The destination span of bytes [MethodImpl(InliningOptions.ShortMethod)] - internal static void NormalizedFloatToByteSaturate(ReadOnlySpan source, Span dest) + internal static void NormalizedFloatToByteSaturate(ReadOnlySpan source, Span destination) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); - - HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref dest); - - // Also deals with the remainder from previous conversions: - FallbackIntrinsics128.NormalizedFloatToByteSaturateReduce(ref source, ref dest); + DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); + HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination); // Deal with the remainder: if (source.Length > 0) { - ConvertNormalizedFloatToByteRemainder(source, dest); + ConvertNormalizedFloatToByteRemainder(source, destination); } } [MethodImpl(InliningOptions.ColdPath)] - private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan source, Span dest) + private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan source, Span destination) { ref byte sBase = ref MemoryMarshal.GetReference(source); - ref float dBase = ref MemoryMarshal.GetReference(dest); + ref float dBase = ref MemoryMarshal.GetReference(destination); // There are at most 3 elements at this point, having a for loop is overkill. // Let's minimize the no. of instructions! 
@@ -140,23 +136,14 @@ internal static partial class SimdUtils } } - [MethodImpl(InliningOptions.ColdPath)] - private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan source, Span dest) + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan source, Span destination) { ref float sBase = ref MemoryMarshal.GetReference(source); - ref byte dBase = ref MemoryMarshal.GetReference(dest); - - switch (source.Length) + ref byte dBase = ref MemoryMarshal.GetReference(destination); + for (int i = 0; i < source.Length; i++) { - case 3: - Unsafe.Add(ref dBase, 2) = ConvertToByte(Unsafe.Add(ref sBase, 2)); - goto case 2; - case 2: - Unsafe.Add(ref dBase, 1) = ConvertToByte(Unsafe.Add(ref sBase, 1)); - goto case 1; - case 1: - dBase = ConvertToByte(sBase); - break; + Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i)); } } @@ -173,7 +160,7 @@ internal static partial class SimdUtils } [Conditional("DEBUG")] - private static void VerifySpanInput(ReadOnlySpan source, Span dest, int shouldBeDivisibleBy) + private static void DebugVerifySpanInput(ReadOnlySpan source, Span dest, int shouldBeDivisibleBy) { DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); DebugGuard.IsTrue( @@ -183,11 +170,11 @@ internal static partial class SimdUtils } [Conditional("DEBUG")] - private static void VerifySpanInput(ReadOnlySpan source, Span dest, int shouldBeDivisibleBy) + private static void DebugVerifySpanInput(ReadOnlySpan source, Span destination, int shouldBeDivisibleBy) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); DebugGuard.IsTrue( - Numerics.ModuloP2(dest.Length, shouldBeDivisibleBy) == 0, + Numerics.ModuloP2(destination.Length, shouldBeDivisibleBy) == 0, nameof(source), $"length 
should be divisible by {shouldBeDivisibleBy}!"); } diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 981f9a47f..a07fa8ca6 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -26,7 +26,7 @@ internal static class Vector128Utilities public static bool SupportsShuffleFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse.IsSupported; + get => Sse.IsSupported || AdvSimd.IsSupported; } /// @@ -62,6 +62,7 @@ internal static class Vector128Utilities /// The input vector from which values are selected. /// The shuffle control byte. /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Shuffle(Vector128 vector, [ConstantExpected] byte control) { if (Sse.IsSupported) @@ -69,6 +70,17 @@ internal static class Vector128Utilities return Sse.Shuffle(vector, vector, control); } + if (AdvSimd.IsSupported) + { +#pragma warning disable CA1857 // A constant is expected for the parameter + Vector128 result = Vector128.Create(AdvSimd.Extract(vector, (byte)(control & 0x3))); + result = AdvSimd.Insert(result, 1, AdvSimd.Extract(vector, (byte)((control >> 2) & 0x3))); + result = AdvSimd.Insert(result, 2, AdvSimd.Extract(vector, (byte)((control >> 4) & 0x3))); + result = AdvSimd.Insert(result, 3, AdvSimd.Extract(vector, (byte)((control >> 6) & 0x3))); +#pragma warning restore CA1857 // A constant is expected for the parameter + return result; + } + ThrowUnreachableException(); return default; } @@ -84,6 +96,7 @@ internal static class Vector128Utilities /// /// A new vector containing the values from selected by the given . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 Shuffle(Vector128 vector, Vector128 indices) { if (Ssse3.IsSupported) @@ -155,6 +168,7 @@ internal static class Vector128Utilities /// The right hand source vector. 
/// An 8-bit mask used for the operation. /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 AlignRight(Vector128 left, Vector128 right, [ConstantExpected(Max = (byte)15)] byte mask) { if (Ssse3.IsSupported) @@ -171,6 +185,77 @@ internal static class Vector128Utilities return default; } + /// + /// Performs a conversion from a 128-bit vector of 4 single-precision floating-point values to a 128-bit vector of 4 signed 32-bit integer values. + /// Rounding is equivalent to . + /// + /// The value to convert. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ConvertToInt32RoundToEven(Vector128 vector) + { + if (Sse2.IsSupported) + { + return Sse2.ConvertToVector128Int32(vector); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ConvertToInt32RoundToEven(vector); + } + + Vector128 sign = vector & Vector128.Create(-0.0f); + Vector128 val_2p23_f32 = sign | Vector128.Create(8388608.0f); + + val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; + return Vector128.ConvertToInt32(val_2p23_f32 | sign); + } + + /// + /// Packs signed 16-bit integers to unsigned 8-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + public static Vector128 PackUnsignedSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.PackUnsignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right); + } + + ThrowUnreachableException(); + return default; + } + + /// + /// Packs signed 32-bit integers to signed 16-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackSignedSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.PackSignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); + } + + ThrowUnreachableException(); + return default; + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index 14fa24b31..6e8c0d1de 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -25,7 +25,7 @@ internal static class Vector256Utilities public static bool SupportsShuffleFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx.IsSupported; + get => Avx.IsSupported || Sse.IsSupported; } /// @@ -43,6 +43,7 @@ internal static class Vector256Utilities /// The input vector from which values are selected. /// The shuffle control byte. /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 Shuffle(Vector256 vector, [ConstantExpected] byte control) { if (Avx.IsSupported) @@ -50,6 +51,13 @@ internal static class Vector256Utilities return Avx.Shuffle(vector, vector, control); } + if (Sse.IsSupported) + { + Vector128 lower = vector.GetLower(); + Vector128 upper = vector.GetUpper(); + return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control)); + } + ThrowUnreachableException(); return default; } @@ -62,6 +70,7 @@ internal static class Vector256Utilities /// The per-element indices used to select a value from . /// /// The . 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector256 Shuffle(Vector256 vector, Vector256 indices) { if (Avx2.IsSupported) @@ -73,6 +82,34 @@ internal static class Vector256Utilities return default; } + /// + /// Performs a conversion from a 256-bit vector of 8 single-precision floating-point values to a 256-bit vector of 8 signed 32-bit integer values. + /// Rounding is equivalent to . + /// + /// The value to convert. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector256 ConvertToInt32RoundToEven(Vector256 vector) + { + if (Avx.IsSupported) + { + return Avx.ConvertToVector256Int32(vector); + } + + if (Sse2.IsSupported) + { + Vector128 lower = Sse2.ConvertToVector128Int32(vector.GetLower()); + Vector128 upper = Sse2.ConvertToVector128Int32(vector.GetUpper()); + return Vector256.Create(lower, upper); + } + + Vector256 sign = vector & Vector256.Create(-0.0f); + Vector256 val_2p23_f32 = sign | Vector256.Create(8388608.0f); + + val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; + return Vector256.ConvertToInt32(val_2p23_f32 | sign); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index 5488b4064..0165af90e 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -25,7 +25,7 @@ internal static class Vector512Utilities public static bool SupportsShuffleFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Avx512F.IsSupported; + get => Avx512F.IsSupported || Avx.IsSupported; } /// @@ -51,6 +51,13 @@ internal static class Vector512Utilities return Avx512F.Shuffle(vector, vector, control); } + if (Avx.IsSupported) + { + Vector256 lower = vector.GetLower(); + Vector256 upper = vector.GetUpper(); + return Vector512.Create(Avx.Shuffle(lower, lower, 
control), Avx.Shuffle(upper, upper, control)); + } + ThrowUnreachableException(); return default; } @@ -75,6 +82,34 @@ internal static class Vector512Utilities return default; } + /// + /// Performs a conversion from a 512-bit vector of 16 single-precision floating-point values to a 512-bit vector of 16 signed 32-bit integer values. + /// Rounding is equivalent to . + /// + /// The value to convert. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) + { + if (Avx512F.IsSupported) + { + return Avx512F.ConvertToVector512Int32(vector); + } + + if (Avx.IsSupported) + { + Vector256 lower = Avx.ConvertToVector256Int32(vector.GetLower()); + Vector256 upper = Avx.ConvertToVector256Int32(vector.GetUpper()); + return Vector512.Create(lower, upper); + } + + Vector512 sign = vector & Vector512.Create(-0.0f); + Vector512 val_2p23_f32 = sign | Vector512.Create(8388608.0f); + + val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; + return Vector512.ConvertToInt32(val_2p23_f32 | sign); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs index ed89585dc..065e34c33 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/PixelOperations/Rgba32.PixelOperations.cs @@ -20,15 +20,15 @@ public partial struct Rgba32 /// public override void ToVector4( Configuration configuration, - ReadOnlySpan sourcePixels, + ReadOnlySpan source, Span destinationVectors, PixelConversionModifiers modifiers) { - Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationVectors, nameof(destinationVectors)); + Guard.DestinationShouldNotBeTooShort(source, 
destinationVectors, nameof(destinationVectors)); - destinationVectors = destinationVectors[..sourcePixels.Length]; + destinationVectors = destinationVectors[..source.Length]; SimdUtils.ByteToNormalizedFloat( - MemoryMarshal.Cast(sourcePixels), + MemoryMarshal.Cast(source), MemoryMarshal.Cast(destinationVectors)); Vector4Converters.ApplyForwardConversionModifiers(destinationVectors, modifiers); } @@ -37,16 +37,16 @@ public partial struct Rgba32 public override void FromVector4Destructive( Configuration configuration, Span sourceVectors, - Span destinationPixels, + Span destination, PixelConversionModifiers modifiers) { - Guard.DestinationShouldNotBeTooShort(sourceVectors, destinationPixels, nameof(destinationPixels)); + Guard.DestinationShouldNotBeTooShort(sourceVectors, destination, nameof(destination)); - destinationPixels = destinationPixels[..sourceVectors.Length]; + destination = destination[..sourceVectors.Length]; Vector4Converters.ApplyBackwardConversionModifiers(sourceVectors, modifiers); SimdUtils.NormalizedFloatToByteSaturate( MemoryMarshal.Cast(sourceVectors), - MemoryMarshal.Cast(destinationPixels)); + MemoryMarshal.Cast(destination)); } /// diff --git a/tests/ImageSharp.Benchmarks/Bulk/FromVector4.cs b/tests/ImageSharp.Benchmarks/Bulk/FromVector4.cs index ecd16b957..dff687fa1 100644 --- a/tests/ImageSharp.Benchmarks/Bulk/FromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Bulk/FromVector4.cs @@ -18,9 +18,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk; public abstract class FromVector4 where TPixel : unmanaged, IPixel { - protected IMemoryOwner source; + protected IMemoryOwner Source { get; set; } - protected IMemoryOwner destination; + protected IMemoryOwner Destination { get; set; } protected Configuration Configuration => Configuration.Default; @@ -31,22 +31,22 @@ public abstract class FromVector4 [GlobalSetup] public void Setup() { - this.destination = this.Configuration.MemoryAllocator.Allocate(this.Count); - this.source = 
this.Configuration.MemoryAllocator.Allocate(this.Count); + this.Destination = this.Configuration.MemoryAllocator.Allocate(this.Count); + this.Source = this.Configuration.MemoryAllocator.Allocate(this.Count); } [GlobalCleanup] public void Cleanup() { - this.destination.Dispose(); - this.source.Dispose(); + this.Destination.Dispose(); + this.Source.Dispose(); } // [Benchmark] public void PerElement() { - ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan()); - ref TPixel d = ref MemoryMarshal.GetReference(this.destination.GetSpan()); + ref Vector4 s = ref MemoryMarshal.GetReference(this.Source.GetSpan()); + ref TPixel d = ref MemoryMarshal.GetReference(this.Destination.GetSpan()); for (nuint i = 0; i < (uint)this.Count; i++) { Unsafe.Add(ref d, i) = TPixel.FromVector4(Unsafe.Add(ref s, i)); @@ -55,11 +55,11 @@ public abstract class FromVector4 [Benchmark(Baseline = true)] public void PixelOperations_Base() - => new PixelOperations().FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan()); + => new PixelOperations().FromVector4Destructive(this.Configuration, this.Source.GetSpan(), this.Destination.GetSpan()); [Benchmark] public void PixelOperations_Specialized() - => PixelOperations.Instance.FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan()); + => PixelOperations.Instance.FromVector4Destructive(this.Configuration, this.Source.GetSpan(), this.Destination.GetSpan()); } public class FromVector4Rgba32 : FromVector4 @@ -67,8 +67,8 @@ public class FromVector4Rgba32 : FromVector4 [Benchmark] public void FallbackIntrinsics128() { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + Span sBytes = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.Destination.GetSpan()); SimdUtils.FallbackIntrinsics128.NormalizedFloatToByteSaturate(sBytes, dFloats); } @@ -76,8 +76,8 @@ public 
class FromVector4Rgba32 : FromVector4 [Benchmark] public void ExtendedIntrinsic() { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + Span sBytes = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.Destination.GetSpan()); SimdUtils.ExtendedIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); } @@ -85,8 +85,8 @@ public class FromVector4Rgba32 : FromVector4 [Benchmark] public void UseHwIntrinsics() { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + Span sBytes = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.Destination.GetSpan()); SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); } @@ -96,8 +96,8 @@ public class FromVector4Rgba32 : FromVector4 [Benchmark] public void UseAvx2_Grouped() { - Span src = MemoryMarshal.Cast(this.source.GetSpan()); - Span dest = MemoryMarshal.Cast(this.destination.GetSpan()); + Span src = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dest = MemoryMarshal.Cast(this.Destination.GetSpan()); nuint n = (uint)dest.Length / (uint)Vector.Count; @@ -107,7 +107,7 @@ public class FromVector4Rgba32 : FromVector4 ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); Vector256 mask = Unsafe.As>(ref maskBase); - var maxBytes = Vector256.Create(255f); + Vector256 maxBytes = Vector256.Create(255f); for (nuint i = 0; i < n; i++) { @@ -137,25 +137,37 @@ public class FromVector4Rgba32 : FromVector4 } } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale) - { - vf = Avx.Multiply(scale, vf); - return Avx.ConvertToVector256Int32(vf); - } - - // *** RESULTS 2020 March: *** - // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores - // .NET Core SDK=3.1.200-preview-014971 - // 
Job-IUZXZT : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT - // - // | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------------------- |------ |-----------:|------------:|----------:|------:|--------:|------:|------:|------:|----------:| - // | FallbackIntrinsics128 | 1024 | 2,952.6 ns | 1,680.77 ns | 92.13 ns | 3.32 | 0.16 | - | - | - | - | - // | BasicIntrinsics256 | 1024 | 1,664.5 ns | 928.11 ns | 50.87 ns | 1.87 | 0.09 | - | - | - | - | - // | ExtendedIntrinsic | 1024 | 890.6 ns | 375.48 ns | 20.58 ns | 1.00 | 0.00 | - | - | - | - | - // | UseAvx2 | 1024 | 299.0 ns | 30.47 ns | 1.67 ns | 0.34 | 0.01 | - | - | - | - | - // | UseAvx2_Grouped | 1024 | 318.1 ns | 48.19 ns | 2.64 ns | 0.36 | 0.01 | - | - | - | - | - // | PixelOperations_Base | 1024 | 8,136.9 ns | 1,834.82 ns | 100.57 ns | 9.14 | 0.26 | - | - | - | 24 B | - // | PixelOperations_Specialized | 1024 | 951.1 ns | 123.93 ns | 6.79 ns | 1.07 | 0.03 | - | - | - | - | + /* + BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3) + 11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores + .NET SDK 8.0.200-preview.23624.5 + [Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2 + Job-YJYLLR : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2 + + Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3 + LaunchCount=1 WarmupCount=3 + + | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Allocated | Alloc Ratio | + |---------------------------- |------ |------------:|-------------:|-----------:|------:|--------:|----------:|------------:| + | PixelOperations_Base | 64 | 114.80 ns | 16.459 ns | 0.902 ns | 1.00 | 0.00 | - | NA | + | PixelOperations_Specialized | 64 | 28.91 ns | 80.482 ns | 4.411 ns | 0.25 | 0.04 | - | NA | + | FallbackIntrinsics128 | 64 | 133.60 ns | 23.750 ns | 1.302 ns | 1.16 | 0.02 | - | NA | + | ExtendedIntrinsic | 64 | 40.11 ns | 10.183 ns | 
0.558 ns | 0.35 | 0.01 | - | NA | + | UseHwIntrinsics | 64 | 14.71 ns | 4.860 ns | 0.266 ns | 0.13 | 0.00 | - | NA | + | UseAvx2_Grouped | 64 | 20.23 ns | 11.619 ns | 0.637 ns | 0.18 | 0.00 | - | NA | + | | | | | | | | | | + | PixelOperations_Base | 256 | 387.94 ns | 31.591 ns | 1.732 ns | 1.00 | 0.00 | - | NA | + | PixelOperations_Specialized | 256 | 50.93 ns | 22.388 ns | 1.227 ns | 0.13 | 0.00 | - | NA | + | FallbackIntrinsics128 | 256 | 509.72 ns | 249.926 ns | 13.699 ns | 1.31 | 0.04 | - | NA | + | ExtendedIntrinsic | 256 | 140.32 ns | 9.353 ns | 0.513 ns | 0.36 | 0.00 | - | NA | + | UseHwIntrinsics | 256 | 41.99 ns | 16.000 ns | 0.877 ns | 0.11 | 0.00 | - | NA | + | UseAvx2_Grouped | 256 | 63.81 ns | 2.360 ns | 0.129 ns | 0.16 | 0.00 | - | NA | + | | | | | | | | | | + | PixelOperations_Base | 2048 | 2,979.49 ns | 2,023.706 ns | 110.926 ns | 1.00 | 0.00 | - | NA | + | PixelOperations_Specialized | 2048 | 326.19 ns | 19.077 ns | 1.046 ns | 0.11 | 0.00 | - | NA | + | FallbackIntrinsics128 | 2048 | 3,885.95 ns | 411.078 ns | 22.533 ns | 1.31 | 0.05 | - | NA | + | ExtendedIntrinsic | 2048 | 1,078.58 ns | 136.960 ns | 7.507 ns | 0.36 | 0.01 | - | NA | + | UseHwIntrinsics | 2048 | 312.07 ns | 68.662 ns | 3.764 ns | 0.10 | 0.00 | - | NA | + | UseAvx2_Grouped | 2048 | 451.83 ns | 41.742 ns | 2.288 ns | 0.15 | 0.01 | - | NA | + */ } diff --git a/tests/ImageSharp.Benchmarks/Bulk/FromVector4_Rgb24.cs b/tests/ImageSharp.Benchmarks/Bulk/FromVector4_Rgb24.cs index 27fab64dd..c6125ef8f 100644 --- a/tests/ImageSharp.Benchmarks/Bulk/FromVector4_Rgb24.cs +++ b/tests/ImageSharp.Benchmarks/Bulk/FromVector4_Rgb24.cs @@ -7,48 +7,26 @@ using SixLabors.ImageSharp.PixelFormats; namespace SixLabors.ImageSharp.Benchmarks.Bulk; [Config(typeof(Config.Short))] -public class FromVector4_Rgb24 : FromVector4 -{ -} +public class FromVector4_Rgb24 : FromVector4; -// 2020-11-02 -// ########## -// -// BenchmarkDotNet = v0.12.1, OS = Windows 10.0.19041.572(2004 /?/ 20H1) -// Intel Core i7-8650U 
CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores -// .NET Core SDK=3.1.403 -// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT -// Job-XYEQXL : .NET Framework 4.8 (4.8.4250.0), X64 RyuJIT -// Job-HSXNJV : .NET Core 2.1.23 (CoreCLR 4.6.29321.03, CoreFX 4.6.29321.01), X64 RyuJIT -// Job-YUREJO : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT -// -// IterationCount=3 LaunchCount=1 WarmupCount=3 -// -// | Method | Job | Runtime | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | -// |---------------------------- |----------- |-------------- |------ |-----------:|------------:|----------:|------:|--------:|-------:|------:|------:|----------:| -// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 64 | 343.2 ns | 305.91 ns | 16.77 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | -// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 64 | 320.8 ns | 19.93 ns | 1.09 ns | 0.94 | 0.05 | - | - | - | - | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 64 | 234.3 ns | 17.98 ns | 0.99 ns | 1.00 | 0.00 | 0.0052 | - | - | 24 B | -// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 64 | 246.0 ns | 82.34 ns | 4.51 ns | 1.05 | 0.02 | - | - | - | - | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 64 | 222.3 ns | 39.46 ns | 2.16 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | -// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 64 | 243.4 ns | 33.58 ns | 1.84 ns | 1.09 | 0.01 | - | - | - | - | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 256 | 824.9 ns | 32.77 ns | 1.80 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | -// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 256 | 967.0 ns | 39.09 ns | 2.14 ns | 1.17 | 0.01 | 0.0172 | - | - | 72 B | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-HSXNJV | .NET 
Core 2.1 | 256 | 756.9 ns | 94.43 ns | 5.18 ns | 1.00 | 0.00 | 0.0048 | - | - | 24 B | -// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 256 | 1,003.3 ns | 3,192.09 ns | 174.97 ns | 1.32 | 0.22 | 0.0172 | - | - | 72 B | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 256 | 748.6 ns | 248.03 ns | 13.60 ns | 1.00 | 0.00 | 0.0057 | - | - | 24 B | -// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 256 | 437.0 ns | 36.48 ns | 2.00 ns | 0.58 | 0.01 | 0.0172 | - | - | 72 B | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-XYEQXL | .NET 4.7.2 | 2048 | 5,751.6 ns | 704.24 ns | 38.60 ns | 1.00 | 0.00 | - | - | - | 24 B | -// | PixelOperations_Specialized | Job-XYEQXL | .NET 4.7.2 | 2048 | 4,391.6 ns | 718.17 ns | 39.37 ns | 0.76 | 0.00 | 0.0153 | - | - | 72 B | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-HSXNJV | .NET Core 2.1 | 2048 | 6,202.0 ns | 1,815.18 ns | 99.50 ns | 1.00 | 0.00 | - | - | - | 24 B | -// | PixelOperations_Specialized | Job-HSXNJV | .NET Core 2.1 | 2048 | 4,225.6 ns | 1,004.03 ns | 55.03 ns | 0.68 | 0.01 | 0.0153 | - | - | 72 B | -// | | | | | | | | | | | | | | -// | PixelOperations_Base | Job-YUREJO | .NET Core 3.1 | 2048 | 6,157.1 ns | 2,516.98 ns | 137.96 ns | 1.00 | 0.00 | - | - | - | 24 B | -// | PixelOperations_Specialized | Job-YUREJO | .NET Core 3.1 | 2048 | 1,822.7 ns | 1,764.43 ns | 96.71 ns | 0.30 | 0.02 | 0.0172 | - | - | 72 B | +/* + BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3) +11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores +.NET SDK 8.0.200-preview.23624.5 + [Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2 + Job-NEHCEM : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2 + +Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3 +LaunchCount=1 WarmupCount=3 + +| Method | Count | Mean | Error | StdDev | Ratio | Gen0 | Allocated | Alloc Ratio | 
+|---------------------------- |------ |------------:|----------:|---------:|------:|-------:|----------:|------------:| +| PixelOperations_Base | 64 | 95.87 ns | 13.60 ns | 0.745 ns | 1.00 | - | - | NA | +| PixelOperations_Specialized | 64 | 97.34 ns | 30.34 ns | 1.663 ns | 1.02 | - | - | NA | +| | | | | | | | | | +| PixelOperations_Base | 256 | 337.80 ns | 88.10 ns | 4.829 ns | 1.00 | - | - | NA | +| PixelOperations_Specialized | 256 | 195.07 ns | 30.54 ns | 1.674 ns | 0.58 | 0.0153 | 96 B | NA | +| | | | | | | | | | +| PixelOperations_Base | 2048 | 2,561.79 ns | 162.45 ns | 8.905 ns | 1.00 | - | - | NA | +| PixelOperations_Specialized | 2048 | 741.85 ns | 18.05 ns | 0.989 ns | 0.29 | 0.0153 | 96 B | NA | + */ diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index c81eaea63..200371679 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -112,6 +112,7 @@ public partial class SimdUtilsTests public static readonly TheoryData ArraySizesDivisibleBy4 = new() { 0, 4, 8, 28, 1020 }; public static readonly TheoryData ArraySizesDivisibleBy3 = new() { 0, 3, 9, 36, 957 }; public static readonly TheoryData ArraySizesDivisibleBy32 = new() { 0, 32, 512 }; + public static readonly TheoryData ArraySizesDivisibleBy64 = new() { 0, 64, 512 }; public static readonly TheoryData ArbitraryArraySizes = new() { 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520 }; @@ -199,7 +200,7 @@ public partial class SimdUtilsTests } [Theory] - [MemberData(nameof(ArraySizesDivisibleBy32))] + [MemberData(nameof(ArraySizesDivisibleBy64))] public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { if (!Sse2.IsSupported) @@ -214,7 +215,7 @@ public partial class SimdUtilsTests FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, count, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); + 
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512BW | HwIntrinsics.DisableAVX2); } [Theory] diff --git a/tests/ImageSharp.Tests/TestUtilities/BasicSerializer.cs b/tests/ImageSharp.Tests/TestUtilities/BasicSerializer.cs index d161b8019..216ed95b8 100644 --- a/tests/ImageSharp.Tests/TestUtilities/BasicSerializer.cs +++ b/tests/ImageSharp.Tests/TestUtilities/BasicSerializer.cs @@ -13,14 +13,14 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities; /// internal class BasicSerializer : IXunitSerializationInfo { - private readonly Dictionary map = new Dictionary(); + private readonly Dictionary map = []; public const char Separator = ':'; private string DumpToString(Type type) { - using var ms = new MemoryStream(); - using var writer = new StreamWriter(ms); + using MemoryStream ms = new(); + using StreamWriter writer = new(ms); writer.WriteLine(type.FullName); foreach (KeyValuePair kv in this.map) { @@ -29,16 +29,16 @@ internal class BasicSerializer : IXunitSerializationInfo writer.Flush(); byte[] data = ms.ToArray(); - return System.Convert.ToBase64String(data); + return Convert.ToBase64String(data); } private Type LoadDump(string dump) { - byte[] data = System.Convert.FromBase64String(dump); + byte[] data = Convert.FromBase64String(dump); - using var ms = new MemoryStream(data); - using var reader = new StreamReader(ms); - var type = Type.GetType(reader.ReadLine()); + using MemoryStream ms = new(data); + using StreamReader reader = new(ms); + Type type = Type.GetType(reader.ReadLine()); for (string s = reader.ReadLine(); s != null; s = reader.ReadLine()) { string[] kv = s.Split(Separator); @@ -50,7 +50,7 @@ internal class BasicSerializer : IXunitSerializationInfo public static string Serialize(IXunitSerializable serializable) { - var serializer = new BasicSerializer(); + BasicSerializer serializer = new(); serializable.Serialize(serializer); return serializer.DumpToString(serializable.GetType()); } @@ -58,10 +58,10 @@ internal class BasicSerializer : 
IXunitSerializationInfo public static T Deserialize(string dump) where T : IXunitSerializable { - var serializer = new BasicSerializer(); + BasicSerializer serializer = new(); Type type = serializer.LoadDump(dump); - var result = (T)Activator.CreateInstance(type); + T result = (T)Activator.CreateInstance(type); result.Deserialize(serializer); return result; } diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index 5a9a72f96..07ad5e8f0 100644 --- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -2,6 +2,7 @@ // Licensed under the Six Labors Split License. using System.Diagnostics; +using System.Globalization; using Microsoft.DotNet.RemoteExecutor; using Xunit.Abstractions; @@ -12,7 +13,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities; /// public static class FeatureTestRunner { - private static readonly char[] SplitChars = { ',', ' ' }; + private static readonly char[] SplitChars = [',', ' ']; /// /// Allows the deserialization of parameters passed to the feature test. @@ -40,7 +41,7 @@ public static class FeatureTestRunner /// The value. public static T Deserialize(string value) where T : IConvertible - => (T)Convert.ChangeType(value, typeof(T)); + => (T)Convert.ChangeType(value, typeof(T), CultureInfo.InvariantCulture); /// /// Runs the given test within an environment @@ -127,6 +128,7 @@ public static class FeatureTestRunner /// Runs the given test within an environment /// where the given features. /// + /// The type of argument. /// The test action to run. /// The intrinsics features. /// The value to pass as a parameter to the test action. @@ -170,6 +172,7 @@ public static class FeatureTestRunner /// Runs the given test within an environment /// where the given features. /// + /// The type of argument. /// The test action to run. 
/// The intrinsics features. /// The value to pass as a parameter to the test action. @@ -214,6 +217,8 @@ public static class FeatureTestRunner /// Runs the given test within an environment /// where the given features. /// + /// The type of argument. + /// The additional type of argument. /// The test action to run. /// The intrinsics features. /// The value to pass as a parameter to the test action. @@ -261,6 +266,7 @@ public static class FeatureTestRunner /// Runs the given test within an environment /// where the given features. /// + /// The type of argument. /// The test action to run. /// The intrinsics features. /// The value to pass as a parameter to the test action. @@ -307,6 +313,7 @@ public static class FeatureTestRunner /// Runs the given test within an environment /// where the given features. /// + /// The type of argument. /// The test action to run. /// The value to pass as a parameter to the test action. /// The intrinsics features. @@ -350,6 +357,7 @@ public static class FeatureTestRunner /// Runs the given test within an environment /// where the given features. /// + /// The type of argument. /// The test action to run. /// The value to pass as a parameter #0 to the test action. /// The value to pass as a parameter #1 to the test action. 
@@ -395,10 +403,10 @@ public static class FeatureTestRunner internal static Dictionary ToFeatureKeyValueCollection(this HwIntrinsics intrinsics) { // Loop through and translate the given values into COMPlus equivalents - Dictionary features = new(); + Dictionary features = []; foreach (string intrinsic in intrinsics.ToString("G").Split(SplitChars, StringSplitOptions.RemoveEmptyEntries)) { - HwIntrinsics key = (HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic); + HwIntrinsics key = Enum.Parse(intrinsic); switch (intrinsic) { case nameof(HwIntrinsics.AllowAll): @@ -418,40 +426,47 @@ public static class FeatureTestRunner } /// -/// See -/// -/// ends up impacting all SIMD support(including System.Numerics) -/// but not things like , , and . -/// +/// See /// [Flags] #pragma warning disable RCS1135 // Declare enum member with zero value (when enum has FlagsAttribute). -public enum HwIntrinsics +public enum HwIntrinsics : long #pragma warning restore RCS1135 // Declare enum member with zero value (when enum has FlagsAttribute). { // Use flags so we can pass multiple values without using params. // Don't base on 0 or use inverse for All as that doesn't translate to string values. 
- DisableHWIntrinsic = 1 << 0, - DisableSSE = 1 << 1, - DisableSSE2 = 1 << 2, - DisableAES = 1 << 3, - DisablePCLMULQDQ = 1 << 4, - DisableSSE3 = 1 << 5, - DisableSSSE3 = 1 << 6, - DisableSSE41 = 1 << 7, - DisableSSE42 = 1 << 8, - DisablePOPCNT = 1 << 9, - DisableAVX = 1 << 10, - DisableFMA = 1 << 11, - DisableAVX2 = 1 << 12, - DisableBMI1 = 1 << 13, - DisableBMI2 = 1 << 14, - DisableLZCNT = 1 << 15, - DisableArm64AdvSimd = 1 << 16, - DisableArm64Crc32 = 1 << 17, - DisableArm64Dp = 1 << 18, - DisableArm64Aes = 1 << 19, - DisableArm64Sha1 = 1 << 20, - DisableArm64Sha256 = 1 << 21, - AllowAll = 1 << 22 + DisableHWIntrinsic = 1L << 0, + DisableSSE = 1L << 1, + DisableSSE2 = 1L << 2, + DisableAES = 1L << 3, + DisablePCLMULQDQ = 1L << 4, + DisableSSE3 = 1L << 5, + DisableSSSE3 = 1L << 6, + DisableSSE41 = 1L << 7, + DisableSSE42 = 1L << 8, + DisablePOPCNT = 1L << 9, + DisableAVX = 1L << 10, + DisableFMA = 1L << 11, + DisableAVX2 = 1L << 12, + DisableAVXVNNI = 1L << 13, + DisableAVX512BW = 1L << 14, + DisableAVX512BW_VL = 1L << 15, + DisableAVX512CD = 1L << 16, + DisableAVX512CD_VL = 1L << 17, + DisableAVX512DQ = 1L << 18, + DisableAVX512DQ_VL = 1L << 19, + DisableAVX512F = 1L << 20, + DisableAVX512F_VL = 1L << 21, + DisableAVX512VBMI = 1L << 22, + DisableAVX512VBMI_VL = 1L << 23, + DisableBMI1 = 1L << 24, + DisableBMI2 = 1L << 25, + DisableLZCNT = 1L << 26, + DisableArm64AdvSimd = 1L << 27, + DisableArm64Crc32 = 1L << 28, + DisableArm64Dp = 1L << 29, + DisableArm64Aes = 1L << 30, + DisableArm64Sha1 = 1L << 31, + DisableArm64Sha256 = 1L << 32, + AllowAll = 1L << 33 }