diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 449dc37d0..8533b2151 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -66,9 +66,9 @@ internal static partial class SimdUtils
         ref Span<float> destination,
         [ConstantExpected] byte control)
     {
-        if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) ||
-            (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) ||
-            (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat))
+        if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
+            (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
+            (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat))
         {
             int remainder = 0;
             if (Vector512.IsHardwareAccelerated)
@@ -112,9 +112,9 @@ internal static partial class SimdUtils
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) ||
+        if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
             (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) ||
-            (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte))
+            (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
         {
             int remainder = 0;
             if (Vector512.IsHardwareAccelerated)
@@ -158,7 +158,7 @@ internal static partial class SimdUtils
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign)
+        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign)
         {
             int remainder = source.Length % (Vector128<byte>.Count * 3);
 
@@ -190,7 +190,7 @@ internal static partial class SimdUtils
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
+        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
         {
             int remainder = source.Length % (Vector128<byte>.Count * 3);
 
@@ -223,7 +223,7 @@ internal static partial class SimdUtils
         ref Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
+        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
        {
             int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1);    // bit-hack for modulo
 
@@ -249,7 +249,7 @@ internal static partial class SimdUtils
         Span<float> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat)
+        if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
         {
             ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
             ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@@ -263,21 +263,21 @@ internal static partial class SimdUtils
                 ref Vector512<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector512<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector512_.Shuffle(vs0, control);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
+                vd0 = Vector512_.ShuffleNative(vs0, control);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
+                    Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
                 }
             }
         }
-        else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat)
+        else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
         {
             ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
             ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -291,21 +291,21 @@ internal static partial class SimdUtils
                 ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector256<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector256_.Shuffle(vs0, control);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
+                vd0 = Vector256_.ShuffleNative(vs0, control);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
+                    Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
                 }
             }
         }
-        else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)
+        else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat)
         {
             ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
             ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
@@ -319,17 +319,17 @@ internal static partial class SimdUtils
                 ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector128<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector128_.Shuffle(vs0, control);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
+                vd0 = Vector128_.ShuffleNative(vs0, control);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
+                    Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
                 }
             }
         }
@@ -341,7 +341,7 @@ internal static partial class SimdUtils
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte)
+        if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
         {
             Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
             Shuffle.MMShuffleSpan(ref temp, control);
@@ -359,17 +359,17 @@ internal static partial class SimdUtils
                 ref Vector512<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector512<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector512_.Shuffle(vs0, mask);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
+                vd0 = Vector512_.ShuffleNative(vs0, mask);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
+                    Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
                 }
             }
         }
@@ -391,21 +391,21 @@ internal static partial class SimdUtils
                 ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector256_.Shuffle(vs0, mask);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
+                vd0 = Vector256_.ShuffleNative(vs0, mask);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
+                    Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
                 }
             }
         }
-        else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)
+        else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
         {
             Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
             Shuffle.MMShuffleSpan(ref temp, control);
@@ -423,17 +423,17 @@ internal static partial class SimdUtils
                 ref Vector128<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                 ref Vector128<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
 
-                vd0 = Vector128_.Shuffle(vs0, mask);
-                Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
-                Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
-                Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
+                vd0 = Vector128_.ShuffleNative(vs0, mask);
+                Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
+                Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
+                Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
             }
 
             if (m > 0)
             {
                 for (nuint i = u; i < n; i++)
                 {
-                    Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
+                    Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
                 }
             }
         }
@@ -445,7 +445,7 @@ internal static partial class SimdUtils
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign)
+        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign)
         {
             Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
             Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@@ -472,15 +472,15 @@ internal static partial class SimdUtils
                 v2 = Vector128_.AlignRight(v2, v1, 8);
                 v1 = Vector128_.AlignRight(v1, v0, 12);
 
-                v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask);
-                v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask);
-                v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask);
-                v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask);
+                v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16), mask);
+                v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16), mask);
+                v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16), mask);
+                v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16), mask);
 
-                v0 = Vector128_.Shuffle(v0, maskE);
-                v1 = Vector128_.Shuffle(v1, maskSlice4Nx16);
-                v2 = Vector128_.Shuffle(v2, maskE);
-                v3 = Vector128_.Shuffle(v3, maskSlice4Nx16);
+                v0 = Vector128_.ShuffleNative(v0, maskE);
+                v1 = Vector128_.ShuffleNative(v1, maskSlice4Nx16);
+                v2 = Vector128_.ShuffleNative(v2, maskE);
+                v3 = Vector128_.ShuffleNative(v3, maskSlice4Nx16);
 
                 v0 = Vector128_.AlignRight(v1, v0, 4);
                 v3 = Vector128_.AlignRight(v3, v2, 12);
@@ -505,7 +505,7 @@ internal static partial class SimdUtils
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
+        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
         {
             Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
             Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@@ -534,10 +534,10 @@ internal static partial class SimdUtils
 
                 ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j);
 
-                vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask);
-                Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16) | fill, mask);
-                Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask);
-                Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask);
+                vd = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16) | fill, mask);
+                Unsafe.Add(ref vd, 1) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16) | fill, mask);
+                Unsafe.Add(ref vd, 2) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16) | fill, mask);
+                Unsafe.Add(ref vd, 3) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16) | fill, mask);
             }
         }
     }
@@ -548,7 +548,7 @@ internal static partial class SimdUtils
         Span<byte> destination,
         [ConstantExpected] byte control)
     {
-        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
+        if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
         {
             Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
             Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
@@ -574,10 +574,10 @@ internal static partial class SimdUtils
                 Vector128<byte> v2 = Unsafe.Add(ref vs, 2);
                 Vector128<byte> v3 = Unsafe.Add(ref vs, 3);
 
-                v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE);
-                v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16);
-                v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE);
-                v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16);
+                v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, mask), maskE);
+                v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, mask), maskSlice4Nx16);
+                v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, mask), maskE);
+                v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, mask), maskSlice4Nx16);
 
                 v0 = Vector128_.AlignRight(v1, v0, 4);
                 v3 = Vector128_.AlignRight(v3, v2, 12);
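Note: the `[ConstantExpected] byte control` threaded through these helpers encodes a four-element permutation in the classic x86 `_MM_SHUFFLE` layout, where each 2-bit field selects one source element. A minimal sketch of what a given control value computes, using only portable public APIs (the snippet is illustrative and not part of this diff; `MMShuffle` refers to ImageSharp's existing `SimdUtils.Shuffle.MMShuffle` helper):

```csharp
using System.Runtime.Intrinsics;

// control = (p3 << 6) | (p2 << 4) | (p1 << 2) | p0, mirroring x86 SHUFPS.
Vector128<float> v = Vector128.Create(10f, 20f, 30f, 40f);
const byte control = 0b_00_01_10_11; // MMShuffle(0, 1, 2, 3): reverse the elements

// Portable equivalent of Vector128_.ShuffleNative(v, control) for this control value:
Vector128<float> shuffled = Vector128.Shuffle(v, Vector128.Create(3, 2, 1, 0));
// shuffled == {40, 30, 20, 10}
```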
diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index 85b09b351..3471acbd3 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -26,7 +26,7 @@ internal static class Vector128_
     /// <summary>
     /// Gets a value indicating whether shuffle operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         get => Sse.IsSupported;
@@ -35,10 +35,10 @@ internal static class Vector128_
     /// <summary>
     /// Gets a value indicating whether shuffle operations are supported.
     /// </summary>
-    public static bool SupportsShuffleByte
+    public static bool SupportsShuffleNativeByte
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported;
+        get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported;
     }
 
     /// <summary>
@@ -66,7 +66,7 @@ internal static class Vector128_
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector128{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpected] byte control)
+    public static Vector128<float> ShuffleNative(Vector128<float> vector, [ConstantExpected] byte control)
     {
         if (Sse.IsSupported)
         {
@@ -89,7 +89,7 @@ internal static class Vector128_
     /// A new vector containing the values from <paramref name="vector"/> selected by the given <paramref name="indices"/>.
     /// </returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
+    public static Vector128<byte> ShuffleNative(Vector128<byte> vector, Vector128<byte> indices)
     {
         if (Ssse3.IsSupported)
         {
@@ -101,6 +101,11 @@ internal static class Vector128_
             return AdvSimd.Arm64.VectorTableLookup(vector, indices);
         }
 
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.Swizzle(vector, indices);
+        }
+
         ThrowUnreachableException();
         return default;
     }
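Note: `PackedSimd.Swizzle` is the WebAssembly counterpart of SSSE3 `PSHUFB` and ARM64 `TBL`, which is what lets `SupportsShuffleNativeByte` light up on WASM. The three instructions agree whenever every index is in range (0..15) but diverge for out-of-range indices, hence the `Native` suffix and the requirement that callers pass valid indices. An illustrative sketch of the in-range contract, using only portable APIs (not part of this diff):

```csharp
using System.Runtime.Intrinsics;

Vector128<byte> data = Vector128.Create(
    (byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
Vector128<byte> reverse = Vector128.Create(
    (byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);

// With all indices in 0..15, PSHUFB, TBL, and Swizzle all match the
// portable fallback below: {15, 14, ..., 1, 0}.
Vector128<byte> shuffled = Vector128.Shuffle(data, reverse);
```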
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index 893b6240d..8b22a5137 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -24,10 +24,10 @@ internal static class Vector256_
     /// <summary>
     /// Gets a value indicating whether shuffle byte operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx.IsSupported || Sse.IsSupported;
+        get => Avx.IsSupported;
     }
 
     /// <summary>
@@ -46,20 +46,13 @@ internal static class Vector256_
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector256{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpected] byte control)
+    public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
     {
         if (Avx.IsSupported)
         {
             return Avx.Shuffle(vector, vector, control);
         }
 
-        if (Sse.IsSupported)
-        {
-            Vector128<float> lower = vector.GetLower();
-            Vector128<float> upper = vector.GetUpper();
-            return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control));
-        }
-
         ThrowUnreachableException();
         return default;
     }
@@ -73,7 +66,7 @@ internal static class Vector256_
     /// </summary>
     /// <returns>The <see cref="Vector256{Byte}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<byte> Shuffle(Vector256<byte> vector, Vector256<byte> indices)
+    public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
     {
         if (Avx2.IsSupported)
         {
@@ -98,13 +91,6 @@ internal static class Vector256_
             return Avx.ConvertToVector256Int32(vector);
         }
 
-        if (Sse2.IsSupported)
-        {
-            Vector128<int> lower = Sse2.ConvertToVector128Int32(vector.GetLower());
-            Vector128<int> upper = Sse2.ConvertToVector128Int32(vector.GetUpper());
-            return Vector256.Create(lower, upper);
-        }
-
         Vector256<float> sign = vector & Vector256.Create(-0F);
         Vector256<float> val_2p23_f32 = sign | Vector256.Create(8388608F);
 
@@ -154,6 +140,27 @@ internal static class Vector256_
         return va + (vm0 * vm1);
     }
 
+    /// <summary>
+    /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
+    /// </summary>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{Int16}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> PackSignedSaturate(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackSignedSaturate(left, right);
+        }
+
+        Vector256<int> min = Vector256.Create((int)short.MinValue);
+        Vector256<int> max = Vector256.Create((int)short.MaxValue);
+        Vector256<int> leftClamped = Clamp(left, min, max);
+        Vector256<int> rightClamped = Clamp(right, min, max);
+        return Vector256.Narrow(leftClamped, rightClamped);
+    }
+
     /// <summary>
     /// Restricts a vector between a minimum and a maximum value.
     /// </summary>
@@ -166,6 +173,21 @@ internal static class Vector256_
     public static Vector256<T> Clamp<T>(Vector256<T> value, Vector256<T> min, Vector256<T> max)
         => Vector256.Min(Vector256.Max(value, min), max);
 
+    /// <summary>
+    /// Widens a <see cref="Vector128{Int16}"/> to a <see cref="Vector256{Int32}"/>.
+    /// </summary>
+    /// <param name="value">The vector to widen.</param>
+    /// <returns>The widened <see cref="Vector256{Int32}"/>.</returns>
+    public static Vector256<int> Widen(Vector128<short> value)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.ConvertToVector256Int32(value);
+        }
+
+        return Vector256.WidenLower(value.ToVector256());
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
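Note: `Avx2.PackSignedSaturate` (`vpackssdw`) interleaves its result per 128-bit lane as [left.lo, right.lo, left.hi, right.hi], while the portable `Clamp` + `Narrow` fallback produces linear order; the JPEG caller below compensates for the AVX2 ordering with a follow-up `Vector256.Shuffle`. A short sketch of the saturating behavior on the AVX2 path (illustrative, not part of the diff; input values chosen to force saturation):

```csharp
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

Vector256<int> row0 = Vector256.Create(40000, -40000, 3, 4, 5, 6, 7, 8);
Vector256<int> row1 = Vector256.Create(9, 10, 11, 12, 13, 14, 15, 16);

// Out-of-range values saturate: 40000 -> 32767, -40000 -> -32768. Elements
// arrive interleaved per 128-bit lane, which is why MultiplyIntoInt16Vector256
// follows with Vector256.Shuffle(..., Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).
Vector256<short> packed = Avx2.PackSignedSaturate(row0, row1);
```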
diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
index 3c773bc52..63de5dc10 100644
--- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@@ -24,16 +24,16 @@ internal static class Vector512_
     /// <summary>
     /// Gets a value indicating whether shuffle float operations are supported.
     /// </summary>
-    public static bool SupportsShuffleFloat
+    public static bool SupportsShuffleNativeFloat
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx512F.IsSupported || Avx.IsSupported;
+        get => Avx512F.IsSupported;
     }
 
     /// <summary>
     /// Gets a value indicating whether shuffle byte operations are supported.
     /// </summary>
-    public static bool SupportsShuffleByte
+    public static bool SupportsShuffleNativeByte
     {
         [MethodImpl(MethodImplOptions.AggressiveInlining)]
         get => Avx512BW.IsSupported;
@@ -46,20 +46,13 @@ internal static class Vector512_
     /// <param name="control">The shuffle control byte.</param>
     /// <returns>The <see cref="Vector512{Single}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector512<float> Shuffle(Vector512<float> vector, [ConstantExpected] byte control)
+    public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
     {
         if (Avx512F.IsSupported)
         {
             return Avx512F.Shuffle(vector, vector, control);
         }
 
-        if (Avx.IsSupported)
-        {
-            Vector256<float> lower = vector.GetLower();
-            Vector256<float> upper = vector.GetUpper();
-            return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control));
-        }
-
         ThrowUnreachableException();
         return default;
     }
@@ -73,7 +66,7 @@ internal static class Vector512_
     /// </summary>
     /// <returns>The <see cref="Vector512{Byte}"/>.</returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector512<byte> Shuffle(Vector512<byte> vector, Vector512<byte> indices)
+    public static Vector512<byte> ShuffleNative(Vector512<byte> vector, Vector512<byte> indices)
     {
         if (Avx512BW.IsSupported)
         {
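Note: with the composite SSE/AVX fallbacks deleted, each `ShuffleNative` overload now assumes exactly one ISA, so the `Supports*` gate and the overload must name the same vector width. A sketch of the caller pattern the utilities assume (assumes visibility of the internal `Vector512_` helper, i.e. code inside the ImageSharp assembly; in real callers `control` must also be a JIT-time constant):

```csharp
using System;
using System.Runtime.Intrinsics;

static Vector512<float> ShuffleOrNarrower(Vector512<float> source, byte control)
{
    if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
    {
        return Vector512_.ShuffleNative(source, control); // AVX-512F path only now
    }

    // Without the gate, ShuffleNative would reach ThrowUnreachableException();
    // real callers fall back to a narrower width instead, as in Shuffle4 above.
    throw new PlatformNotSupportedException();
}
```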
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
index 4e4133496..2aaf5c943 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
@@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.
 
-using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
@@ -60,109 +59,76 @@ internal partial struct Block8x8F
     }
 
     /// <summary>
-    /// Loads values from <paramref name="source"/> using extended AVX2 intrinsics.
+    /// Loads values from <paramref name="source"/> using <see cref="Vector256{T}"/> intrinsics.
     /// </summary>
     /// <param name="source">The source <see cref="Block8x8"/></param>
-    public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
+    public void LoadFromInt16ExtendedVector256(ref Block8x8 source)
    {
         DebugGuard.IsTrue(
-            Avx2.IsSupported,
-            "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
+            Vector256.IsHardwareAccelerated,
+            "LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!");
 
         ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
         ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
 
-        // Vector256<short>.Count == 16 on AVX2
+        // Vector256<short>.Count == 16
         // We can process 2 block rows in a single step
-        Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
-        Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
-        dRef = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
-
-        top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
-        bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
-        Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
-
-        top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
-        bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
-        Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
-
-        top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
-        bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
-        Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
-        Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
+        Vector256<int> top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef));
+        Vector256<int> bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
+        dRef = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom);
+
+        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
+        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
+        Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom);
+
+        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
+        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
+        Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom);
+
+        top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
+        bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
+        Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top);
+        Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom);
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
     private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
         => Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256<float>.Zero, max));
 
-    private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
+    private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
     {
-        DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
+        DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!");
"Avx2 support is required to run this operation!"); + DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!"); ref Vector256 aBase = ref a.V256_0; ref Vector256 bBase = ref b.V256_0; - ref Vector256 destRef = ref dest.V01; - Vector256 multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7); for (nuint i = 0; i < 8; i += 2) { - Vector256 row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0))); - Vector256 row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1))); + Vector256 row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0)); + Vector256 row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1)); - Vector256 row = Avx2.PackSignedSaturate(row0, row1); - row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16(); + Vector256 row = Vector256_.PackSignedSaturate(row0, row1); + row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16(); Unsafe.Add(ref destRef, i / 2) = row; } } - private void TransposeInPlace_Avx() + private void TransposeInPlaceVector256() { // https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536 - Vector256 r0 = Avx.InsertVector128( - this.V256_0, - Unsafe.As>(ref this.V4L), - 1); - - Vector256 r1 = Avx.InsertVector128( - this.V256_1, - Unsafe.As>(ref this.V5L), - 1); - - Vector256 r2 = Avx.InsertVector128( - this.V256_2, - Unsafe.As>(ref this.V6L), - 1); - - Vector256 r3 = Avx.InsertVector128( - this.V256_3, - Unsafe.As>(ref this.V7L), - 1); - - Vector256 r4 = Avx.InsertVector128( - Unsafe.As>(ref this.V0R).ToVector256(), - Unsafe.As>(ref this.V4R), - 1); - - Vector256 r5 = Avx.InsertVector128( - Unsafe.As>(ref this.V1R).ToVector256(), - Unsafe.As>(ref this.V5R), - 1); - - Vector256 r6 = Avx.InsertVector128( - Unsafe.As>(ref this.V2R).ToVector256(), - Unsafe.As>(ref this.V6R), - 1); - - Vector256 r7 = Avx.InsertVector128( - Unsafe.As>(ref this.V3R).ToVector256(), - Unsafe.As>(ref this.V7R), - 1); + Vector256 r0 = this.V256_0.WithUpper(this.V4L.AsVector128()); + Vector256 r1 = this.V256_1.WithUpper(this.V5L.AsVector128()); + Vector256 r2 = this.V256_2.WithUpper(this.V6L.AsVector128()); + Vector256 r3 = this.V256_3.WithUpper(this.V7L.AsVector128()); + Vector256 r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128()); + Vector256 r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128()); + Vector256 r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128()); + Vector256 r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128()); Vector256 t0 = Avx.UnpackLow(r0, r1); Vector256 t2 = Avx.UnpackLow(r2, r3); diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index 6f9b4fd16..a4a7d3ed0 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -277,9 +277,9 @@ internal partial struct Block8x8F : IEquatable /// The quantization table. 
diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
index 6f9b4fd16..a4a7d3ed0 100644
--- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
@@ -277,9 +277,9 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
     /// <param name="qt">The quantization table.</param>
     public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
-            MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
+            MultiplyIntoInt16Vector256(ref block, ref qt, ref dest);
             ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
         }
         else if (Vector128.IsHardwareAccelerated)
@@ -387,9 +387,9 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
     [MethodImpl(InliningOptions.ShortMethod)]
     public void LoadFrom(ref Block8x8 source)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
-            this.LoadFromInt16ExtendedAvx2(ref source);
+            this.LoadFromInt16ExtendedVector256(ref source);
             return;
         }
         else if (Vector128.IsHardwareAccelerated)
@@ -601,9 +601,9 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
     [MethodImpl(InliningOptions.ShortMethod)]
     public void TransposeInPlace()
     {
-        if (Avx.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
-            this.TransposeInPlace_Avx();
+            this.TransposeInPlaceVector256();
         }
         else
         {
diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs
index 7a8502c2c..25b5e973e 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs
@@ -32,7 +32,7 @@ public class Block8x8F_LoadFromInt16
     public void Scalar() => this.destination.LoadFromInt16Scalar(ref this.source);
 
     [Benchmark]
-    public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedAvx2(ref this.source);
+    public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedVector256(ref this.source);
 
     // RESULT:
     //        Method |     Mean |     Error |    StdDev | Scaled |
diff --git a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
index d1ade761c..ab205c8a3 100644
--- a/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs
@@ -433,7 +433,7 @@ public partial class Block8x8FTests : JpegFixture
         Block8x8 source = Block8x8.Load(data);
         Block8x8F dest = default;
 
-        dest.LoadFromInt16ExtendedAvx2(ref source);
+        dest.LoadFromInt16ExtendedVector256(ref source);
 
         for (int i = 0; i < Block8x8F.Size; i++)
         {
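Note: swapping `Avx2.IsSupported`/`Avx.IsSupported` for `Vector256.IsHardwareAccelerated` in `Quantize`, `LoadFrom`, and `TransposeInPlace` assumes that, on current x86-64 runtimes, 256-bit vector acceleration implies AVX2, so the AVX2-only `ZigZag.ApplyTransposingZigZagOrderingAvx2` call stays reachable exactly when it was before. A sketch of that invariant as a debug check (illustrative, not part of the diff):

```csharp
using System.Diagnostics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

// If Vector256 ops are accelerated on x86-64, AVX2 must be present; otherwise the
// Vector256 branch in Quantize would dispatch into an AVX2-only ZigZag path.
Debug.Assert(!Vector256.IsHardwareAccelerated || Avx2.IsSupported);
```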