diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index 5f85734e83..777de2dc9f 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -1069,4 +1069,24 @@ internal static class Numerics public static nuint Vector256Count(int length) where TVector : struct => (uint)length / (uint)Vector256.Count; + + /// + /// Gets the count of vectors that safely fit into the given span. + /// + /// The type of the vector. + /// The given span. + /// Count of vectors that safely fit into the span. + public static nuint Vector512Count(this Span span) + where TVector : struct + => (uint)span.Length / (uint)Vector512.Count; + + /// + /// Gets the count of vectors that safely fit into length. + /// + /// The type of the vector. + /// The given length. + /// Count of vectors that safely fit into the length. + public static nuint Vector512Count(int length) + where TVector : struct + => (uint)length / (uint)Vector512.Count; } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs new file mode 100644 index 0000000000..1b5a418dea --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs @@ -0,0 +1,77 @@ +// Copyright (c) Six Labors. +// Licensed under the Six Labors Split License. + +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp; + +internal static partial class SimdUtils +{ + /// + /// Converts all input -s to -s normalized into [0..1]. + /// should be the of the same size as , + /// but there are no restrictions on the span's length. 
+ /// + /// The source span of bytes + /// The destination span of floats + [MethodImpl(InliningOptions.ShortMethod)] + internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span destination) + { + DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); + + HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref destination); + + if (source.Length > 0) + { + ConvertByteToNormalizedFloatRemainder(source, destination); + } + } + + /// + /// Convert all values normalized into [0..1] from 'source' into 'destination' buffer of . + /// The values are scaled up into [0-255] and rounded, overflows are clamped. + /// should be the of the same size as , + /// but there are no restrictions on the span's length. + /// + /// The source span of floats + /// The destination span of bytes + [MethodImpl(InliningOptions.ShortMethod)] + internal static void NormalizedFloatToByteSaturate(ReadOnlySpan source, Span destination) + { + DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); + + HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination); + + if (source.Length > 0) + { + ConvertNormalizedFloatToByteRemainder(source, destination); + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan source, Span destination) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref float dBase = ref MemoryMarshal.GetReference(destination); + + for (int i = 0; i < source.Length; i++) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f; + } + } + + [MethodImpl(MethodImplOptions.NoInlining)] + private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan source, Span destination) + { + ref float sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(destination); + for (int i = 0; i < source.Length; i++) + { 
+ Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i)); + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static byte ConvertToByte(float f) => (byte)Numerics.Clamp((f * 255F) + 0.5F, 0, 255F); +} diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs deleted file mode 100644 index 3c2f189cf6..0000000000 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) Six Labors. -// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; - -// ReSharper disable MemberHidesStaticFromOuterClass -namespace SixLabors.ImageSharp; - -internal static partial class SimdUtils -{ - /// - /// Implementation methods based on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*). - /// Only accelerated only on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) - /// See: - /// https://github.com/dotnet/coreclr/pull/10662 - /// API Proposal: - /// https://github.com/dotnet/corefx/issues/15957 - /// - public static class ExtendedIntrinsics - { - public static bool IsAvailable { get; } = Vector.IsHardwareAccelerated; - - /// - /// Widen and convert a vector of values into 2 vectors of -s. - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static void ConvertToSingle( - Vector source, - out Vector dest1, - out Vector dest2) - { - Vector.Widen(source, out Vector i1, out Vector i2); - dest1 = Vector.ConvertToSingle(i1); - dest2 = Vector.ConvertToSingle(i2); - } - } -} diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs deleted file mode 100644 index fcf441c476..0000000000 --- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) Six Labors. 
-// Licensed under the Six Labors Split License. - -using System.Numerics; -using System.Runtime.CompilerServices; -using System.Runtime.InteropServices; - -// ReSharper disable MemberHidesStaticFromOuterClass -namespace SixLabors.ImageSharp; - -internal static partial class SimdUtils -{ - /// - /// Fallback implementation based on (128bit). - /// For , efficient software fallback implementations are present, - /// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P - /// - public static class FallbackIntrinsics128 - { - /// - /// as many elements as possible, slicing them down (keeping the remainder). - /// - [MethodImpl(InliningOptions.ShortMethod)] - internal static void ByteToNormalizedFloatReduce( - ref ReadOnlySpan source, - ref Span dest) - { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); - - int remainder = Numerics.Modulo4(source.Length); - int adjustedCount = source.Length - remainder; - - if (adjustedCount > 0) - { - ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]); - - source = source[adjustedCount..]; - dest = dest[adjustedCount..]; - } - } - - /// - /// Implementation of using . 
- /// - [MethodImpl(InliningOptions.ColdPath)] - internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest) - { - DebugVerifySpanInput(source, dest, 4); - - uint count = (uint)dest.Length / 4; - if (count == 0) - { - return; - } - - ref ByteVector4 sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source)); - ref Vector4 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest)); - - const float scale = 1f / 255f; - Vector4 d = default; - - for (nuint i = 0; i < count; i++) - { - ref ByteVector4 s = ref Unsafe.Add(ref sBase, i); - d.X = s.X; - d.Y = s.Y; - d.Z = s.Z; - d.W = s.W; - d *= scale; - Unsafe.Add(ref dBase, i) = d; - } - } - - [StructLayout(LayoutKind.Sequential)] - private struct ByteVector4 - { - public byte X; - public byte Y; - public byte Z; - public byte W; - } - } -} diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 487331360c..6f0b4b4e3c 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -752,17 +752,23 @@ internal static partial class SimdUtils /// /// as many elements as possible, slicing them down (keeping the remainder). /// + /// The source buffer. + /// The destination buffer. 
[MethodImpl(InliningOptions.ShortMethod)] internal static void ByteToNormalizedFloatReduce( ref ReadOnlySpan source, - ref Span dest) + ref Span destination) { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); + DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); if (Avx2.IsSupported || Sse2.IsSupported) { int remainder; - if (Avx2.IsSupported) + if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported) + { + remainder = Numerics.ModuloP2(source.Length, Vector512.Count); + } + else if (Avx2.IsSupported) { remainder = Numerics.ModuloP2(source.Length, Vector256.Count); } @@ -775,10 +781,10 @@ internal static partial class SimdUtils if (adjustedCount > 0) { - ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]); + ByteToNormalizedFloat(source[..adjustedCount], destination[..adjustedCount]); source = source[adjustedCount..]; - dest = dest[adjustedCount..]; + destination = destination[adjustedCount..]; } } } @@ -786,97 +792,128 @@ internal static partial class SimdUtils /// /// Implementation , which is faster on new RyuJIT runtime. /// + /// The source buffer. + /// The destination buffer. 
/// /// Implementation is based on MagicScaler code: /// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182 /// internal static unsafe void ByteToNormalizedFloat( ReadOnlySpan source, - Span dest) + Span destination) { - fixed (byte* sourceBase = source) + // Use the same gate as ByteToNormalizedFloatReduce: it only slices to the 512-bit width when Vector512 is hardware accelerated. + if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported) { - if (Avx2.IsSupported) - { - DebugVerifySpanInput(source, dest, Vector256.Count); - - nuint n = dest.Vector256Count(); + DebugVerifySpanInput(source, destination, Vector512.Count); - ref Vector256 destBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + nuint n = destination.Vector512Count(); - Vector256 scale = Vector256.Create(1 / (float)byte.MaxValue); + ref byte sourceBase = ref MemoryMarshal.GetReference(source); + ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); - for (nuint i = 0; i < n; i++) - { - nuint si = (uint)Vector256.Count * i; - Vector256 i0 = Avx2.ConvertToVector256Int32(sourceBase + si); - Vector256 i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256.Count); - Vector256 i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 2)); - Vector256 i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256.Count * 3)); - - Vector256 f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0)); - Vector256 f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1)); - Vector256 f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2)); - Vector256 f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3)); - - ref Vector256 d = ref Unsafe.Add(ref destBase, i * 4); - - d = f0; - Unsafe.Add(ref d, 1) = f1; - Unsafe.Add(ref d, 2) = f2; - Unsafe.Add(ref d, 3) = f3; - } + for (nuint i = 0; i < n; i++) + { + nuint si = (uint)Vector512.Count * i; + 
Vector512 i0 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si)); + Vector512 i1 = 
Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector512.Count)); + Vector512 i2 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512.Count * 2))); + Vector512 i3 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512.Count * 3))); + + // Declare multiplier on each line. Codegen is better. + Vector512 f0 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i0); + Vector512 f1 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i1); + Vector512 f2 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i2); + Vector512 f3 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i3); + + ref Vector512 d = ref Unsafe.Add(ref destinationBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; } - else + } + else if (Avx2.IsSupported) + { + DebugVerifySpanInput(source, destination, Vector256.Count); + + nuint n = destination.Vector256Count(); + + ref byte sourceBase = ref MemoryMarshal.GetReference(source); + ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); + + for (nuint i = 0; i < n; i++) { - // Sse - DebugVerifySpanInput(source, dest, Vector128.Count); + nuint si = (uint)Vector256.Count * i; + Vector256 i0 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si)); + Vector256 i1 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector256.Count)); + Vector256 i2 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector256.Count * 2))); + + // Ensure overreads past 16 byte boundary do not happen in debug due to lack of containment. 
+ ref ulong refULong = ref Unsafe.As(ref Unsafe.Add(ref sourceBase, si)); + Vector256 i3 = Avx2.ConvertToVector256Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refULong, 3)).AsByte()); + + // Declare multiplier on each line. Codegen is better. + Vector256 f0 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i0); + Vector256 f1 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i1); + Vector256 f2 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i2); + Vector256 f3 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i3); + + ref Vector256 d = ref Unsafe.Add(ref destinationBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + else if (Sse2.IsSupported || AdvSimd.IsSupported) + { + DebugVerifySpanInput(source, destination, Vector128.Count); - nuint n = dest.Vector128Count(); + nuint n = destination.Vector128Count(); + + ref byte sourceBase = ref MemoryMarshal.GetReference(source); + ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination)); - ref Vector128 destBase = - ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + Vector128 scale = Vector128.Create(1 / (float)byte.MaxValue); + Vector128 zero = Vector128.Zero; - Vector128 scale = Vector128.Create(1 / (float)byte.MaxValue); - Vector128 zero = Vector128.Zero; + for (nuint i = 0; i < n; i++) + { + nuint si = (uint)Vector128.Count * i; - for (nuint i = 0; i < n; i++) + Vector128 i0, i1, i2, i3; + if (Sse41.IsSupported) { - nuint si = (uint)Vector128.Count * i; - - Vector128 i0, i1, i2, i3; - if (Sse41.IsSupported) - { - i0 = Sse41.ConvertToVector128Int32(sourceBase + si); - i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128.Count); - i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 2)); - i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128.Count * 3)); - } - 
else - { - Vector128 b = Sse2.LoadVector128(sourceBase + si); - Vector128 s0 = Sse2.UnpackLow(b, zero).AsInt16(); - Vector128 s1 = Sse2.UnpackHigh(b, zero).AsInt16(); - - i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32(); - i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32(); - i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32(); - i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32(); - } - - Vector128 f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0)); - Vector128 f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1)); - Vector128 f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2)); - Vector128 f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3)); - - ref Vector128 d = ref Unsafe.Add(ref destBase, i * 4); - - d = f0; - Unsafe.Add(ref d, 1) = f1; - Unsafe.Add(ref d, 2) = f2; - Unsafe.Add(ref d, 3) = f3; + ref int refInt = ref Unsafe.As(ref Unsafe.Add(ref sourceBase, si)); + + i0 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(refInt).AsByte()); + i1 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 1)).AsByte()); + i2 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 2)).AsByte()); + i3 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 3)).AsByte()); } + else + { + // Sse2, AdvSimd + Vector128 b = Vector128.LoadUnsafe(ref sourceBase, si); + (Vector128 s0, Vector128 s1) = Vector128.Widen(b); + (i0, i1) = Vector128.Widen(s0.AsInt16()); + (i2, i3) = Vector128.Widen(s1.AsInt16()); + } + + Vector128 f0 = scale * Vector128.ConvertToSingle(i0); + Vector128 f1 = scale * Vector128.ConvertToSingle(i1); + Vector128 f2 = scale * Vector128.ConvertToSingle(i2); + Vector128 f3 = scale * Vector128.ConvertToSingle(i3); + + ref Vector128 d = ref Unsafe.Add(ref destinationBase, i * 4); + + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.cs 
b/src/ImageSharp/Common/Helpers/SimdUtils.cs index 002c1f8da0..0279e57cc6 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -22,13 +22,6 @@ internal static partial class SimdUtils public static bool HasVector8 { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; - /// - /// Gets a value indicating whether code is being JIT-ed to SSE instructions - /// where float and integer registers are of size 128 byte. - /// - public static bool HasVector4 { get; } = - Vector.IsHardwareAccelerated && Vector.Count == 4; - /// /// Transform all scalars in 'v' in a way that converting them to would have rounding semantics. /// @@ -69,96 +62,6 @@ internal static partial class SimdUtils } } - /// - /// Converts all input -s to -s normalized into [0..1]. - /// should be the of the same size as , - /// but there are no restrictions on the span's length. - /// - /// The source span of bytes - /// The destination span of floats - [MethodImpl(InliningOptions.ShortMethod)] - internal static void ByteToNormalizedFloat(ReadOnlySpan source, Span dest) - { - DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); - - HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest); - - // Also deals with the remainder from previous conversions: - FallbackIntrinsics128.ByteToNormalizedFloatReduce(ref source, ref dest); - - // Deal with the remainder: - if (source.Length > 0) - { - ConvertByteToNormalizedFloatRemainder(source, dest); - } - } - - /// - /// Convert all values normalized into [0..1] from 'source' into 'destination' buffer of . - /// The values are scaled up into [0-255] and rounded, overflows are clamped. - /// should be the of the same size as , - /// but there are no restrictions on the span's length. 
- /// - /// The source span of floats - /// The destination span of bytes - [MethodImpl(InliningOptions.ShortMethod)] - internal static void NormalizedFloatToByteSaturate(ReadOnlySpan source, Span destination) - { - DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); - HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination); - - // Deal with the remainder: - if (source.Length > 0) - { - ConvertNormalizedFloatToByteRemainder(source, destination); - } - } - - [MethodImpl(InliningOptions.ColdPath)] - private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan source, Span destination) - { - ref byte sBase = ref MemoryMarshal.GetReference(source); - ref float dBase = ref MemoryMarshal.GetReference(destination); - - // There are at most 3 elements at this point, having a for loop is overkill. - // Let's minimize the no. of instructions! - switch (source.Length) - { - case 3: - Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2) / 255f; - goto case 2; - case 2: - Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1) / 255f; - goto case 1; - case 1: - dBase = sBase / 255f; - break; - } - } - - [MethodImpl(MethodImplOptions.NoInlining)] - private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan source, Span destination) - { - ref float sBase = ref MemoryMarshal.GetReference(source); - ref byte dBase = ref MemoryMarshal.GetReference(destination); - for (int i = 0; i < source.Length; i++) - { - Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i)); - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static byte ConvertToByte(float f) => (byte)Numerics.Clamp((f * 255F) + 0.5F, 0, 255F); - - [Conditional("DEBUG")] - private static void VerifyHasVector8(string operation) - { - if (!HasVector8) - { - throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!"); - } - } - [Conditional("DEBUG")] private static void 
DebugVerifySpanInput(ReadOnlySpan source, Span dest, int shouldBeDivisibleBy) { diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index a07fa8ca6e..b6dd319f06 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -26,7 +26,7 @@ internal static class Vector128Utilities public static bool SupportsShuffleFloat { [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse.IsSupported || AdvSimd.IsSupported; + get => Sse.IsSupported; } /// @@ -70,17 +70,6 @@ internal static class Vector128Utilities return Sse.Shuffle(vector, vector, control); } - if (AdvSimd.IsSupported) - { -#pragma warning disable CA1857 // A constant is expected for the parameter - Vector128 result = Vector128.Create(AdvSimd.Extract(vector, (byte)(control & 0x3))); - result = AdvSimd.Insert(result, 1, AdvSimd.Extract(vector, (byte)((control >> 2) & 0x3))); - result = AdvSimd.Insert(result, 2, AdvSimd.Extract(vector, (byte)((control >> 4) & 0x3))); - result = AdvSimd.Insert(result, 3, AdvSimd.Extract(vector, (byte)((control >> 6) & 0x3))); -#pragma warning restore CA1857 // A constant is expected for the parameter - return result; - } - ThrowUnreachableException(); return default; } diff --git a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs index d432e82d24..018df5f9f4 100644 --- a/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs +++ b/src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs @@ -386,29 +386,33 @@ internal partial struct Block8x8F : IEquatable public void LoadFromInt16ExtendedAvx2(ref Block8x8 source) { DebugGuard.IsTrue( - SimdUtils.HasVector8, + Avx2.IsSupported, "LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!"); - ref Vector sRef = ref Unsafe.As>(ref source); - ref Vector dRef = ref Unsafe.As>(ref this); + ref short sRef = ref Unsafe.As(ref source); + ref 
Vector256 dRef = ref Unsafe.As>(ref this); - // Vector.Count == 16 on AVX2 + // Vector256.Count == 16 on AVX2 // We can process 2 block rows in a single step - SimdUtils.ExtendedIntrinsics.ConvertToSingle(sRef, out Vector top, out Vector bottom); - dRef = top; - Unsafe.Add(ref dRef, 1) = bottom; - - SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 1), out top, out bottom); - Unsafe.Add(ref dRef, 2) = top; - Unsafe.Add(ref dRef, 3) = bottom; - - SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 2), out top, out bottom); - Unsafe.Add(ref dRef, 4) = top; - Unsafe.Add(ref dRef, 5) = bottom; - - SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 3), out top, out bottom); - Unsafe.Add(ref dRef, 6) = top; - Unsafe.Add(ref dRef, 7) = bottom; + Vector256 top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef)); + Vector256 bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256.Count)); + dRef = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 2))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 3))); + Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 4))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 5))); + Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom); + + top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 6))); + bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256.Count * 7))); + Unsafe.Add(ref dRef, 6) = 
Avx.ConvertToVector256Single(top); + Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom); } /// diff --git a/src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs b/src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs index 8616ecb3b1..9e649f3c08 100644 --- a/src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs +++ b/src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs @@ -5,6 +5,7 @@ using System.Buffers; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; namespace SixLabors.ImageSharp.PixelFormats.Utils; @@ -31,74 +32,86 @@ internal static partial class Vector4Converters /// Provides an efficient default implementation for /// The method works by internally converting to a therefore it's not applicable for that type! /// - [MethodImpl(InliningOptions.ShortMethod)] + /// The type of pixel format. + /// The configuration. + /// The pixel operations instance. + /// The source buffer. + /// The destination buffer. + /// The conversion modifier flags. 
+ [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static void ToVector4( Configuration configuration, PixelOperations pixelOperations, - ReadOnlySpan sourcePixels, - Span destVectors, + ReadOnlySpan source, + Span destination, PixelConversionModifiers modifiers) where TPixel : unmanaged, IPixel { Guard.NotNull(configuration, nameof(configuration)); - Guard.DestinationShouldNotBeTooShort(sourcePixels, destVectors, nameof(destVectors)); + Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); - int count = sourcePixels.Length; + int count = source.Length; // Not worth for small buffers: if (count < Vector4ConversionThreshold) { - Default.UnsafeToVector4(sourcePixels, destVectors, modifiers); + Default.UnsafeToVector4(source, destination, modifiers); return; } - // Using the last quarter of 'destVectors' as a temporary buffer to avoid allocation: + // Using the last quarter of 'destination' as a temporary buffer to avoid allocation: int countWithoutLastItem = count - 1; - ReadOnlySpan reducedSource = sourcePixels[..countWithoutLastItem]; - Span lastQuarterOfDestBuffer = MemoryMarshal.Cast(destVectors).Slice((3 * count) + 1, countWithoutLastItem); - pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestBuffer); + ReadOnlySpan reducedSource = source[..countWithoutLastItem]; + Span lastQuarterOfDestination = MemoryMarshal.Cast(destination).Slice((3 * count) + 1, countWithoutLastItem); + pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestination); - // 'destVectors' and 'lastQuarterOfDestBuffer' are overlapping buffers, + // 'destination' and 'lastQuarterOfDestination' are overlapping buffers, // but we are always reading/writing at different positions: SimdUtils.ByteToNormalizedFloat( - MemoryMarshal.Cast(lastQuarterOfDestBuffer), - MemoryMarshal.Cast(destVectors[..countWithoutLastItem])); + MemoryMarshal.Cast(lastQuarterOfDestination), + MemoryMarshal.Cast(destination[..countWithoutLastItem])); 
- destVectors[countWithoutLastItem] = sourcePixels[countWithoutLastItem].ToVector4(); + destination[countWithoutLastItem] = source[countWithoutLastItem].ToVector4(); // TODO: Investigate optimized 1-pass approach! - ApplyForwardConversionModifiers(destVectors, modifiers); + ApplyForwardConversionModifiers(destination, modifiers); } /// /// Provides an efficient default implementation for /// The method is works by internally converting to a therefore it's not applicable for that type! /// - [MethodImpl(InliningOptions.ShortMethod)] + /// The type of pixel format. + /// The configuration. + /// The pixel operations instance. + /// The source buffer. + /// The destination buffer. + /// The conversion modifier flags. + [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static void FromVector4( Configuration configuration, PixelOperations pixelOperations, - Span sourceVectors, - Span destPixels, + Span source, + Span destination, PixelConversionModifiers modifiers) where TPixel : unmanaged, IPixel { Guard.NotNull(configuration, nameof(configuration)); - Guard.DestinationShouldNotBeTooShort(sourceVectors, destPixels, nameof(destPixels)); + Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination)); - int count = sourceVectors.Length; + int count = source.Length; // Not worth for small buffers: if (count < Vector4ConversionThreshold) { - Default.UnsafeFromVector4(sourceVectors, destPixels, modifiers); + Default.UnsafeFromVector4(source, destination, modifiers); return; } // TODO: Investigate optimized 1-pass approach! 
- ApplyBackwardConversionModifiers(sourceVectors, modifiers); + ApplyBackwardConversionModifiers(source, modifiers); // For the opposite direction it's not easy to implement the trick used in RunRgba32CompatibleToVector4Conversion, // so let's allocate a temporary buffer as usually: @@ -106,20 +119,30 @@ internal static partial class Vector4Converters Span tempSpan = tempBuffer.Memory.Span; SimdUtils.NormalizedFloatToByteSaturate( - MemoryMarshal.Cast(sourceVectors), + MemoryMarshal.Cast(source), MemoryMarshal.Cast(tempSpan)); - pixelOperations.FromRgba32(configuration, tempSpan, destPixels); + pixelOperations.FromRgba32(configuration, tempSpan, destination); } private static int CalculateVector4ConversionThreshold() { - if (!Vector.IsHardwareAccelerated) + if (!Vector128.IsHardwareAccelerated) { return int.MaxValue; } - return SimdUtils.ExtendedIntrinsics.IsAvailable && SimdUtils.HasVector8 ? 256 : 128; + if (Vector512.IsHardwareAccelerated) + { + return 512; + } + + if (Vector256.IsHardwareAccelerated) + { + return 256; + } + + return 128; } } } diff --git a/tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs index 3b4360b161..0df8d9818c 100644 --- a/tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs @@ -14,9 +14,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk; public abstract class ToVector4 where TPixel : unmanaged, IPixel { - protected IMemoryOwner source; + protected IMemoryOwner Source { get; set; } - protected IMemoryOwner destination; + protected IMemoryOwner Destination { get; set; } protected Configuration Configuration => Configuration.Default; @@ -26,22 +26,22 @@ public abstract class ToVector4 [GlobalSetup] public void Setup() { - this.source = this.Configuration.MemoryAllocator.Allocate(this.Count); - this.destination = this.Configuration.MemoryAllocator.Allocate(this.Count); + this.Source = this.Configuration.MemoryAllocator.Allocate(this.Count); + 
this.Destination = this.Configuration.MemoryAllocator.Allocate(this.Count); } [GlobalCleanup] public void Cleanup() { - this.source.Dispose(); - this.destination.Dispose(); + this.Source.Dispose(); + this.Destination.Dispose(); } // [Benchmark] public void Naive() { - Span s = this.source.GetSpan(); - Span d = this.destination.GetSpan(); + Span s = this.Source.GetSpan(); + Span d = this.Destination.GetSpan(); for (int i = 0; i < this.Count; i++) { @@ -50,11 +50,8 @@ public abstract class ToVector4 } [Benchmark] - public void PixelOperations_Specialized() - { - PixelOperations.Instance.ToVector4( + public void PixelOperations_Specialized() => PixelOperations.Instance.ToVector4( this.Configuration, - this.source.GetSpan(), - this.destination.GetSpan()); - } + this.Source.GetSpan(), + this.Destination.GetSpan()); } diff --git a/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs b/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs index 934a17dc94..6499632b69 100644 --- a/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs +++ b/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs @@ -16,8 +16,8 @@ public class ToVector4_Bgra32 : ToVector4 { new PixelOperations().ToVector4( this.Configuration, - this.source.GetSpan(), - this.destination.GetSpan()); + this.Source.GetSpan(), + this.Destination.GetSpan()); } // RESULTS: diff --git a/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs b/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs index d5d6e31b5d..adedabf8f5 100644 --- a/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs +++ b/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs @@ -16,8 +16,8 @@ public class ToVector4_Rgb24 : ToVector4 { new PixelOperations().ToVector4( this.Configuration, - this.source.GetSpan(), - this.destination.GetSpan()); + this.Source.GetSpan(), + this.Destination.GetSpan()); } } diff --git a/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs b/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs index 9c7ecbc491..113793a033 100644 
--- a/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs +++ b/tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs @@ -14,27 +14,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk; [Config(typeof(Config.Short))] public class ToVector4_Rgba32 : ToVector4 { - [Benchmark] - public void FallbackIntrinsics128() - { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - - SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(sBytes, dFloats); - } - [Benchmark] public void PixelOperations_Base() => new PixelOperations().ToVector4( this.Configuration, - this.source.GetSpan(), - this.destination.GetSpan()); + this.Source.GetSpan(), + this.Destination.GetSpan()); [Benchmark] public void HwIntrinsics() { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + Span sBytes = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.Destination.GetSpan()); SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sBytes, dFloats); } @@ -42,8 +33,8 @@ public class ToVector4_Rgba32 : ToVector4 // [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + Span sBytes = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.Destination.GetSpan()); nuint n = (uint)dFloats.Length / (uint)Vector.Count; @@ -67,14 +58,14 @@ public class ToVector4_Rgba32 : ToVector4 } n = (uint)(dFloats.Length / Vector.Count); - var scale = new Vector(1f / 255f); + Vector scale = new(1f / 255f); for (nuint i = 0; i < n; i++) { ref Vector dRef = ref Unsafe.Add(ref destBase, i); - var du = Vector.AsVectorInt32(dRef); - var v = Vector.ConvertToSingle(du); + Vector du = Vector.AsVectorInt32(dRef); + Vector v = Vector.ConvertToSingle(du); v *= scale; dRef = v; @@ -84,14 
+75,14 @@ public class ToVector4_Rgba32 : ToVector4 // [Benchmark] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { - Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); - Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + Span sBytes = MemoryMarshal.Cast(this.Source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.Destination.GetSpan()); nuint n = (uint)dFloats.Length / (uint)Vector.Count; ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dFloats)); - var scale = new Vector(1f / 255f); + Vector scale = new(1f / 255f); for (nuint i = 0; i < n; i++) { @@ -117,8 +108,8 @@ public class ToVector4_Rgba32 : ToVector4 [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector ConvertToNormalizedSingle(Vector u, Vector scale) { - var vi = Vector.AsVectorInt32(u); - var v = Vector.ConvertToSingle(vi); + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); v *= scale; return v; } @@ -151,4 +142,30 @@ public class ToVector4_Rgba32 : ToVector4 PixelOperations_Base | Core | 2048 | 6,752.68 ns | 272.820 ns | 15.4148 ns | 1.67 | 0.02 | - | 24 B | PixelOperations_Specialized | Core | 2048 | 1,126.13 ns | 79.192 ns | 4.4745 ns |!! 0.28 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock! 
*/ + + /* + BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3) + 11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores + .NET SDK 8.0.200-preview.23624.5 + [Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2 + Job-DFEQJT : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2 + + Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3 + LaunchCount=1 WarmupCount=3 + + | Method | Count | Mean | Error | StdDev | Allocated | + |---------------------------- |------ |------------:|-----------:|----------:|----------:| + | FallbackIntrinsics128 | 64 | 139.66 ns | 27.429 ns | 1.503 ns | - | + | PixelOperations_Base | 64 | 124.65 ns | 29.653 ns | 1.625 ns | - | + | HwIntrinsics | 64 | 18.16 ns | 4.731 ns | 0.259 ns | - | + | PixelOperations_Specialized | 64 | 27.94 ns | 15.220 ns | 0.834 ns | - | + | FallbackIntrinsics128 | 256 | 525.07 ns | 34.397 ns | 1.885 ns | - | + | PixelOperations_Base | 256 | 464.17 ns | 46.897 ns | 2.571 ns | - | + | HwIntrinsics | 256 | 43.88 ns | 4.525 ns | 0.248 ns | - | + | PixelOperations_Specialized | 256 | 55.57 ns | 14.587 ns | 0.800 ns | - | + | FallbackIntrinsics128 | 2048 | 4,148.44 ns | 476.583 ns | 26.123 ns | - | + | PixelOperations_Base | 2048 | 3,608.42 ns | 66.293 ns | 3.634 ns | - | + | HwIntrinsics | 2048 | 361.42 ns | 35.576 ns | 1.950 ns | - | + | PixelOperations_Specialized | 2048 | 374.82 ns | 33.371 ns | 1.829 ns | - | + */ } diff --git a/tests/ImageSharp.Benchmarks/LoadResizeSave/README.md b/tests/ImageSharp.Benchmarks/LoadResizeSave/README.md index 6cb48eb48c..98f472241f 100644 --- a/tests/ImageSharp.Benchmarks/LoadResizeSave/README.md +++ b/tests/ImageSharp.Benchmarks/LoadResizeSave/README.md @@ -1,4 +1,4 @@ -The benchmarks have been adapted from the +The benchmarks have been adapted from the [PhotoSauce's MemoryStress project](https://github.com/saucecontrol/core-imaging-playground/tree/beeees/MemoryStress). 
### Setup diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index e9e4550b04..36b3012640 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -3,6 +3,7 @@ using System.Numerics; using System.Runtime.CompilerServices; +using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.Tests.TestUtilities; @@ -117,16 +118,10 @@ public partial class SimdUtilsTests public static readonly TheoryData ArbitraryArraySizes = new() { 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520 }; [Theory] - [MemberData(nameof(ArraySizesDivisibleBy4))] - public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count) => TestImpl_BulkConvertByteToNormalizedFloat( - count, - (s, d) => SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(s.Span, d.Span)); - - [Theory] - [MemberData(nameof(ArraySizesDivisibleBy32))] + [MemberData(nameof(ArraySizesDivisibleBy64))] public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count) { - if (!Sse2.IsSupported) + if (!Sse2.IsSupported && !AdvSimd.IsSupported) { return; } @@ -138,7 +133,7 @@ public partial class SimdUtilsTests FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, count, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41); + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41); } [Theory] @@ -160,32 +155,11 @@ public partial class SimdUtilsTests Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } - [Theory] - [InlineData(1234)] - public void ExtendedIntrinsics_ConvertToSingle(short scale) - { - int n = Vector.Count; - short[] sData = new Random(scale).GenerateRandomInt16Array(2 * n, (short)-scale, scale); - float[] fData = sData.Select(u => (float)u).ToArray(); - - 
Vector source = new(sData); - - Vector expected1 = new(fData, 0); - Vector expected2 = new(fData, n); - - // Act: - SimdUtils.ExtendedIntrinsics.ConvertToSingle(source, out Vector actual1, out Vector actual2); - - // Assert: - Assert.Equal(expected1, actual1); - Assert.Equal(expected2, actual2); - } - [Theory] [MemberData(nameof(ArraySizesDivisibleBy64))] public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count) { - if (!Sse2.IsSupported) + if (!Sse2.IsSupported && !AdvSimd.IsSupported) { return; }