|
|
|
@ -17,8 +17,13 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
public static class HwIntrinsics |
|
|
|
{ |
|
|
|
#pragma warning disable SA1117 // Parameters should be on same line or separate lines
|
|
|
|
#pragma warning disable SA1137 // Elements should have the same indentation
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] // too much IL for JIT to inline, so give a hint
|
|
|
|
public static Vector256<int> PermuteMaskDeinterleave8x32() => Vector256.Create(0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsInt32(); |
|
|
|
public static Vector256<int> PermuteMaskDeinterleave8x32() => Vector256.Create(0, 4, 1, 5, 2, 6, 3, 7); |
|
|
|
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
public static Vector512<int> PermuteMaskDeinterleave16x32() => Vector512.Create(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); |
|
|
|
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
public static Vector256<uint> PermuteMaskEvenOdd8x32() => Vector256.Create(0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); |
|
|
|
@ -38,17 +43,18 @@ internal static partial class SimdUtils |
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
private static Vector128<byte> ShuffleMaskSlice4Nx16() => Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 0x80, 0x80, 0x80, 0x80); |
|
|
|
|
|
|
|
#pragma warning disable SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
private static Vector256<byte> ShuffleMaskShiftAlpha() => Vector256.Create((byte) |
|
|
|
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, |
|
|
|
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); |
|
|
|
private static Vector256<byte> ShuffleMaskShiftAlpha() => Vector256.Create( |
|
|
|
(byte)0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15, |
|
|
|
0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15); |
|
|
|
|
|
|
|
[MethodImpl(MethodImplOptions.AggressiveInlining)] |
|
|
|
public static Vector256<uint> PermuteMaskShiftAlpha8x32() => Vector256.Create( |
|
|
|
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, |
|
|
|
5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); |
|
|
|
#pragma warning restore SA1003, SA1116, SA1117 // Parameters should be on same line or separate lines
|
|
|
|
public static Vector256<uint> PermuteMaskShiftAlpha8x32() |
|
|
|
=> Vector256.Create( |
|
|
|
0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, |
|
|
|
5, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0).AsUInt32(); |
|
|
|
#pragma warning restore SA1137 // Elements should have the same indentation
|
|
|
|
#pragma warning restore SA1117 // Parameters should be on same line or separate lines
|
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
|
|
|
|
@ -795,7 +801,7 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
if (Avx2.IsSupported) |
|
|
|
{ |
|
|
|
VerifySpanInput(source, dest, Vector256<byte>.Count); |
|
|
|
DebugVerifySpanInput(source, dest, Vector256<byte>.Count); |
|
|
|
|
|
|
|
nuint n = dest.Vector256Count<byte>(); |
|
|
|
|
|
|
|
@ -828,7 +834,7 @@ internal static partial class SimdUtils |
|
|
|
else |
|
|
|
{ |
|
|
|
// Sse
|
|
|
|
VerifySpanInput(source, dest, Vector128<byte>.Count); |
|
|
|
DebugVerifySpanInput(source, dest, Vector128<byte>.Count); |
|
|
|
|
|
|
|
nuint n = dest.Vector128Count<byte>(); |
|
|
|
|
|
|
|
@ -881,17 +887,24 @@ internal static partial class SimdUtils |
|
|
|
/// <summary>
|
|
|
|
/// <see cref="NormalizedFloatToByteSaturate"/> as many elements as possible, slicing them down (keeping the remainder).
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source buffer.</param>
|
|
|
|
/// <param name="destination">The destination buffer.</param>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
internal static void NormalizedFloatToByteSaturateReduce( |
|
|
|
ref ReadOnlySpan<float> source, |
|
|
|
ref Span<byte> dest) |
|
|
|
ref Span<byte> destination) |
|
|
|
{ |
|
|
|
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!"); |
|
|
|
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!"); |
|
|
|
|
|
|
|
if (Avx2.IsSupported || Sse2.IsSupported) |
|
|
|
if (Avx512BW.IsSupported || Avx2.IsSupported || Sse2.IsSupported || AdvSimd.IsSupported) |
|
|
|
{ |
|
|
|
int remainder; |
|
|
|
if (Avx2.IsSupported) |
|
|
|
|
|
|
|
if (Avx512BW.IsSupported) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count); |
|
|
|
} |
|
|
|
else if (Avx2.IsSupported) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count); |
|
|
|
} |
|
|
|
@ -906,10 +919,10 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
NormalizedFloatToByteSaturate( |
|
|
|
source[..adjustedCount], |
|
|
|
dest[..adjustedCount]); |
|
|
|
destination[..adjustedCount]); |
|
|
|
|
|
|
|
source = source[adjustedCount..]; |
|
|
|
dest = dest[adjustedCount..]; |
|
|
|
destination = destination[adjustedCount..]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -917,25 +930,59 @@ internal static partial class SimdUtils |
|
|
|
/// <summary>
|
|
|
|
/// Implementation of <see cref="SimdUtils.NormalizedFloatToByteSaturate"/>, which is faster on new .NET runtime.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source buffer.</param>
|
|
|
|
/// <param name="destination">The destination buffer.</param>
|
|
|
|
/// <remarks>
|
|
|
|
/// Implementation is based on MagicScaler code:
|
|
|
|
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L541-L622
|
|
|
|
/// </remarks>
|
|
|
|
internal static void NormalizedFloatToByteSaturate( |
|
|
|
ReadOnlySpan<float> source, |
|
|
|
Span<byte> dest) |
|
|
|
Span<byte> destination) |
|
|
|
{ |
|
|
|
if (Avx2.IsSupported) |
|
|
|
if (Avx512BW.IsSupported) |
|
|
|
{ |
|
|
|
VerifySpanInput(source, dest, Vector256<byte>.Count); |
|
|
|
DebugVerifySpanInput(source, destination, Vector512<byte>.Count); |
|
|
|
|
|
|
|
nuint n = destination.Vector512Count<byte>(); |
|
|
|
|
|
|
|
nuint n = dest.Vector256Count<byte>(); |
|
|
|
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector512<byte> destinationBase = ref Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
ref Vector256<float> sourceBase = |
|
|
|
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
Vector512<float> scale = Vector512.Create((float)byte.MaxValue); |
|
|
|
Vector512<int> mask = PermuteMaskDeinterleave16x32(); |
|
|
|
|
|
|
|
ref Vector256<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
for (nuint i = 0; i < n; i++) |
|
|
|
{ |
|
|
|
ref Vector512<float> s = ref Unsafe.Add(ref sourceBase, i * 4); |
|
|
|
|
|
|
|
Vector512<float> f0 = scale * s; |
|
|
|
Vector512<float> f1 = scale * Unsafe.Add(ref s, 1); |
|
|
|
Vector512<float> f2 = scale * Unsafe.Add(ref s, 2); |
|
|
|
Vector512<float> f3 = scale * Unsafe.Add(ref s, 3); |
|
|
|
|
|
|
|
Vector512<int> w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0); |
|
|
|
Vector512<int> w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1); |
|
|
|
Vector512<int> w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2); |
|
|
|
Vector512<int> w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3); |
|
|
|
|
|
|
|
Vector512<short> u0 = Avx512BW.PackSignedSaturate(w0, w1); |
|
|
|
Vector512<short> u1 = Avx512BW.PackSignedSaturate(w2, w3); |
|
|
|
Vector512<byte> b = Avx512BW.PackUnsignedSaturate(u0, u1); |
|
|
|
b = Avx512F.PermuteVar16x32(b.AsInt32(), mask).AsByte(); |
|
|
|
|
|
|
|
Unsafe.Add(ref destinationBase, i) = b; |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
if (Avx2.IsSupported) |
|
|
|
{ |
|
|
|
DebugVerifySpanInput(source, destination, Vector256<byte>.Count); |
|
|
|
|
|
|
|
nuint n = destination.Vector256Count<byte>(); |
|
|
|
|
|
|
|
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector256<byte> destinationBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
Vector256<float> scale = Vector256.Create((float)byte.MaxValue); |
|
|
|
Vector256<int> mask = PermuteMaskDeinterleave8x32(); |
|
|
|
@ -944,36 +991,33 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4); |
|
|
|
|
|
|
|
Vector256<float> f0 = Avx.Multiply(scale, s); |
|
|
|
Vector256<float> f1 = Avx.Multiply(scale, Unsafe.Add(ref s, 1)); |
|
|
|
Vector256<float> f2 = Avx.Multiply(scale, Unsafe.Add(ref s, 2)); |
|
|
|
Vector256<float> f3 = Avx.Multiply(scale, Unsafe.Add(ref s, 3)); |
|
|
|
Vector256<float> f0 = scale * s; |
|
|
|
Vector256<float> f1 = scale * Unsafe.Add(ref s, 1); |
|
|
|
Vector256<float> f2 = scale * Unsafe.Add(ref s, 2); |
|
|
|
Vector256<float> f3 = scale * Unsafe.Add(ref s, 3); |
|
|
|
|
|
|
|
Vector256<int> w0 = Avx.ConvertToVector256Int32(f0); |
|
|
|
Vector256<int> w1 = Avx.ConvertToVector256Int32(f1); |
|
|
|
Vector256<int> w2 = Avx.ConvertToVector256Int32(f2); |
|
|
|
Vector256<int> w3 = Avx.ConvertToVector256Int32(f3); |
|
|
|
Vector256<int> w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0); |
|
|
|
Vector256<int> w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1); |
|
|
|
Vector256<int> w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2); |
|
|
|
Vector256<int> w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3); |
|
|
|
|
|
|
|
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1); |
|
|
|
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3); |
|
|
|
Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1); |
|
|
|
b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); |
|
|
|
|
|
|
|
Unsafe.Add(ref destBase, i) = b; |
|
|
|
Unsafe.Add(ref destinationBase, i) = b; |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
{ |
|
|
|
// Sse
|
|
|
|
VerifySpanInput(source, dest, Vector128<byte>.Count); |
|
|
|
|
|
|
|
nuint n = dest.Vector128Count<byte>(); |
|
|
|
// Sse, AdvSimd
|
|
|
|
DebugVerifySpanInput(source, destination, Vector128<byte>.Count); |
|
|
|
|
|
|
|
ref Vector128<float> sourceBase = |
|
|
|
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
nuint n = destination.Vector128Count<byte>(); |
|
|
|
|
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector128<byte> destinationBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
Vector128<float> scale = Vector128.Create((float)byte.MaxValue); |
|
|
|
|
|
|
|
@ -981,20 +1025,20 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4); |
|
|
|
|
|
|
|
Vector128<float> f0 = Sse.Multiply(scale, s); |
|
|
|
Vector128<float> f1 = Sse.Multiply(scale, Unsafe.Add(ref s, 1)); |
|
|
|
Vector128<float> f2 = Sse.Multiply(scale, Unsafe.Add(ref s, 2)); |
|
|
|
Vector128<float> f3 = Sse.Multiply(scale, Unsafe.Add(ref s, 3)); |
|
|
|
Vector128<float> f0 = scale * s; |
|
|
|
Vector128<float> f1 = scale * Unsafe.Add(ref s, 1); |
|
|
|
Vector128<float> f2 = scale * Unsafe.Add(ref s, 2); |
|
|
|
Vector128<float> f3 = scale * Unsafe.Add(ref s, 3); |
|
|
|
|
|
|
|
Vector128<int> w0 = Sse2.ConvertToVector128Int32(f0); |
|
|
|
Vector128<int> w1 = Sse2.ConvertToVector128Int32(f1); |
|
|
|
Vector128<int> w2 = Sse2.ConvertToVector128Int32(f2); |
|
|
|
Vector128<int> w3 = Sse2.ConvertToVector128Int32(f3); |
|
|
|
Vector128<int> w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0); |
|
|
|
Vector128<int> w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1); |
|
|
|
Vector128<int> w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2); |
|
|
|
Vector128<int> w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3); |
|
|
|
|
|
|
|
Vector128<short> u0 = Sse2.PackSignedSaturate(w0, w1); |
|
|
|
Vector128<short> u1 = Sse2.PackSignedSaturate(w2, w3); |
|
|
|
Vector128<short> u0 = Vector128Utilities.PackSignedSaturate(w0, w1); |
|
|
|
Vector128<short> u1 = Vector128Utilities.PackSignedSaturate(w2, w3); |
|
|
|
|
|
|
|
Unsafe.Add(ref destBase, i) = Sse2.PackUnsignedSaturate(u0, u1); |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|