Port more V256 code

Ref: pull/2918/head
Author: James Jackson-South, 9 months ago
Commit: 8a23d42bfd
8 changed files:

  1. src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs (124 lines changed)
  2. src/ImageSharp/Common/Helpers/Vector128Utilities.cs (15 lines changed)
  3. src/ImageSharp/Common/Helpers/Vector256Utilities.cs (58 lines changed)
  4. src/ImageSharp/Common/Helpers/Vector512Utilities.cs (17 lines changed)
  5. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs (112 lines changed)
  6. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs (12 lines changed)
  7. tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs (2 lines changed)
  8. tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs (2 lines changed)

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs (124 lines changed)

@@ -66,9 +66,9 @@ internal static partial class SimdUtils
ref Span<float> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat))
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat))
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@@ -112,9 +112,9 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) ||
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte))
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@@ -158,7 +158,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@@ -190,7 +190,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@@ -223,7 +223,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
{
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
@@ -249,7 +249,7 @@ internal static partial class SimdUtils
Span<float> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat)
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
{
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@@ -263,21 +263,21 @@ internal static partial class SimdUtils
ref Vector512<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector512<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector512_.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector512_.ShuffleNative(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat)
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
{
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -291,21 +291,21 @@ internal static partial class SimdUtils
ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256_.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector256_.ShuffleNative(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeFloat)
{
ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
@@ -319,17 +319,17 @@ internal static partial class SimdUtils
ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector128<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector128_.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector128_.ShuffleNative(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
}
}
}
@@ -341,7 +341,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte)
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
{
Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -359,17 +359,17 @@ internal static partial class SimdUtils
ref Vector512<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector512<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector512_.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector512_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
@@ -391,21 +391,21 @@ internal static partial class SimdUtils
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256_.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector256_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
{
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -423,17 +423,17 @@ internal static partial class SimdUtils
ref Vector128<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector128<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector128_.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector128_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
@@ -445,7 +445,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsRightAlign)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@@ -472,15 +472,15 @@ internal static partial class SimdUtils
v2 = Vector128_.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v1, v0, 12);
v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask);
v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask);
v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask);
v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask);
v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16), mask);
v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16), mask);
v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16), mask);
v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16), mask);
v0 = Vector128_.Shuffle(v0, maskE);
v1 = Vector128_.Shuffle(v1, maskSlice4Nx16);
v2 = Vector128_.Shuffle(v2, maskE);
v3 = Vector128_.Shuffle(v3, maskSlice4Nx16);
v0 = Vector128_.ShuffleNative(v0, maskE);
v1 = Vector128_.ShuffleNative(v1, maskSlice4Nx16);
v2 = Vector128_.ShuffleNative(v2, maskE);
v3 = Vector128_.ShuffleNative(v3, maskSlice4Nx16);
v0 = Vector128_.AlignRight(v1, v0, 4);
v3 = Vector128_.AlignRight(v3, v2, 12);
@@ -505,7 +505,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@@ -534,10 +534,10 @@ internal static partial class SimdUtils
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j);
vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask);
vd = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 1) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 2) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 3) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16) | fill, mask);
}
}
}
@@ -548,7 +548,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
{
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
@@ -574,10 +574,10 @@ internal static partial class SimdUtils
Vector128<byte> v2 = Unsafe.Add(ref vs, 2);
Vector128<byte> v3 = Unsafe.Add(ref vs, 3);
v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE);
v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16);
v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE);
v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16);
v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, mask), maskE);
v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, mask), maskSlice4Nx16);
v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, mask), maskE);
v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, mask), maskSlice4Nx16);
v0 = Vector128_.AlignRight(v1, v0, 4);
v3 = Vector128_.AlignRight(v3, v2, 12);

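Not part of the commit, but useful context for the renames above: the float overloads keep their control-byte (_mm_shuffle_ps-style) semantics, and the ShuffleNative name appears to follow the BCL convention where native shuffles do not promise portable behaviour for out-of-range indices. A minimal standalone sketch of what the control byte selects when both shuffle operands are the same vector; the class and method names here are illustrative only:

    using System.Runtime.Intrinsics;

    internal static class ShuffleSketch
    {
        // Decode the four 2-bit fields of 'control' and pick the matching elements,
        // mirroring Sse.Shuffle(vector, vector, control) with both operands equal.
        public static Vector128<float> ShuffleByControl(Vector128<float> vector, byte control)
            => Vector128.Shuffle(
                vector,
                Vector128.Create(
                    control & 0b11,
                    (control >> 2) & 0b11,
                    (control >> 4) & 0b11,
                    (control >> 6) & 0b11));

        // Example: ShuffleByControl(v, 0b00_01_10_11) reverses the four floats.
    }
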
src/ImageSharp/Common/Helpers/Vector128Utilities.cs (15 lines changed)

@@ -26,7 +26,7 @@ internal static class Vector128_
/// <summary>
/// Gets a value indicating whether shuffle operations are supported.
/// </summary>
public static bool SupportsShuffleFloat
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Sse.IsSupported;
@@ -35,10 +35,10 @@ internal static class Vector128_
/// <summary>
/// Gets a value indicating whether shuffle operations are supported.
/// </summary>
public static bool SupportsShuffleByte
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported;
get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported || PackedSimd.IsSupported;
}
/// <summary>
@@ -66,7 +66,7 @@ internal static class Vector128_
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector128{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpected] byte control)
public static Vector128<float> ShuffleNative(Vector128<float> vector, [ConstantExpected] byte control)
{
if (Sse.IsSupported)
{
@@ -89,7 +89,7 @@ internal static class Vector128_
/// A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
public static Vector128<byte> ShuffleNative(Vector128<byte> vector, Vector128<byte> indices)
{
if (Ssse3.IsSupported)
{
@@ -101,6 +101,11 @@ internal static class Vector128_
return AdvSimd.Arm64.VectorTableLookup(vector, indices);
}
if (PackedSimd.IsSupported)
{
return PackedSimd.Swizzle(vector, indices);
}
ThrowUnreachableException();
return default;
}

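Not part of the commit: with the PackedSimd.Swizzle branch added above, the byte overload now also accelerates on WebAssembly. A hedged usage sketch with in-range indices, for which the Ssse3, AdvSimd.Arm64 and PackedSimd paths all agree; it assumes library-internal code, since Vector128_ is internal:

    using System.Runtime.Intrinsics;

    // Reverse the sixteen bytes of a vector through the helper.
    Vector128<byte> data = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    Vector128<byte> reverse = Vector128.Create((byte)15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
    Vector128<byte> shuffled = Vector128_.ShuffleNative(data, reverse);
    // shuffled == <15, 14, 13, ..., 1, 0>
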
src/ImageSharp/Common/Helpers/Vector256Utilities.cs (58 lines changed)

@@ -24,10 +24,10 @@ internal static class Vector256_
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleFloat
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx.IsSupported || Sse.IsSupported;
get => Avx.IsSupported;
}
/// <summary>
@@ -46,20 +46,13 @@ internal static class Vector256_
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpected] byte control)
public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
{
if (Avx.IsSupported)
{
return Avx.Shuffle(vector, vector, control);
}
if (Sse.IsSupported)
{
Vector128<float> lower = vector.GetLower();
Vector128<float> upper = vector.GetUpper();
return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control));
}
ThrowUnreachableException();
return default;
}
@@ -73,7 +66,7 @@ internal static class Vector256_
/// </param>
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> Shuffle(Vector256<byte> vector, Vector256<byte> indices)
public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
{
if (Avx2.IsSupported)
{
@@ -98,13 +91,6 @@ internal static class Vector256_
return Avx.ConvertToVector256Int32(vector);
}
if (Sse2.IsSupported)
{
Vector128<int> lower = Sse2.ConvertToVector128Int32(vector.GetLower());
Vector128<int> upper = Sse2.ConvertToVector128Int32(vector.GetUpper());
return Vector256.Create(lower, upper);
}
Vector256<float> sign = vector & Vector256.Create(-0F);
Vector256<float> val_2p23_f32 = sign | Vector256.Create(8388608F);
@@ -154,6 +140,27 @@ internal static class Vector256_
return va + (vm0 * vm1);
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{Int16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> PackSignedSaturate(Vector256<int> left, Vector256<int> right)
{
if (Avx2.IsSupported)
{
return Avx2.PackSignedSaturate(left, right);
}
Vector256<int> min = Vector256.Create((int)short.MinValue);
Vector256<int> max = Vector256.Create((int)short.MaxValue);
Vector256<int> lefClamped = Clamp(left, min, max);
Vector256<int> rightClamped = Clamp(right, min, max);
return Vector256.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
@@ -166,6 +173,21 @@ internal static class Vector256_
public static Vector256<T> Clamp<T>(Vector256<T> value, Vector256<T> min, Vector256<T> max)
=> Vector256.Min(Vector256.Max(value, min), max);
/// <summary>
/// Widens a <see cref="Vector128{Int16}"/> to a <see cref="Vector256{Int32}"/>.
/// </summary>
/// <param name="value">The vector to widen.</param>
/// <returns>The widened <see cref="Vector256{Int32}"/>.</returns>
public static Vector256<int> Widen(Vector128<short> value)
{
if (Avx2.IsSupported)
{
return Avx2.ConvertToVector256Int32(value);
}
return Vector256.WidenLower(value.ToVector256());
}
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}

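Not part of the commit: the new PackSignedSaturate helper emulates saturation on the portable path by clamping to the Int16 range before narrowing, exactly as in the fallback above. A small standalone sketch with illustrative values:

    using System.Runtime.Intrinsics;

    // Values outside [short.MinValue, short.MaxValue] saturate instead of wrapping.
    Vector256<int> left = Vector256.Create(70000, -70000, 1, 2, 3, 4, 5, 6);
    Vector256<int> right = Vector256.Create(7, 8, 9, 10, 11, 12, 13, 14);

    Vector256<int> min = Vector256.Create((int)short.MinValue);
    Vector256<int> max = Vector256.Create((int)short.MaxValue);
    Vector256<short> packed = Vector256.Narrow(
        Vector256.Min(Vector256.Max(left, min), max),
        Vector256.Min(Vector256.Max(right, min), max));
    // packed[0] == 32767, packed[1] == -32768, packed[2] == 1, ...
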
src/ImageSharp/Common/Helpers/Vector512Utilities.cs (17 lines changed)

@@ -24,16 +24,16 @@ internal static class Vector512_
/// <summary>
/// Gets a value indicating whether shuffle float operations are supported.
/// </summary>
public static bool SupportsShuffleFloat
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512F.IsSupported || Avx.IsSupported;
get => Avx512F.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleByte
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512BW.IsSupported;
@@ -46,20 +46,13 @@ internal static class Vector512_
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector512{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> Shuffle(Vector512<float> vector, [ConstantExpected] byte control)
public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
{
if (Avx512F.IsSupported)
{
return Avx512F.Shuffle(vector, vector, control);
}
if (Avx.IsSupported)
{
Vector256<float> lower = vector.GetLower();
Vector256<float> upper = vector.GetUpper();
return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control));
}
ThrowUnreachableException();
return default;
}
@@ -73,7 +66,7 @@ internal static class Vector512_
/// </param>
/// <returns>The <see cref="Vector512{Byte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<byte> Shuffle(Vector512<byte> vector, Vector512<byte> indices)
public static Vector512<byte> ShuffleNative(Vector512<byte> vector, Vector512<byte> indices)
{
if (Avx512BW.IsSupported)
{

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs (112 lines changed)

@@ -1,7 +1,6 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
@@ -60,109 +59,76 @@ internal partial struct Block8x8F
}
/// <summary>
/// Loads values from <paramref name="source"/> using extended AVX2 intrinsics.
/// Loads values from <paramref name="source"/> using <see cref="Vector256{T}"/> intrinsics.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
public void LoadFromInt16ExtendedVector256(ref Block8x8 source)
{
DebugGuard.IsTrue(
Avx2.IsSupported,
"LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
Vector256.IsHardwareAccelerated,
"LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!");
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector256<ushort>.Count == 16 on AVX2
// Vector256<ushort>.Count == 16
// We can process 2 block rows in a single step
Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
Vector256<int> top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom);
top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom);
top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom);
top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom);
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
=> Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256<float>.Zero, max));
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!");
ref Vector256<float> aBase = ref a.V256_0;
ref Vector256<float> bBase = ref b.V256_0;
ref Vector256<short> destRef = ref dest.V01;
Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
for (nuint i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector256<int> row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
Vector256<int> row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));
Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();
Vector256<short> row = Vector256_.PackSignedSaturate(row0, row1);
row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16();
Unsafe.Add(ref destRef, i / 2) = row;
}
}
private void TransposeInPlace_Avx()
private void TransposeInPlaceVector256()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
Vector256<float> r0 = Avx.InsertVector128(
this.V256_0,
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r1 = Avx.InsertVector128(
this.V256_1,
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);
Vector256<float> r2 = Avx.InsertVector128(
this.V256_2,
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);
Vector256<float> r3 = Avx.InsertVector128(
this.V256_3,
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);
Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);
Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);
Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);
Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);
Vector256<float> r0 = this.V256_0.WithUpper(this.V4L.AsVector128());
Vector256<float> r1 = this.V256_1.WithUpper(this.V5L.AsVector128());
Vector256<float> r2 = this.V256_2.WithUpper(this.V6L.AsVector128());
Vector256<float> r3 = this.V256_3.WithUpper(this.V7L.AsVector128());
Vector256<float> r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128());
Vector256<float> r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128());
Vector256<float> r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128());
Vector256<float> r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128());
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);

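Not part of the commit: one row of the rewritten LoadFromInt16ExtendedVector256, shown with the portable fallback of Vector256_.Widen. Widening the lower half of a zero-extended Vector256 sign-extends the same eight Int16 values that Avx2.ConvertToVector256Int32 would; the values below are illustrative:

    using System.Runtime.Intrinsics;

    // Widen eight Int16 coefficients to Int32, then convert to Single.
    Vector128<short> row = Vector128.Create((short)1, 2, 3, 4, 5, 6, 7, -8);
    Vector256<int> widened = Vector256.WidenLower(row.ToVector256());
    Vector256<float> asFloat = Vector256.ConvertToSingle(widened);
    // asFloat == <1f, 2f, 3f, 4f, 5f, 6f, 7f, -8f>
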
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs (12 lines changed)

@@ -277,9 +277,9 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// <param name="qt">The quantization table.</param>
public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
{
if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
MultiplyIntoInt16Vector256(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
}
else if (Vector128.IsHardwareAccelerated)
@@ -387,9 +387,9 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref Block8x8 source)
{
if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
this.LoadFromInt16ExtendedAvx2(ref source);
this.LoadFromInt16ExtendedVector256(ref source);
return;
}
else if (Vector128.IsHardwareAccelerated)
@@ -601,9 +601,9 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInPlace()
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
this.TransposeInPlace_Avx();
this.TransposeInPlaceVector256();
}
else
{

tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs (2 lines changed)

@@ -32,7 +32,7 @@ public class Block8x8F_LoadFromInt16
public void Scalar() => this.destination.LoadFromInt16Scalar(ref this.source);
[Benchmark]
public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedAvx2(ref this.source);
public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedVector256(ref this.source);
// RESULT:
// Method | Mean | Error | StdDev | Scaled |

tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs (2 lines changed)

@@ -433,7 +433,7 @@ public partial class Block8x8FTests : JpegFixture
Block8x8 source = Block8x8.Load(data);
Block8x8F dest = default;
dest.LoadFromInt16ExtendedAvx2(ref source);
dest.LoadFromInt16ExtendedVector256(ref source);
for (int i = 0; i < Block8x8F.Size; i++)
{
