
Merge pull request #2918 from SixLabors/js/block8x8-simd

Improve JPEG Block8x8F Intrinsics for Vector128 paths.
James Jackson-South, 9 months ago (committed by GitHub)
parent commit d8b464bacd
35 changed files (changed-line counts in parentheses):
  1. src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs (236)
  2. src/ImageSharp/Common/Helpers/Vector128Utilities.cs (104)
  3. src/ImageSharp/Common/Helpers/Vector256Utilities.cs (98)
  4. src/ImageSharp/Common/Helpers/Vector512Utilities.cs (47)
  5. src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs (4)
  6. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs (153)
  7. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt (103)
  8. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs (144)
  9. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs (93)
  10. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs (157)
  11. src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs (275)
  12. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs (4)
  13. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs (4)
  14. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs (4)
  15. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs (4)
  16. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs (4)
  17. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs (4)
  18. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs (4)
  19. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs (4)
  20. src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs (2)
  21. src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs (142)
  22. src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs (142)
  23. src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs (22)
  24. src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs (2)
  25. src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs (135)
  26. src/ImageSharp/Formats/Webp/AlphaDecoder.cs (6)
  27. src/ImageSharp/ImageSharp.csproj (18)
  28. tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs (2)
  29. tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs (2)
  30. tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs (11)
  31. tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs (33)
  32. tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs (95)
  33. tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs (2)
  34. tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs (12)
  35. tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs (2)

src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs (236 lines changed)

@@ -66,9 +66,9 @@ internal static partial class SimdUtils
ref Span<float> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) ||
(Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) ||
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat))
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@@ -112,9 +112,9 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) ||
(Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) ||
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte))
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@@ -158,7 +158,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@@ -190,7 +190,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@@ -223,7 +223,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
{
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
@@ -249,7 +249,7 @@ internal static partial class SimdUtils
Span<float> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat)
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
{
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@@ -263,21 +263,21 @@ internal static partial class SimdUtils
ref Vector512<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector512<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector512Utilities.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector512_.ShuffleNative(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat)
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
{
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@@ -291,21 +291,21 @@ internal static partial class SimdUtils
ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256Utilities.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector256_.ShuffleNative(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat)
else if (Vector128.IsHardwareAccelerated)
{
ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
@@ -319,17 +319,17 @@ internal static partial class SimdUtils
ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector128<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector128Utilities.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector128_.ShuffleNative(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), control);
}
}
}
@@ -341,7 +341,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte)
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
{
Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -359,21 +359,21 @@ internal static partial class SimdUtils
ref Vector512<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector512<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector512Utilities.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector512_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector512_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte)
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
{
Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -391,21 +391,21 @@ internal static partial class SimdUtils
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256Utilities.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector256_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte)
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
{
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -423,17 +423,17 @@ internal static partial class SimdUtils
ref Vector128<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector128<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector128Utilities.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector128_.ShuffleNative(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector128_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
@@ -445,11 +445,13 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign)
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsAlignRight)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref bytes, control);
@@ -467,28 +469,28 @@ internal static partial class SimdUtils
Vector128<byte> v0 = vs;
Vector128<byte> v1 = Unsafe.Add(ref vs, (nuint)1);
Vector128<byte> v2 = Unsafe.Add(ref vs, (nuint)2);
Vector128<byte> v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
Vector128<byte> v3 = Vector128_.ShiftRightBytesInVector(v2, 4);
v2 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128Utilities.AlignRight(v1, v0, 12);
v2 = Vector128_.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v1, v0, 12);
v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16), mask);
v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16), mask);
v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16), mask);
v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16), mask);
v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16), mask);
v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16), mask);
v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16), mask);
v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16), mask);
v0 = Vector128Utilities.Shuffle(v0, maskE);
v1 = Vector128Utilities.Shuffle(v1, maskSlice4Nx16);
v2 = Vector128Utilities.Shuffle(v2, maskE);
v3 = Vector128Utilities.Shuffle(v3, maskSlice4Nx16);
v0 = Vector128_.ShuffleNative(v0, maskE);
v1 = Vector128_.ShuffleNative(v1, maskSlice4Nx16);
v2 = Vector128_.ShuffleNative(v2, maskE);
v3 = Vector128_.ShuffleNative(v3, maskSlice4Nx16);
v0 = Vector128Utilities.AlignRight(v1, v0, 4);
v3 = Vector128Utilities.AlignRight(v3, v2, 12);
v0 = Vector128_.AlignRight(v1, v0, 4);
v3 = Vector128_.AlignRight(v3, v2, 12);
v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
v1 = Vector128_.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128_.ShiftRightBytesInVector(v2, 4);
v1 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v2, v1, 8);
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, i);
@@ -505,7 +507,10 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsShiftByte &&
Vector128_.SupportsAlignRight)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@@ -527,17 +532,17 @@ internal static partial class SimdUtils
ref Vector128<byte> v0 = ref Unsafe.Add(ref sourceBase, i);
Vector128<byte> v1 = Unsafe.Add(ref v0, 1);
Vector128<byte> v2 = Unsafe.Add(ref v0, 2);
Vector128<byte> v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
Vector128<byte> v3 = Vector128_.ShiftRightBytesInVector(v2, 4);
v2 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128Utilities.AlignRight(v1, v0, 12);
v2 = Vector128_.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v1, v0, 12);
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j);
vd = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 1) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 2) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 3) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16) | fill, mask);
vd = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 1) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 2) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 3) = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, maskPad4Nx16) | fill, mask);
}
}
}
@@ -548,10 +553,13 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated &&
Vector128_.SupportsShuffleNativeByte &&
Vector128_.SupportsShiftByte &&
Vector128_.SupportsAlignRight)
{
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -574,18 +582,18 @@ internal static partial class SimdUtils
Vector128<byte> v2 = Unsafe.Add(ref vs, 2);
Vector128<byte> v3 = Unsafe.Add(ref vs, 3);
v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, mask), maskE);
v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, mask), maskSlice4Nx16);
v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, mask), maskE);
v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, mask), maskSlice4Nx16);
v0 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v0, mask), maskE);
v1 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v1, mask), maskSlice4Nx16);
v2 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v2, mask), maskE);
v3 = Vector128_.ShuffleNative(Vector128_.ShuffleNative(v3, mask), maskSlice4Nx16);
v0 = Vector128Utilities.AlignRight(v1, v0, 4);
v3 = Vector128Utilities.AlignRight(v3, v2, 12);
v0 = Vector128_.AlignRight(v1, v0, 4);
v3 = Vector128_.AlignRight(v3, v2, 12);
v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
v1 = Vector128_.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128_.ShiftRightBytesInVector(v2, 4);
v1 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v2, v1, 8);
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j);
@@ -619,29 +627,6 @@ internal static partial class SimdUtils
return va + (vm0 * vm1);
}
/// <summary>
/// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
/// TODO: Fix. The arguments are in a different order to the FMA intrinsic.
/// </summary>
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
/// <param name="vs">The vector to subtract from the intermediate result.</param>
/// <param name="vm0">The first vector to multiply.</param>
/// <param name="vm1">The second vector to multiply.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
public static Vector256<float> MultiplySubtract(
Vector256<float> vs,
Vector256<float> vm0,
Vector256<float> vm1)
{
if (Fma.IsSupported)
{
return Fma.MultiplySubtract(vm1, vm0, vs);
}
return Avx.Subtract(Avx.Multiply(vm0, vm1), vs);
}
/// <summary>
/// Performs a multiplication and a negated addition of the <see cref="Vector256{Single}"/>.
/// </summary>
@@ -965,10 +950,10 @@ internal static partial class SimdUtils
Vector512<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector512<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector512<int> w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0);
Vector512<int> w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1);
Vector512<int> w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2);
Vector512<int> w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3);
Vector512<int> w0 = Vector512_.ConvertToInt32RoundToEven(f0);
Vector512<int> w1 = Vector512_.ConvertToInt32RoundToEven(f1);
Vector512<int> w2 = Vector512_.ConvertToInt32RoundToEven(f2);
Vector512<int> w3 = Vector512_.ConvertToInt32RoundToEven(f3);
Vector512<short> u0 = Avx512BW.PackSignedSaturate(w0, w1);
Vector512<short> u1 = Avx512BW.PackSignedSaturate(w2, w3);
@@ -999,10 +984,10 @@ internal static partial class SimdUtils
Vector256<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector256<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector256<int> w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0);
Vector256<int> w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1);
Vector256<int> w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2);
Vector256<int> w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3);
Vector256<int> w0 = Vector256_.ConvertToInt32RoundToEven(f0);
Vector256<int> w1 = Vector256_.ConvertToInt32RoundToEven(f1);
Vector256<int> w2 = Vector256_.ConvertToInt32RoundToEven(f2);
Vector256<int> w3 = Vector256_.ConvertToInt32RoundToEven(f3);
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
@@ -1012,9 +997,9 @@ internal static partial class SimdUtils
Unsafe.Add(ref destinationBase, i) = b;
}
}
else if (Sse2.IsSupported || AdvSimd.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
// Sse, AdvSimd
// Sse, AdvSimd, etc.
DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
nuint n = destination.Vector128Count<byte>();
@@ -1023,6 +1008,8 @@ internal static partial class SimdUtils
ref Vector128<byte> destinationBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination));
Vector128<float> scale = Vector128.Create((float)byte.MaxValue);
Vector128<int> min = Vector128<int>.Zero;
Vector128<int> max = Vector128.Create((int)byte.MaxValue);
for (nuint i = 0; i < n; i++)
{
@@ -1033,15 +1020,20 @@ internal static partial class SimdUtils
Vector128<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector128<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector128<int> w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0);
Vector128<int> w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1);
Vector128<int> w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2);
Vector128<int> w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3);
Vector128<int> w0 = Vector128_.ConvertToInt32RoundToEven(f0);
Vector128<int> w1 = Vector128_.ConvertToInt32RoundToEven(f1);
Vector128<int> w2 = Vector128_.ConvertToInt32RoundToEven(f2);
Vector128<int> w3 = Vector128_.ConvertToInt32RoundToEven(f3);
w0 = Vector128_.Clamp(w0, min, max);
w1 = Vector128_.Clamp(w1, min, max);
w2 = Vector128_.Clamp(w2, min, max);
w3 = Vector128_.Clamp(w3, min, max);
Vector128<short> u0 = Vector128Utilities.PackSignedSaturate(w0, w1);
Vector128<short> u1 = Vector128Utilities.PackSignedSaturate(w2, w3);
Vector128<ushort> u0 = Vector128.Narrow(w0, w1).AsUInt16();
Vector128<ushort> u1 = Vector128.Narrow(w2, w3).AsUInt16();
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1);
Unsafe.Add(ref destinationBase, i) = Vector128.Narrow(u0, u1);
}
}
}
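The tail of this file's diff replaces the Sse2/AdvSimd saturating packs with a clamp to [0, 255] followed by two unchecked Vector128.Narrow calls, which keeps the byte-packing path fully cross-platform. A minimal standalone sketch of that idea follows; the class and method names are illustrative, not part of the PR.

using System;
using System.Runtime.Intrinsics;

internal static class NarrowPackSketch
{
    // Packs four Vector128<int> (16 ints) into one Vector128<byte> (16 bytes).
    // Clamping first means the unchecked narrows below cannot wrap, so no
    // platform-specific saturating pack instruction is needed.
    public static Vector128<byte> PackToBytes(
        Vector128<int> w0, Vector128<int> w1, Vector128<int> w2, Vector128<int> w3)
    {
        Vector128<int> min = Vector128<int>.Zero;
        Vector128<int> max = Vector128.Create((int)byte.MaxValue);

        w0 = Vector128.Min(Vector128.Max(w0, min), max);
        w1 = Vector128.Min(Vector128.Max(w1, min), max);
        w2 = Vector128.Min(Vector128.Max(w2, min), max);
        w3 = Vector128.Min(Vector128.Max(w3, min), max);

        // int -> short (reinterpreted as ushort), then ushort -> byte.
        Vector128<ushort> u0 = Vector128.Narrow(w0, w1).AsUInt16();
        Vector128<ushort> u1 = Vector128.Narrow(w2, w3).AsUInt16();
        return Vector128.Narrow(u0, u1);
    }

    public static void Main()
    {
        Vector128<int> v = Vector128.Create(-5, 0, 128, 300);

        // -5 saturates to 0 and 300 saturates to 255 in every group of four.
        Console.WriteLine(PackToBytes(v, v, v, v));
    }
}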

src/ImageSharp/Common/Helpers/Vector128Utilities.cs (104 lines changed)

@@ -4,8 +4,10 @@
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.Common.Helpers;
@@ -18,30 +20,36 @@ namespace SixLabors.ImageSharp.Common.Helpers;
/// </list>
/// Should only be used if the intrinsics are available.
/// </summary>
internal static class Vector128Utilities
#pragma warning disable SA1649 // File name should match first type name
internal static class Vector128_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle operations are supported.
/// </summary>
public static bool SupportsShuffleFloat
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Sse.IsSupported;
}
get
{
if (Vector128.IsHardwareAccelerated)
{
if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64)
{
return Ssse3.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle operations are supported.
/// </summary>
public static bool SupportsShuffleByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Ssse3.IsSupported || AdvSimd.Arm64.IsSupported;
return true;
}
return false;
}
}
/// <summary>
/// Gets a value indicating whether right align operations are supported.
/// </summary>
public static bool SupportsRightAlign
public static bool SupportsAlignRight
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Ssse3.IsSupported || AdvSimd.IsSupported;
@@ -63,15 +71,21 @@ internal static class Vector128Utilities
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector128{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<float> Shuffle(Vector128<float> vector, [ConstantExpected] byte control)
public static Vector128<float> ShuffleNative(Vector128<float> vector, [ConstantExpected] byte control)
{
if (Sse.IsSupported)
{
return Sse.Shuffle(vector, vector, control);
}
ThrowUnreachableException();
return default;
// Don't use InverseMMShuffle here as we want to avoid the cast.
Vector128<int> indices = Vector128.Create(
control & 0x3,
(control >> 2) & 0x3,
(control >> 4) & 0x3,
(control >> 6) & 0x3);
return Vector128.Shuffle(vector, indices);
}
/// <summary>
@@ -86,20 +100,18 @@ internal static class Vector128Utilities
/// A new vector containing the values from <paramref name="vector" /> selected by the given <paramref name="indices" />.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<byte> Shuffle(Vector128<byte> vector, Vector128<byte> indices)
public static Vector128<byte> ShuffleNative(Vector128<byte> vector, Vector128<byte> indices)
{
// For x64 we use the SSSE3 shuffle intrinsic to avoid additional instructions. 3 vs 1.
if (Ssse3.IsSupported)
{
return Ssse3.Shuffle(vector, indices);
}
if (AdvSimd.Arm64.IsSupported)
{
return AdvSimd.Arm64.VectorTableLookup(vector, indices);
}
ThrowUnreachableException();
return default;
// For ARM and WASM, codegen will be optimal.
// We don't throw for x86/x64 so we should never use this method without
// checking for support.
return Vector128.Shuffle(vector, indices);
}
/// <summary>
@@ -193,6 +205,11 @@ internal static class Vector128Utilities
return AdvSimd.ConvertToInt32RoundToEven(vector);
}
if (PackedSimd.IsSupported)
{
return PackedSimd.ConvertToInt32Saturate(PackedSimd.RoundToNearest(vector));
}
Vector128<float> sign = vector & Vector128.Create(-0F);
Vector128<float> val_2p23_f32 = sign | Vector128.Create(8388608F);
@@ -218,6 +235,11 @@ internal static class Vector128Utilities
return AdvSimd.RoundToNearest(vector);
}
if (PackedSimd.IsSupported)
{
return PackedSimd.RoundToNearest(vector);
}
Vector128<float> sign = vector & Vector128.Create(-0F);
Vector128<float> val_2p23_f32 = sign | Vector128.Create(8388608F);
@@ -270,8 +292,16 @@ internal static class Vector128Utilities
return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right);
}
ThrowUnreachableException();
return default;
if (PackedSimd.IsSupported)
{
return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
}
Vector128<short> min = Vector128.Create((short)byte.MinValue);
Vector128<short> max = Vector128.Create((short)byte.MaxValue);
Vector128<ushort> lefClamped = Clamp(left, min, max).AsUInt16();
Vector128<ushort> rightClamped = Clamp(right, min, max).AsUInt16();
return Vector128.Narrow(lefClamped, rightClamped);
}
/// <summary>
@@ -293,10 +323,30 @@ internal static class Vector128Utilities
return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right);
}
ThrowUnreachableException();
return default;
if (PackedSimd.IsSupported)
{
return PackedSimd.ConvertNarrowingSaturateSigned(left, right);
}
Vector128<int> min = Vector128.Create((int)short.MinValue);
Vector128<int> max = Vector128.Create((int)short.MaxValue);
Vector128<int> lefClamped = Clamp(left, min, max);
Vector128<int> rightClamped = Clamp(right, min, max);
return Vector128.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="value">The vector to restrict.</param>
/// <param name="min">The minimum value.</param>
/// <param name="max">The maximum value.</param>
/// <returns>The restricted <see cref="Vector128{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<T> Clamp<T>(Vector128<T> value, Vector128<T> min, Vector128<T> max)
=> Vector128.Min(Vector128.Max(value, min), max);
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
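The new ShuffleNative(Vector128<float>, byte) above keeps the Sse.Shuffle fast path and otherwise decodes the classic _MM_SHUFFLE control byte (two bits per destination lane, low bits first) into indices for the portable Vector128.Shuffle. A self-contained sketch of that decoding; the names below are illustrative and the real helper additionally marks the byte [ConstantExpected].

using System;
using System.Runtime.Intrinsics;

internal static class ShuffleControlSketch
{
    public static Vector128<float> ShuffleByControl(Vector128<float> vector, byte control)
    {
        // Each destination lane reads its source-lane index from two bits of
        // the control byte, lowest bits first.
        Vector128<int> indices = Vector128.Create(
            control & 0x3,
            (control >> 2) & 0x3,
            (control >> 4) & 0x3,
            (control >> 6) & 0x3);

        return Vector128.Shuffle(vector, indices);
    }

    public static void Main()
    {
        Vector128<float> v = Vector128.Create(10f, 20f, 30f, 40f);

        // 0b00_01_10_11 selects lanes 3, 2, 1, 0 and reverses the vector.
        Console.WriteLine(ShuffleByControl(v, 0b00_01_10_11)); // <40, 30, 20, 10>
    }
}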

src/ImageSharp/Common/Helpers/Vector256Utilities.cs (98 lines changed)

@@ -17,21 +17,23 @@ namespace SixLabors.ImageSharp.Common.Helpers;
/// </list>
/// Should only be used if the intrinsics are available.
/// </summary>
internal static class Vector256Utilities
#pragma warning disable SA1649 // File name should match first type name
internal static class Vector256_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleFloat
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx.IsSupported || Sse.IsSupported;
get => Avx.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleByte
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx2.IsSupported;
@@ -44,20 +46,13 @@ internal static class Vector256Utilities
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> Shuffle(Vector256<float> vector, [ConstantExpected] byte control)
public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
{
if (Avx.IsSupported)
{
return Avx.Shuffle(vector, vector, control);
}
if (Sse.IsSupported)
{
Vector128<float> lower = vector.GetLower();
Vector128<float> upper = vector.GetUpper();
return Vector256.Create(Sse.Shuffle(lower, lower, control), Sse.Shuffle(upper, upper, control));
}
ThrowUnreachableException();
return default;
}
@ -71,7 +66,7 @@ internal static class Vector256Utilities
/// </param>
/// <returns>The <see cref="Vector256{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<byte> Shuffle(Vector256<byte> vector, Vector256<byte> indices)
public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
{
if (Avx2.IsSupported)
{
@@ -96,13 +91,6 @@ internal static class Vector256Utilities
return Avx.ConvertToVector256Int32(vector);
}
if (Sse2.IsSupported)
{
Vector128<int> lower = Sse2.ConvertToVector128Int32(vector.GetLower());
Vector128<int> upper = Sse2.ConvertToVector128Int32(vector.GetUpper());
return Vector256.Create(lower, upper);
}
Vector256<float> sign = vector & Vector256.Create(-0F);
Vector256<float> val_2p23_f32 = sign | Vector256.Create(8388608F);
@@ -152,6 +140,76 @@ internal static class Vector256Utilities
return va + (vm0 * vm1);
}
/// <summary>
/// Performs a multiplication and a subtraction of the <see cref="Vector256{Single}"/>.
/// </summary>
/// <remarks>ret = (vm0 * vm1) - vs</remarks>
/// <param name="vs">The vector to subtract from the intermediate result.</param>
/// <param name="vm0">The first vector to multiply.</param>
/// <param name="vm1">The second vector to multiply.</param>
/// <returns>The <see cref="Vector256{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<float> MultiplySubtract(
Vector256<float> vs,
Vector256<float> vm0,
Vector256<float> vm1)
{
if (Fma.IsSupported)
{
return Fma.MultiplySubtract(vm1, vm0, vs);
}
return (vm0 * vm1) - vs;
}
/// <summary>
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
/// </summary>
/// <param name="left">The left hand source vector.</param>
/// <param name="right">The right hand source vector.</param>
/// <returns>The <see cref="Vector256{Int16}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<short> PackSignedSaturate(Vector256<int> left, Vector256<int> right)
{
if (Avx2.IsSupported)
{
return Avx2.PackSignedSaturate(left, right);
}
Vector256<int> min = Vector256.Create((int)short.MinValue);
Vector256<int> max = Vector256.Create((int)short.MaxValue);
Vector256<int> lefClamped = Clamp(left, min, max);
Vector256<int> rightClamped = Clamp(right, min, max);
return Vector256.Narrow(lefClamped, rightClamped);
}
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="value">The vector to restrict.</param>
/// <param name="min">The minimum value.</param>
/// <param name="max">The maximum value.</param>
/// <returns>The restricted <see cref="Vector256{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<T> Clamp<T>(Vector256<T> value, Vector256<T> min, Vector256<T> max)
=> Vector256.Min(Vector256.Max(value, min), max);
/// <summary>
/// Widens a <see cref="Vector128{Int16}"/> to a <see cref="Vector256{Int32}"/>.
/// </summary>
/// <param name="value">The vector to widen.</param>
/// <returns>The widened <see cref="Vector256{Int32}"/>.</returns>
public static Vector256<int> Widen(Vector128<short> value)
{
if (Avx2.IsSupported)
{
return Avx2.ConvertToVector256Int32(value);
}
return Vector256.WidenLower(value.ToVector256());
}
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
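MultiplySubtract, moved into this helper from SimdUtils.HwIntrinsics.cs, documents ret = (vm0 * vm1) - vs; because Fma.MultiplySubtract(a, b, c) computes (a * b) - c, passing the subtrahend last preserves that contract, so the old TODO about argument order no longer applies. A runnable standalone copy for reference; the wrapper class name is made up.

using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class MultiplySubtractSketch
{
    // ret = (vm0 * vm1) - vs
    public static Vector256<float> MultiplySubtract(
        Vector256<float> vs, Vector256<float> vm0, Vector256<float> vm1)
    {
        if (Fma.IsSupported)
        {
            // Fma.MultiplySubtract(a, b, c) == (a * b) - c.
            return Fma.MultiplySubtract(vm1, vm0, vs);
        }

        return (vm0 * vm1) - vs;
    }

    public static void Main()
    {
        Vector256<float> vs = Vector256.Create(1F);
        Vector256<float> vm0 = Vector256.Create(2F);
        Vector256<float> vm1 = Vector256.Create(3F);

        // Every lane: (2 * 3) - 1 = 5.
        Console.WriteLine(MultiplySubtract(vs, vm0, vm1));
    }
}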

src/ImageSharp/Common/Helpers/Vector512Utilities.cs (47 lines changed)

@@ -17,21 +17,23 @@ namespace SixLabors.ImageSharp.Common.Helpers;
/// </list>
/// Should only be used if the intrinsics are available.
/// </summary>
internal static class Vector512Utilities
#pragma warning disable SA1649 // File name should match first type name
internal static class Vector512_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle float operations are supported.
/// </summary>
public static bool SupportsShuffleFloat
public static bool SupportsShuffleNativeFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512F.IsSupported || Avx.IsSupported;
get => Avx512F.IsSupported;
}
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
/// </summary>
public static bool SupportsShuffleByte
public static bool SupportsShuffleNativeByte
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Avx512BW.IsSupported;
@@ -44,20 +46,13 @@ internal static class Vector512Utilities
/// <param name="control">The shuffle control byte.</param>
/// <returns>The <see cref="Vector512{Single}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<float> Shuffle(Vector512<float> vector, [ConstantExpected] byte control)
public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
{
if (Avx512F.IsSupported)
{
return Avx512F.Shuffle(vector, vector, control);
}
if (Avx.IsSupported)
{
Vector256<float> lower = vector.GetLower();
Vector256<float> upper = vector.GetUpper();
return Vector512.Create(Avx.Shuffle(lower, lower, control), Avx.Shuffle(upper, upper, control));
}
ThrowUnreachableException();
return default;
}
@ -71,7 +66,7 @@ internal static class Vector512Utilities
/// </param>
/// <returns>The <see cref="Vector512{Byte}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<byte> Shuffle(Vector512<byte> vector, Vector512<byte> indices)
public static Vector512<byte> ShuffleNative(Vector512<byte> vector, Vector512<byte> indices)
{
if (Avx512BW.IsSupported)
{
@@ -126,6 +121,13 @@ internal static class Vector512Utilities
return Avx512F.RoundScale(vector, 0b0000_1000);
}
if (Avx.IsSupported)
{
Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
return Vector512.Create(lower, upper);
}
Vector512<float> sign = vector & Vector512.Create(-0F);
Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);
@@ -152,9 +154,28 @@ internal static class Vector512Utilities
return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
}
if (Fma.IsSupported)
{
Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
return Vector512.Create(lower, upper);
}
return va + (vm0 * vm1);
}
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="value">The vector to restrict.</param>
/// <param name="min">The minimum value.</param>
/// <param name="max">The maximum value.</param>
/// <returns>The restricted <see cref="Vector512{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<T> Clamp<T>(Vector512<T> value, Vector512<T> min, Vector512<T> max)
=> Vector512.Min(Vector512.Max(value, min), max);
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
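The Vector512 fallbacks added above all follow the same pattern: when the AVX-512 instruction is unavailable, run the 256-bit AVX/FMA equivalent on each half of the register and recombine with Vector512.Create. The MultiplyAdd case as a standalone sketch (wrapper name is illustrative):

using System;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class Fma512Sketch
{
    // ret = (vm0 * vm1) + va
    public static Vector512<float> MultiplyAdd(
        Vector512<float> va, Vector512<float> vm0, Vector512<float> vm1)
    {
        if (Avx512F.IsSupported)
        {
            return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
        }

        if (Fma.IsSupported)
        {
            // Process each 256-bit half separately, then stitch them back together.
            Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
            Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
            return Vector512.Create(lower, upper);
        }

        return va + (vm0 * vm1);
    }

    public static void Main()
    {
        Vector512<float> va = Vector512.Create(1F);
        Vector512<float> vm0 = Vector512.Create(2F);
        Vector512<float> vm1 = Vector512.Create(3F);

        // Every lane: (2 * 3) + 1 = 7.
        Console.WriteLine(MultiplyAdd(va, vm0, vm1));
    }
}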

src/ImageSharp/Formats/Jpeg/Components/Block8x8.cs (4 lines changed)

@@ -211,10 +211,10 @@ internal partial struct Block8x8
}
/// <summary>
/// Transpose the block inplace.
/// Transpose the block in place.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInplace()
public void TransposeInPlace()
{
ref short elemRef = ref Unsafe.As<Block8x8, short>(ref this);

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.cs (153 lines changed)

@@ -1,153 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
// <auto-generated />
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal partial struct Block8x8F
{
/// <summary>
/// Level shift by +maximum/2, clip to [0, maximum]
/// </summary>
public void NormalizeColorsInPlace(float maximum)
{
var CMin4 = new Vector4(0F);
var CMax4 = new Vector4(maximum);
var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F));
this.V0L = Numerics.Clamp(this.V0L + COff4, CMin4, CMax4);
this.V0R = Numerics.Clamp(this.V0R + COff4, CMin4, CMax4);
this.V1L = Numerics.Clamp(this.V1L + COff4, CMin4, CMax4);
this.V1R = Numerics.Clamp(this.V1R + COff4, CMin4, CMax4);
this.V2L = Numerics.Clamp(this.V2L + COff4, CMin4, CMax4);
this.V2R = Numerics.Clamp(this.V2R + COff4, CMin4, CMax4);
this.V3L = Numerics.Clamp(this.V3L + COff4, CMin4, CMax4);
this.V3R = Numerics.Clamp(this.V3R + COff4, CMin4, CMax4);
this.V4L = Numerics.Clamp(this.V4L + COff4, CMin4, CMax4);
this.V4R = Numerics.Clamp(this.V4R + COff4, CMin4, CMax4);
this.V5L = Numerics.Clamp(this.V5L + COff4, CMin4, CMax4);
this.V5R = Numerics.Clamp(this.V5R + COff4, CMin4, CMax4);
this.V6L = Numerics.Clamp(this.V6L + COff4, CMin4, CMax4);
this.V6R = Numerics.Clamp(this.V6R + COff4, CMin4, CMax4);
this.V7L = Numerics.Clamp(this.V7L + COff4, CMin4, CMax4);
this.V7R = Numerics.Clamp(this.V7R + COff4, CMin4, CMax4);
}
/// <summary>
/// AVX2-only variant for executing <see cref="NormalizeColorsInPlace"/> and <see cref="RoundInPlace"/> in one step.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector8(float maximum)
{
var off = new Vector<float>(MathF.Ceiling(maximum * 0.5F));
var max = new Vector<float>(maximum);
ref Vector<float> row0 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V0L);
row0 = NormalizeAndRound(row0, off, max);
ref Vector<float> row1 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V1L);
row1 = NormalizeAndRound(row1, off, max);
ref Vector<float> row2 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V2L);
row2 = NormalizeAndRound(row2, off, max);
ref Vector<float> row3 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V3L);
row3 = NormalizeAndRound(row3, off, max);
ref Vector<float> row4 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V4L);
row4 = NormalizeAndRound(row4, off, max);
ref Vector<float> row5 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V5L);
row5 = NormalizeAndRound(row5, off, max);
ref Vector<float> row6 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V6L);
row6 = NormalizeAndRound(row6, off, max);
ref Vector<float> row7 = ref Unsafe.As<Vector4, Vector<float>>(ref this.V7L);
row7 = NormalizeAndRound(row7, off, max);
}
/// <summary>
/// Fill the block from 'source' doing short -> float conversion.
/// </summary>
public void LoadFromInt16Scalar(ref Block8x8 source)
{
ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);
this.V0L.X = Unsafe.Add(ref selfRef, 0);
this.V0L.Y = Unsafe.Add(ref selfRef, 1);
this.V0L.Z = Unsafe.Add(ref selfRef, 2);
this.V0L.W = Unsafe.Add(ref selfRef, 3);
this.V0R.X = Unsafe.Add(ref selfRef, 4);
this.V0R.Y = Unsafe.Add(ref selfRef, 5);
this.V0R.Z = Unsafe.Add(ref selfRef, 6);
this.V0R.W = Unsafe.Add(ref selfRef, 7);
this.V1L.X = Unsafe.Add(ref selfRef, 8);
this.V1L.Y = Unsafe.Add(ref selfRef, 9);
this.V1L.Z = Unsafe.Add(ref selfRef, 10);
this.V1L.W = Unsafe.Add(ref selfRef, 11);
this.V1R.X = Unsafe.Add(ref selfRef, 12);
this.V1R.Y = Unsafe.Add(ref selfRef, 13);
this.V1R.Z = Unsafe.Add(ref selfRef, 14);
this.V1R.W = Unsafe.Add(ref selfRef, 15);
this.V2L.X = Unsafe.Add(ref selfRef, 16);
this.V2L.Y = Unsafe.Add(ref selfRef, 17);
this.V2L.Z = Unsafe.Add(ref selfRef, 18);
this.V2L.W = Unsafe.Add(ref selfRef, 19);
this.V2R.X = Unsafe.Add(ref selfRef, 20);
this.V2R.Y = Unsafe.Add(ref selfRef, 21);
this.V2R.Z = Unsafe.Add(ref selfRef, 22);
this.V2R.W = Unsafe.Add(ref selfRef, 23);
this.V3L.X = Unsafe.Add(ref selfRef, 24);
this.V3L.Y = Unsafe.Add(ref selfRef, 25);
this.V3L.Z = Unsafe.Add(ref selfRef, 26);
this.V3L.W = Unsafe.Add(ref selfRef, 27);
this.V3R.X = Unsafe.Add(ref selfRef, 28);
this.V3R.Y = Unsafe.Add(ref selfRef, 29);
this.V3R.Z = Unsafe.Add(ref selfRef, 30);
this.V3R.W = Unsafe.Add(ref selfRef, 31);
this.V4L.X = Unsafe.Add(ref selfRef, 32);
this.V4L.Y = Unsafe.Add(ref selfRef, 33);
this.V4L.Z = Unsafe.Add(ref selfRef, 34);
this.V4L.W = Unsafe.Add(ref selfRef, 35);
this.V4R.X = Unsafe.Add(ref selfRef, 36);
this.V4R.Y = Unsafe.Add(ref selfRef, 37);
this.V4R.Z = Unsafe.Add(ref selfRef, 38);
this.V4R.W = Unsafe.Add(ref selfRef, 39);
this.V5L.X = Unsafe.Add(ref selfRef, 40);
this.V5L.Y = Unsafe.Add(ref selfRef, 41);
this.V5L.Z = Unsafe.Add(ref selfRef, 42);
this.V5L.W = Unsafe.Add(ref selfRef, 43);
this.V5R.X = Unsafe.Add(ref selfRef, 44);
this.V5R.Y = Unsafe.Add(ref selfRef, 45);
this.V5R.Z = Unsafe.Add(ref selfRef, 46);
this.V5R.W = Unsafe.Add(ref selfRef, 47);
this.V6L.X = Unsafe.Add(ref selfRef, 48);
this.V6L.Y = Unsafe.Add(ref selfRef, 49);
this.V6L.Z = Unsafe.Add(ref selfRef, 50);
this.V6L.W = Unsafe.Add(ref selfRef, 51);
this.V6R.X = Unsafe.Add(ref selfRef, 52);
this.V6R.Y = Unsafe.Add(ref selfRef, 53);
this.V6R.Z = Unsafe.Add(ref selfRef, 54);
this.V6R.W = Unsafe.Add(ref selfRef, 55);
this.V7L.X = Unsafe.Add(ref selfRef, 56);
this.V7L.Y = Unsafe.Add(ref selfRef, 57);
this.V7L.Z = Unsafe.Add(ref selfRef, 58);
this.V7L.W = Unsafe.Add(ref selfRef, 59);
this.V7R.X = Unsafe.Add(ref selfRef, 60);
this.V7R.Y = Unsafe.Add(ref selfRef, 61);
this.V7R.Z = Unsafe.Add(ref selfRef, 62);
this.V7R.W = Unsafe.Add(ref selfRef, 63);
}
}
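For orientation: per coefficient, the deleted NormalizeColorsInPlace is a level shift by ceil(maximum / 2) followed by a clamp to [0, maximum]; the new Vector128/Vector256 paths later in this PR perform the same operation a whole register at a time and fold in the rounding step. A scalar sketch of that single-lane operation (helper name is illustrative):

using System;

internal static class NormalizeSketch
{
    // Scalar equivalent of one lane of Block8x8F.NormalizeColorsInPlace(maximum).
    public static float NormalizeColor(float value, float maximum)
    {
        float offset = MathF.Ceiling(maximum * 0.5F);
        return Math.Clamp(value + offset, 0F, maximum);
    }

    public static void Main()
    {
        // With maximum = 255 (8-bit JPEG), an IDCT output of -70 maps to 58...
        Console.WriteLine(NormalizeColor(-70F, 255F));

        // ...and 200 shifts past the maximum and clamps to 255.
        Console.WriteLine(NormalizeColor(200F, 255F));
    }
}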

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Generated.tt (103 lines changed)

@@ -1,103 +0,0 @@
<#
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
#>
<#@ template debug="false" hostspecific="false" language="C#" #>
<#@ assembly name="System.Core" #>
<#@ import namespace="System.Linq" #>
<#@ import namespace="System.Text" #>
<#@ import namespace="System.Collections.Generic" #>
<#@ output extension=".cs" #>
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
// <auto-generated />
<#
char[] coordz = {'X', 'Y', 'Z', 'W'};
#>
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal partial struct Block8x8F
{
/// <summary>
/// Level shift by +maximum/2, clip to [0, maximum]
/// </summary>
public void NormalizeColorsInPlace(float maximum)
{
var CMin4 = new Vector4(0F);
var CMax4 = new Vector4(maximum);
var COff4 = new Vector4(MathF.Ceiling(maximum * 0.5F));
<#
PushIndent(" ");
for (int i = 0; i < 8; i++)
{
for (int j = 0; j < 2; j++)
{
char side = j == 0 ? 'L' : 'R';
Write($"this.V{i}{side} = Numerics.Clamp(this.V{i}{side} + COff4, CMin4, CMax4);\r\n");
}
}
PopIndent();
#>
}
/// <summary>
/// AVX2-only variant for executing <see cref="NormalizeColorsInPlace"/> and <see cref="RoundInPlace"/> in one step.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector8(float maximum)
{
var off = new Vector<float>(MathF.Ceiling(maximum * 0.5F));
var max = new Vector<float>(maximum);
<#
for (int i = 0; i < 8; i++)
{
#>
ref Vector<float> row<#=i#> = ref Unsafe.As<Vector4, Vector<float>>(ref this.V<#=i#>L);
row<#=i#> = NormalizeAndRound(row<#=i#>, off, max);
<#
}
#>
}
/// <summary>
/// Fill the block from 'source' doing short -> float conversion.
/// </summary>
public void LoadFromInt16Scalar(ref Block8x8 source)
{
ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);
<#
PushIndent(" ");
for (int j = 0; j < 8; j++)
{
for (int i = 0; i < 8; i++)
{
char destCoord = coordz[i % 4];
char destSide = (i / 4) % 2 == 0 ? 'L' : 'R';
if(j > 0 && i == 0){
WriteLine("");
}
char srcCoord = coordz[j % 4];
char srcSide = (j / 4) % 2 == 0 ? 'L' : 'R';
var expression = $"this.V{j}{destSide}.{destCoord} = Unsafe.Add(ref selfRef, {j*8+i});\r\n";
Write(expression);
}
}
PopIndent();
#>
}
}

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs (144 lines changed)

@@ -1,144 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal partial struct Block8x8F
{
/// <summary>
/// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
/// </summary>
public const int RowCount = 8;
[FieldOffset(0)]
public Vector256<float> V0;
[FieldOffset(32)]
public Vector256<float> V1;
[FieldOffset(64)]
public Vector256<float> V2;
[FieldOffset(96)]
public Vector256<float> V3;
[FieldOffset(128)]
public Vector256<float> V4;
[FieldOffset(160)]
public Vector256<float> V5;
[FieldOffset(192)]
public Vector256<float> V6;
[FieldOffset(224)]
public Vector256<float> V7;
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");
ref Vector256<float> aBase = ref a.V0;
ref Vector256<float> bBase = ref b.V0;
ref Vector256<short> destRef = ref dest.V01;
Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);
for (nuint i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();
Unsafe.Add(ref destRef, i / 2) = row;
}
}
private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");
ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
for (nuint i = 0; i < 16; i += 2)
{
Vector128<int> left = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
Vector128<int> right = Sse2.ConvertToVector128Int32(Sse.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));
Vector128<short> row = Sse2.PackSignedSaturate(left, right);
Unsafe.Add(ref destBase, i / 2) = row;
}
}
private void TransposeInplace_Avx()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
Vector256<float> r0 = Avx.InsertVector128(
this.V0,
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r1 = Avx.InsertVector128(
this.V1,
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);
Vector256<float> r2 = Avx.InsertVector128(
this.V2,
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);
Vector256<float> r3 = Avx.InsertVector128(
this.V3,
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);
Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);
Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);
Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);
Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
this.V0 = Avx.Blend(t0, v, 0xCC);
this.V1 = Avx.Blend(t2, v, 0x33);
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
this.V4 = Avx.Blend(t4, v, 0xCC);
this.V5 = Avx.Blend(t6, v, 0x33);
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
this.V2 = Avx.Blend(t1, v, 0xCC);
this.V3 = Avx.Blend(t3, v, 0x33);
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
this.V6 = Avx.Blend(t5, v, 0xCC);
this.V7 = Avx.Blend(t7, v, 0x33);
}
}

src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs (93 lines changed)

@@ -0,0 +1,93 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
/// <content>
/// <see cref="Vector128{Single}"/> version of <see cref="Block8x8F"/>.
/// </content>
internal partial struct Block8x8F
{
/// <summary>
/// <see cref="Vector128{Single}"/> version of <see cref="NormalizeColorsInPlace(float)"/> and <see cref="RoundInPlace()"/>.
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector128(float maximum)
{
Vector128<float> max = Vector128.Create(maximum);
Vector128<float> off = Vector128.Ceiling(max * .5F);
this.V0L = NormalizeAndRoundVector128(this.V0L.AsVector128(), off, max).AsVector4();
this.V0R = NormalizeAndRoundVector128(this.V0R.AsVector128(), off, max).AsVector4();
this.V1L = NormalizeAndRoundVector128(this.V1L.AsVector128(), off, max).AsVector4();
this.V1R = NormalizeAndRoundVector128(this.V1R.AsVector128(), off, max).AsVector4();
this.V2L = NormalizeAndRoundVector128(this.V2L.AsVector128(), off, max).AsVector4();
this.V2R = NormalizeAndRoundVector128(this.V2R.AsVector128(), off, max).AsVector4();
this.V3L = NormalizeAndRoundVector128(this.V3L.AsVector128(), off, max).AsVector4();
this.V3R = NormalizeAndRoundVector128(this.V3R.AsVector128(), off, max).AsVector4();
this.V4L = NormalizeAndRoundVector128(this.V4L.AsVector128(), off, max).AsVector4();
this.V4R = NormalizeAndRoundVector128(this.V4R.AsVector128(), off, max).AsVector4();
this.V5L = NormalizeAndRoundVector128(this.V5L.AsVector128(), off, max).AsVector4();
this.V5R = NormalizeAndRoundVector128(this.V5R.AsVector128(), off, max).AsVector4();
this.V6L = NormalizeAndRoundVector128(this.V6L.AsVector128(), off, max).AsVector4();
this.V6R = NormalizeAndRoundVector128(this.V6R.AsVector128(), off, max).AsVector4();
this.V7L = NormalizeAndRoundVector128(this.V7L.AsVector128(), off, max).AsVector4();
this.V7R = NormalizeAndRoundVector128(this.V7R.AsVector128(), off, max).AsVector4();
}
/// <summary>
/// Loads values from <paramref name="source"/> using extended AVX2 intrinsics.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedVector128(ref Block8x8 source)
{
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
ref Vector128<short> srcBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref source);
ref Vector128<float> destBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this);
// Only 8 iterations, one per 128b short block
for (nuint i = 0; i < 8; i++)
{
Vector128<short> src = Unsafe.Add(ref srcBase, i);
// Step 1: Widen short -> int
Vector128<int> lower = Vector128.WidenLower(src); // lower 4 shorts -> 4 ints
Vector128<int> upper = Vector128.WidenUpper(src); // upper 4 shorts -> 4 ints
// Step 2: Convert int -> float
Vector128<float> lowerF = Vector128.ConvertToSingle(lower);
Vector128<float> upperF = Vector128.ConvertToSingle(upper);
// Step 3: Store to destination (8 floats per iteration -> two Vector128<float> blocks)
Unsafe.Add(ref destBase, (i * 2) + 0) = lowerF;
Unsafe.Add(ref destBase, (i * 2) + 1) = upperF;
}
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<float> NormalizeAndRoundVector128(Vector128<float> value, Vector128<float> off, Vector128<float> max)
=> Vector128_.RoundToNearestInteger(Vector128_.Clamp(value + off, Vector128<float>.Zero, max));
private static void MultiplyIntoInt16Vector128(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
ref Vector128<float> aBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
ref Vector128<float> bBase = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
ref Vector128<short> destBase = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);
for (nuint i = 0; i < 16; i += 2)
{
Vector128<int> left = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
Vector128<int> right = Vector128_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));
Unsafe.Add(ref destBase, i / 2) = Vector128_.PackSignedSaturate(left, right);
}
}
}
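The widen-and-convert step used by LoadFromInt16ExtendedVector128 can be read in isolation. Below is a hedged, self-contained sketch for a single row of 8 Int16 coefficients; the helper name and spans are illustrative only.

using System;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

internal static class WidenConvertSketch
{
    // Converts 8 Int16 coefficients to 8 floats: widen the lower/upper 4 shorts
    // to Int32, then convert each half to single precision.
    public static void ConvertRow(ReadOnlySpan<short> row, Span<float> destination)
    {
        Vector128<short> src = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(row));
        Vector128<float> lower = Vector128.ConvertToSingle(Vector128.WidenLower(src)); // lanes 0..3
        Vector128<float> upper = Vector128.ConvertToSingle(Vector128.WidenUpper(src)); // lanes 4..7
        lower.CopyTo(destination);
        upper.CopyTo(destination.Slice(4));
    }
}

Applying this to each of the eight rows of a Block8x8 reproduces the result of the scalar LoadFromInt16Scalar path.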

157
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs

@ -0,0 +1,157 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
/// <content>
/// <see cref="Vector256{Single}"/> version of <see cref="Block8x8F"/>.
/// </content>
internal partial struct Block8x8F
{
/// <summary>
/// The number of rows in a <see cref="Block8x8F"/>, each holding 8 scalar coefficients.
/// </summary>
public const int RowCount = 8;
#pragma warning disable SA1310 // Field names should not contain underscore
[FieldOffset(0)]
public Vector256<float> V256_0;
[FieldOffset(32)]
public Vector256<float> V256_1;
[FieldOffset(64)]
public Vector256<float> V256_2;
[FieldOffset(96)]
public Vector256<float> V256_3;
[FieldOffset(128)]
public Vector256<float> V256_4;
[FieldOffset(160)]
public Vector256<float> V256_5;
[FieldOffset(192)]
public Vector256<float> V256_6;
[FieldOffset(224)]
public Vector256<float> V256_7;
#pragma warning restore SA1310 // Field names should not contain underscore
/// <summary>
/// <see cref="Vector256{Single}"/> version of <see cref="NormalizeColorsInPlace(float)"/> and <see cref="RoundInPlace()"/>.
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
{
Vector256<float> max = Vector256.Create(maximum);
Vector256<float> off = Vector256.Ceiling(max * .5F);
this.V256_0 = NormalizeAndRoundVector256(this.V256_0, off, max);
this.V256_1 = NormalizeAndRoundVector256(this.V256_1, off, max);
this.V256_2 = NormalizeAndRoundVector256(this.V256_2, off, max);
this.V256_3 = NormalizeAndRoundVector256(this.V256_3, off, max);
this.V256_4 = NormalizeAndRoundVector256(this.V256_4, off, max);
this.V256_5 = NormalizeAndRoundVector256(this.V256_5, off, max);
this.V256_6 = NormalizeAndRoundVector256(this.V256_6, off, max);
this.V256_7 = NormalizeAndRoundVector256(this.V256_7, off, max);
}
/// <summary>
/// Loads values from <paramref name="source"/> using <see cref="Vector256{T}"/> intrinsics.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedVector256(ref Block8x8 source)
{
DebugGuard.IsTrue(
Vector256.IsHardwareAccelerated,
"LoadFromInt16ExtendedVector256 only works on Vector256 compatible architecture!");
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector256<ushort>.Count == 16
// We can process 2 block rows in a single step
Vector256<int> top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 1) = Vector256.ConvertToSingle(bottom);
top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 3) = Vector256.ConvertToSingle(bottom);
top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 5) = Vector256.ConvertToSingle(bottom);
top = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Vector256_.Widen(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Vector256.ConvertToSingle(top);
Unsafe.Add(ref dRef, 7) = Vector256.ConvertToSingle(bottom);
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
=> Vector256_.RoundToNearestInteger(Vector256_.Clamp(value + off, Vector256<float>.Zero, max));
private static unsafe void MultiplyIntoInt16Vector256(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to run this operation!");
ref Vector256<float> aBase = ref a.V256_0;
ref Vector256<float> bBase = ref b.V256_0;
ref Vector256<short> destRef = ref dest.V01;
for (nuint i = 0; i < 8; i += 2)
{
Vector256<int> row0 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 0) * Unsafe.Add(ref bBase, i + 0));
Vector256<int> row1 = Vector256_.ConvertToInt32RoundToEven(Unsafe.Add(ref aBase, i + 1) * Unsafe.Add(ref bBase, i + 1));
Vector256<short> row = Vector256_.PackSignedSaturate(row0, row1);
row = Vector256.Shuffle(row.AsInt32(), Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7)).AsInt16();
Unsafe.Add(ref destRef, i / 2) = row;
}
}
private void TransposeInPlaceVector256()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
Vector256<float> r0 = this.V256_0.WithUpper(this.V4L.AsVector128());
Vector256<float> r1 = this.V256_1.WithUpper(this.V5L.AsVector128());
Vector256<float> r2 = this.V256_2.WithUpper(this.V6L.AsVector128());
Vector256<float> r3 = this.V256_3.WithUpper(this.V7L.AsVector128());
Vector256<float> r4 = this.V0R.AsVector128().ToVector256().WithUpper(this.V4R.AsVector128());
Vector256<float> r5 = this.V1R.AsVector128().ToVector256().WithUpper(this.V5R.AsVector128());
Vector256<float> r6 = this.V2R.AsVector128().ToVector256().WithUpper(this.V6R.AsVector128());
Vector256<float> r7 = this.V3R.AsVector128().ToVector256().WithUpper(this.V7R.AsVector128());
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
this.V256_0 = Avx.Blend(t0, v, 0xCC);
this.V256_1 = Avx.Blend(t2, v, 0x33);
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
this.V256_4 = Avx.Blend(t4, v, 0xCC);
this.V256_5 = Avx.Blend(t6, v, 0x33);
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
this.V256_2 = Avx.Blend(t1, v, 0xCC);
this.V256_3 = Avx.Blend(t3, v, 0x33);
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
this.V256_6 = Avx.Blend(t5, v, 0xCC);
this.V256_7 = Avx.Blend(t7, v, 0x33);
}
}
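MultiplyIntoInt16Vector256 packs two quantized float rows into a single Vector256<short>, but AVX2 packing works per 128-bit lane, which is why the (0, 1, 4, 5, 2, 3, 6, 7) shuffle is applied to restore natural row order. A hedged sketch of the same step written with raw AVX2 intrinsics (rather than the project's Vector256_ helpers) follows; names are illustrative.

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class PackPermuteSketch
{
    // Packs two rows of 8 floats into one Vector256<short> in natural row order.
    // Avx2.PackSignedSaturate interleaves the 128-bit halves of its inputs
    // (r0.lo, r1.lo, r0.hi, r1.hi), so a 64-bit permute with control 0xD8
    // (qword order 0, 2, 1, 3) restores r0 followed by r1.
    public static Vector256<short> PackTwoRows(Vector256<float> row0, Vector256<float> row1)
    {
        Vector256<int> i0 = Avx.ConvertToVector256Int32(row0); // rounds to nearest under default MXCSR
        Vector256<int> i1 = Avx.ConvertToVector256Int32(row1);
        Vector256<short> packed = Avx2.PackSignedSaturate(i0, i1);
        return Avx2.Permute4x64(packed.AsInt64(), 0b_11_01_10_00).AsInt16();
    }
}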

275
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -5,7 +5,6 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text;
using SixLabors.ImageSharp.Common.Helpers;
@ -23,7 +22,6 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// </summary>
public const int Size = 64;
#pragma warning disable SA1600 // ElementsMustBeDocumented
[FieldOffset(0)]
public Vector4 V0L;
[FieldOffset(16)]
@ -63,7 +61,6 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
public Vector4 V7L;
[FieldOffset(240)]
public Vector4 V7R;
#pragma warning restore SA1600 // ElementsMustBeDocumented
/// <summary>
/// Get/Set scalar elements at a given index
@ -157,17 +154,17 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void MultiplyInPlace(float value)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
Vector256<float> valueVec = Vector256.Create(value);
this.V0 = Avx.Multiply(this.V0, valueVec);
this.V1 = Avx.Multiply(this.V1, valueVec);
this.V2 = Avx.Multiply(this.V2, valueVec);
this.V3 = Avx.Multiply(this.V3, valueVec);
this.V4 = Avx.Multiply(this.V4, valueVec);
this.V5 = Avx.Multiply(this.V5, valueVec);
this.V6 = Avx.Multiply(this.V6, valueVec);
this.V7 = Avx.Multiply(this.V7, valueVec);
this.V256_0 *= valueVec;
this.V256_1 *= valueVec;
this.V256_2 *= valueVec;
this.V256_3 *= valueVec;
this.V256_4 *= valueVec;
this.V256_5 *= valueVec;
this.V256_6 *= valueVec;
this.V256_7 *= valueVec;
}
else
{
@ -198,16 +195,16 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public unsafe void MultiplyInPlace(ref Block8x8F other)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
this.V0 = Avx.Multiply(this.V0, other.V0);
this.V1 = Avx.Multiply(this.V1, other.V1);
this.V2 = Avx.Multiply(this.V2, other.V2);
this.V3 = Avx.Multiply(this.V3, other.V3);
this.V4 = Avx.Multiply(this.V4, other.V4);
this.V5 = Avx.Multiply(this.V5, other.V5);
this.V6 = Avx.Multiply(this.V6, other.V6);
this.V7 = Avx.Multiply(this.V7, other.V7);
this.V256_0 *= other.V256_0;
this.V256_1 *= other.V256_1;
this.V256_2 *= other.V256_2;
this.V256_3 *= other.V256_3;
this.V256_4 *= other.V256_4;
this.V256_5 *= other.V256_5;
this.V256_6 *= other.V256_6;
this.V256_7 *= other.V256_7;
}
else
{
@ -237,17 +234,17 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void AddInPlace(float value)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
Vector256<float> valueVec = Vector256.Create(value);
this.V0 = Avx.Add(this.V0, valueVec);
this.V1 = Avx.Add(this.V1, valueVec);
this.V2 = Avx.Add(this.V2, valueVec);
this.V3 = Avx.Add(this.V3, valueVec);
this.V4 = Avx.Add(this.V4, valueVec);
this.V5 = Avx.Add(this.V5, valueVec);
this.V6 = Avx.Add(this.V6, valueVec);
this.V7 = Avx.Add(this.V7, valueVec);
this.V256_0 += valueVec;
this.V256_1 += valueVec;
this.V256_2 += valueVec;
this.V256_3 += valueVec;
this.V256_4 += valueVec;
this.V256_5 += valueVec;
this.V256_6 += valueVec;
this.V256_7 += valueVec;
}
else
{
@ -279,15 +276,15 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// <param name="qt">The quantization table.</param>
public static void Quantize(ref Block8x8F block, ref Block8x8 dest, ref Block8x8F qt)
{
if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
MultiplyIntoInt16_Avx2(ref block, ref qt, ref dest);
MultiplyIntoInt16Vector256(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingAvx2(ref dest);
}
else if (Ssse3.IsSupported)
else if (Vector128.IsHardwareAccelerated)
{
MultiplyIntoInt16_Sse2(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingSsse3(ref dest);
MultiplyIntoInt16Vector128(ref block, ref qt, ref dest);
ZigZag.ApplyTransposingZigZagOrderingVector128(ref dest);
}
else
{
@ -332,9 +329,13 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// <param name="maximum">The maximum value.</param>
public void NormalizeColorsAndRoundInPlace(float maximum)
{
if (SimdUtils.HasVector8)
if (Vector256.IsHardwareAccelerated)
{
this.NormalizeColorsAndRoundInPlaceVector8(maximum);
this.NormalizeColorsAndRoundInPlaceVector256(maximum);
}
else if (Vector128.IsHardwareAccelerated)
{
this.NormalizeColorsAndRoundInPlaceVector128(maximum);
}
else
{
@ -343,17 +344,32 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
}
}
public void DE_NormalizeColors(float maximum)
/// <summary>
/// Level shift by +maximum/2, clip to [0, maximum]
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
public void NormalizeColorsInPlace(float maximum)
{
if (SimdUtils.HasVector8)
{
this.NormalizeColorsAndRoundInPlaceVector8(maximum);
}
else
{
this.NormalizeColorsInPlace(maximum);
this.RoundInPlace();
}
Vector4 min = Vector4.Zero;
Vector4 max = new(maximum);
Vector4 off = new(MathF.Ceiling(maximum * 0.5F));
this.V0L = Vector4.Clamp(this.V0L + off, min, max);
this.V0R = Vector4.Clamp(this.V0R + off, min, max);
this.V1L = Vector4.Clamp(this.V1L + off, min, max);
this.V1R = Vector4.Clamp(this.V1R + off, min, max);
this.V2L = Vector4.Clamp(this.V2L + off, min, max);
this.V2R = Vector4.Clamp(this.V2R + off, min, max);
this.V3L = Vector4.Clamp(this.V3L + off, min, max);
this.V3R = Vector4.Clamp(this.V3R + off, min, max);
this.V4L = Vector4.Clamp(this.V4L + off, min, max);
this.V4R = Vector4.Clamp(this.V4R + off, min, max);
this.V5L = Vector4.Clamp(this.V5L + off, min, max);
this.V5R = Vector4.Clamp(this.V5R + off, min, max);
this.V6L = Vector4.Clamp(this.V6L + off, min, max);
this.V6R = Vector4.Clamp(this.V6R + off, min, max);
this.V7L = Vector4.Clamp(this.V7L + off, min, max);
this.V7R = Vector4.Clamp(this.V7R + off, min, max);
}
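Per scalar coefficient the normalization above is clamp(value + ceil(maximum / 2), 0, maximum). A one-line scalar sketch of the same step (hypothetical helper, shown for readability only, assuming System is in scope):

// Scalar equivalent of the per-lane work done by NormalizeColorsInPlace.
static float NormalizeColor(float value, float maximum)
    => Math.Clamp(value + MathF.Ceiling(maximum * 0.5F), 0F, maximum);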
/// <summary>
@ -370,9 +386,14 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void LoadFrom(ref Block8x8 source)
{
if (SimdUtils.HasVector8)
if (Vector256.IsHardwareAccelerated)
{
this.LoadFromInt16ExtendedAvx2(ref source);
this.LoadFromInt16ExtendedVector256(ref source);
return;
}
else if (Vector128.IsHardwareAccelerated)
{
this.LoadFromInt16ExtendedVector128(ref source);
return;
}
@ -380,39 +401,84 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
}
/// <summary>
/// Loads values from <paramref name="source"/> using extended AVX2 intrinsics.
/// Fill the block from <paramref name="source"/> doing short -&gt; float conversion.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
/// <param name="source">The source block</param>
public void LoadFromInt16Scalar(ref Block8x8 source)
{
DebugGuard.IsTrue(
Avx2.IsSupported,
"LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector256<ushort>.Count == 16 on AVX2
// We can process 2 block rows in a single step
Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);
this.V0L.X = Unsafe.Add(ref selfRef, 0);
this.V0L.Y = Unsafe.Add(ref selfRef, 1);
this.V0L.Z = Unsafe.Add(ref selfRef, 2);
this.V0L.W = Unsafe.Add(ref selfRef, 3);
this.V0R.X = Unsafe.Add(ref selfRef, 4);
this.V0R.Y = Unsafe.Add(ref selfRef, 5);
this.V0R.Z = Unsafe.Add(ref selfRef, 6);
this.V0R.W = Unsafe.Add(ref selfRef, 7);
this.V1L.X = Unsafe.Add(ref selfRef, 8);
this.V1L.Y = Unsafe.Add(ref selfRef, 9);
this.V1L.Z = Unsafe.Add(ref selfRef, 10);
this.V1L.W = Unsafe.Add(ref selfRef, 11);
this.V1R.X = Unsafe.Add(ref selfRef, 12);
this.V1R.Y = Unsafe.Add(ref selfRef, 13);
this.V1R.Z = Unsafe.Add(ref selfRef, 14);
this.V1R.W = Unsafe.Add(ref selfRef, 15);
this.V2L.X = Unsafe.Add(ref selfRef, 16);
this.V2L.Y = Unsafe.Add(ref selfRef, 17);
this.V2L.Z = Unsafe.Add(ref selfRef, 18);
this.V2L.W = Unsafe.Add(ref selfRef, 19);
this.V2R.X = Unsafe.Add(ref selfRef, 20);
this.V2R.Y = Unsafe.Add(ref selfRef, 21);
this.V2R.Z = Unsafe.Add(ref selfRef, 22);
this.V2R.W = Unsafe.Add(ref selfRef, 23);
this.V3L.X = Unsafe.Add(ref selfRef, 24);
this.V3L.Y = Unsafe.Add(ref selfRef, 25);
this.V3L.Z = Unsafe.Add(ref selfRef, 26);
this.V3L.W = Unsafe.Add(ref selfRef, 27);
this.V3R.X = Unsafe.Add(ref selfRef, 28);
this.V3R.Y = Unsafe.Add(ref selfRef, 29);
this.V3R.Z = Unsafe.Add(ref selfRef, 30);
this.V3R.W = Unsafe.Add(ref selfRef, 31);
this.V4L.X = Unsafe.Add(ref selfRef, 32);
this.V4L.Y = Unsafe.Add(ref selfRef, 33);
this.V4L.Z = Unsafe.Add(ref selfRef, 34);
this.V4L.W = Unsafe.Add(ref selfRef, 35);
this.V4R.X = Unsafe.Add(ref selfRef, 36);
this.V4R.Y = Unsafe.Add(ref selfRef, 37);
this.V4R.Z = Unsafe.Add(ref selfRef, 38);
this.V4R.W = Unsafe.Add(ref selfRef, 39);
this.V5L.X = Unsafe.Add(ref selfRef, 40);
this.V5L.Y = Unsafe.Add(ref selfRef, 41);
this.V5L.Z = Unsafe.Add(ref selfRef, 42);
this.V5L.W = Unsafe.Add(ref selfRef, 43);
this.V5R.X = Unsafe.Add(ref selfRef, 44);
this.V5R.Y = Unsafe.Add(ref selfRef, 45);
this.V5R.Z = Unsafe.Add(ref selfRef, 46);
this.V5R.W = Unsafe.Add(ref selfRef, 47);
this.V6L.X = Unsafe.Add(ref selfRef, 48);
this.V6L.Y = Unsafe.Add(ref selfRef, 49);
this.V6L.Z = Unsafe.Add(ref selfRef, 50);
this.V6L.W = Unsafe.Add(ref selfRef, 51);
this.V6R.X = Unsafe.Add(ref selfRef, 52);
this.V6R.Y = Unsafe.Add(ref selfRef, 53);
this.V6R.Z = Unsafe.Add(ref selfRef, 54);
this.V6R.W = Unsafe.Add(ref selfRef, 55);
this.V7L.X = Unsafe.Add(ref selfRef, 56);
this.V7L.Y = Unsafe.Add(ref selfRef, 57);
this.V7L.Z = Unsafe.Add(ref selfRef, 58);
this.V7L.W = Unsafe.Add(ref selfRef, 59);
this.V7R.X = Unsafe.Add(ref selfRef, 60);
this.V7R.Y = Unsafe.Add(ref selfRef, 61);
this.V7R.Z = Unsafe.Add(ref selfRef, 62);
this.V7R.W = Unsafe.Add(ref selfRef, 63);
}
/// <summary>
@ -421,17 +487,30 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// <param name="value">Value to compare to.</param>
public bool EqualsToScalar(int value)
{
if (Avx2.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
Vector256<int> targetVector = Vector256.Create(value);
ref Vector256<float> blockStride = ref this.V0;
ref Vector256<float> blockStride = ref this.V256_0;
for (nuint i = 0; i < RowCount; i++)
{
Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector);
if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
if (!Vector256.EqualsAll(Vector256.ConvertToInt32(Unsafe.Add(ref this.V256_0, i)), targetVector))
{
return false;
}
}
return true;
}
if (Vector128.IsHardwareAccelerated)
{
Vector128<int> targetVector = Vector128.Create(value);
ref Vector4 blockStride = ref this.V0L;
for (nuint i = 0; i < RowCount * 2; i++)
{
if (!Vector128.EqualsAll(Vector128.ConvertToInt32(Unsafe.Add(ref this.V0L, i).AsVector128()), targetVector))
{
return false;
}
@ -516,26 +595,27 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
}
/// <summary>
/// Transpose the block inplace.
/// Transpose the block in-place.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
public void TransposeInplace()
public void TransposeInPlace()
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
this.TransposeInplace_Avx();
this.TransposeInPlaceVector256();
}
else
{
this.TransposeInplace_Scalar();
// TODO: Can we provide a Vector128 implementation for this?
this.TransposeInPlace_Scalar();
}
}
/// <summary>
/// Scalar inplace transpose implementation for <see cref="TransposeInplace"/>
/// Scalar in-place transpose implementation for <see cref="TransposeInPlace"/>
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private void TransposeInplace_Scalar()
private void TransposeInPlace_Scalar()
{
ref float elemRef = ref Unsafe.As<Block8x8F, float>(ref this);
@ -581,13 +661,4 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
// row #6
RuntimeUtility.Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector<float> NormalizeAndRound(Vector<float> row, Vector<float> off, Vector<float> max)
{
row += off;
row = Vector.Max(row, Vector<float>.Zero);
row = Vector.Min(row, max);
return row.FastRound();
}
}
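The EqualsToScalar rewrite above swaps Avx2.MoveMask for the cross-platform Vector256.EqualsAll. A hedged sketch of the per-row check it performs (illustrative helper only):

using System.Runtime.Intrinsics;

internal static class EqualsScalarSketch
{
    // True when every float lane of the row, converted to Int32, equals the target value.
    public static bool RowEquals(Vector256<float> row, int value)
        => Vector256.EqualsAll(Vector256.ConvertToInt32(row), Vector256.Create(value));
}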

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs

@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
@ -60,7 +60,7 @@ internal abstract partial class JpegColorConverterBase
ref Vector128<float> b = ref Unsafe.Add(ref srcBlue, i);
// luminosity = (0.299 * r) + (0.587 * g) + (0.114 * b)
Unsafe.Add(ref destLuminance, i) = Vector128Utilities.MultiplyAdd(Vector128Utilities.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
Unsafe.Add(ref destLuminance, i) = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
}
}
}

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

2
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs

@ -4,7 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

142
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs

@ -1,142 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal static partial class FloatingPointDCT
{
/// <summary>
/// Apply floating point FDCT inplace using simd operations.
/// </summary>
/// <param name="block">Input block.</param>
private static void FDCT8x8_Avx(ref Block8x8F block)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process columns
FDCT8x8_1D_Avx(ref block);
// Second pass - process rows
block.TransposeInplace();
FDCT8x8_1D_Avx(ref block);
// Applies 1D floating point FDCT inplace
static void FDCT8x8_1D_Avx(ref Block8x8F block)
{
Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
// Even part
Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
Vector256<float> tmp13 = Avx.Subtract(tmp0, tmp3);
Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11);
var mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1);
// Odd part
tmp10 = Avx.Add(tmp4, tmp5);
tmp11 = Avx.Add(tmp5, tmp6);
tmp12 = Avx.Add(tmp6, tmp7);
Vector256<float> z5 = Avx.Multiply(Avx.Subtract(tmp10, tmp12), Vector256.Create(0.382683433f)); // mm256_F_0_3826
Vector256<float> z2 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
Vector256<float> z4 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
Vector256<float> z3 = Avx.Multiply(tmp11, mm256_F_0_7071);
Vector256<float> z11 = Avx.Add(tmp7, z3);
Vector256<float> z13 = Avx.Subtract(tmp7, z3);
block.V5 = Avx.Add(z13, z2);
block.V3 = Avx.Subtract(z13, z2);
block.V1 = Avx.Add(z11, z4);
block.V7 = Avx.Subtract(z11, z4);
}
}
/// <summary>
/// Apply floating point IDCT inplace using simd operations.
/// </summary>
/// <param name="transposedBlock">Transposed input block.</param>
private static void IDCT8x8_Avx(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Avx.IsSupported, "Avx support is required to execute this operation.");
// First pass - process columns
IDCT8x8_1D_Avx(ref transposedBlock);
// Second pass - process rows
transposedBlock.TransposeInplace();
IDCT8x8_1D_Avx(ref transposedBlock);
// Applies 1D floating point FDCT inplace
static void IDCT8x8_1D_Avx(ref Block8x8F block)
{
// Even part
Vector256<float> tmp0 = block.V0;
Vector256<float> tmp1 = block.V2;
Vector256<float> tmp2 = block.V4;
Vector256<float> tmp3 = block.V6;
Vector256<float> z5 = tmp0;
Vector256<float> tmp10 = Avx.Add(z5, tmp2);
Vector256<float> tmp11 = Avx.Subtract(z5, tmp2);
var mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = Avx.Add(tmp1, tmp3);
Vector256<float> tmp12 = SimdUtils.HwIntrinsics.MultiplySubtract(tmp13, Avx.Subtract(tmp1, tmp3), mm256_F_1_4142);
tmp0 = Avx.Add(tmp10, tmp13);
tmp3 = Avx.Subtract(tmp10, tmp13);
tmp1 = Avx.Add(tmp11, tmp12);
tmp2 = Avx.Subtract(tmp11, tmp12);
// Odd part
Vector256<float> tmp4 = block.V1;
Vector256<float> tmp5 = block.V3;
Vector256<float> tmp6 = block.V5;
Vector256<float> tmp7 = block.V7;
Vector256<float> z13 = Avx.Add(tmp6, tmp5);
Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
Vector256<float> z11 = Avx.Add(tmp4, tmp7);
Vector256<float> z12 = Avx.Subtract(tmp4, tmp7);
tmp7 = Avx.Add(z11, z13);
tmp11 = Avx.Multiply(Avx.Subtract(z11, z13), mm256_F_1_4142);
z5 = Avx.Multiply(Avx.Add(z10, z12), Vector256.Create(1.847759065f)); // mm256_F_1_8477
tmp10 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
tmp12 = SimdUtils.HwIntrinsics.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
tmp6 = Avx.Subtract(tmp12, tmp7);
tmp5 = Avx.Subtract(tmp11, tmp6);
tmp4 = Avx.Subtract(tmp10, tmp5);
block.V0 = Avx.Add(tmp0, tmp7);
block.V7 = Avx.Subtract(tmp0, tmp7);
block.V1 = Avx.Add(tmp1, tmp6);
block.V6 = Avx.Subtract(tmp1, tmp6);
block.V2 = Avx.Add(tmp2, tmp5);
block.V5 = Avx.Subtract(tmp2, tmp5);
block.V3 = Avx.Add(tmp3, tmp4);
block.V4 = Avx.Subtract(tmp3, tmp4);
}
}
}

142
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Vector256.cs

@ -0,0 +1,142 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal static partial class FloatingPointDCT
{
/// <summary>
/// Apply floating point FDCT in place using simd operations.
/// </summary>
/// <param name="block">Input block.</param>
private static void FDCT8x8_Vector256(ref Block8x8F block)
{
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
// First pass - process columns
FDCT8x8_1D_Vector256(ref block);
// Second pass - process rows
block.TransposeInPlace();
FDCT8x8_1D_Vector256(ref block);
// Applies 1D floating point FDCT in place
static void FDCT8x8_1D_Vector256(ref Block8x8F block)
{
Vector256<float> tmp0 = block.V256_0 + block.V256_7;
Vector256<float> tmp7 = block.V256_0 - block.V256_7;
Vector256<float> tmp1 = block.V256_1 + block.V256_6;
Vector256<float> tmp6 = block.V256_1 - block.V256_6;
Vector256<float> tmp2 = block.V256_2 + block.V256_5;
Vector256<float> tmp5 = block.V256_2 - block.V256_5;
Vector256<float> tmp3 = block.V256_3 + block.V256_4;
Vector256<float> tmp4 = block.V256_3 - block.V256_4;
// Even part
Vector256<float> tmp10 = tmp0 + tmp3;
Vector256<float> tmp13 = tmp0 - tmp3;
Vector256<float> tmp11 = tmp1 + tmp2;
Vector256<float> tmp12 = tmp1 - tmp2;
block.V256_0 = tmp10 + tmp11;
block.V256_4 = tmp10 - tmp11;
Vector256<float> mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = (tmp12 + tmp13) * mm256_F_0_7071;
block.V256_2 = tmp13 + z1;
block.V256_6 = tmp13 - z1;
// Odd part
tmp10 = tmp4 + tmp5;
tmp11 = tmp5 + tmp6;
tmp12 = tmp6 + tmp7;
Vector256<float> z5 = (tmp10 - tmp12) * Vector256.Create(0.382683433f); // mm256_F_0_3826
Vector256<float> z2 = Vector256_.MultiplyAdd(z5, Vector256.Create(0.541196100f), tmp10); // mm256_F_0_5411
Vector256<float> z4 = Vector256_.MultiplyAdd(z5, Vector256.Create(1.306562965f), tmp12); // mm256_F_1_3065
Vector256<float> z3 = tmp11 * mm256_F_0_7071;
Vector256<float> z11 = tmp7 + z3;
Vector256<float> z13 = tmp7 - z3;
block.V256_5 = z13 + z2;
block.V256_3 = z13 - z2;
block.V256_1 = z11 + z4;
block.V256_7 = z11 - z4;
}
}
/// <summary>
/// Apply floating point IDCT in place using simd operations.
/// </summary>
/// <param name="transposedBlock">Transposed input block.</param>
private static void IDCT8x8_Vector256(ref Block8x8F transposedBlock)
{
DebugGuard.IsTrue(Vector256.IsHardwareAccelerated, "Vector256 support is required to execute this operation.");
// First pass - process columns
IDCT8x8_1D_Vector256(ref transposedBlock);
// Second pass - process rows
transposedBlock.TransposeInPlace();
IDCT8x8_1D_Vector256(ref transposedBlock);
// Applies 1D floating point IDCT in place
static void IDCT8x8_1D_Vector256(ref Block8x8F block)
{
// Even part
Vector256<float> tmp0 = block.V256_0;
Vector256<float> tmp1 = block.V256_2;
Vector256<float> tmp2 = block.V256_4;
Vector256<float> tmp3 = block.V256_6;
Vector256<float> z5 = tmp0;
Vector256<float> tmp10 = z5 + tmp2;
Vector256<float> tmp11 = z5 - tmp2;
Vector256<float> mm256_F_1_4142 = Vector256.Create(1.414213562f);
Vector256<float> tmp13 = tmp1 + tmp3;
Vector256<float> tmp12 = Vector256_.MultiplySubtract(tmp13, tmp1 - tmp3, mm256_F_1_4142);
tmp0 = tmp10 + tmp13;
tmp3 = tmp10 - tmp13;
tmp1 = tmp11 + tmp12;
tmp2 = tmp11 - tmp12;
// Odd part
Vector256<float> tmp4 = block.V256_1;
Vector256<float> tmp5 = block.V256_3;
Vector256<float> tmp6 = block.V256_5;
Vector256<float> tmp7 = block.V256_7;
Vector256<float> z13 = tmp6 + tmp5;
Vector256<float> z10 = tmp6 - tmp5;
Vector256<float> z11 = tmp4 + tmp7;
Vector256<float> z12 = tmp4 - tmp7;
tmp7 = z11 + z13;
tmp11 = (z11 - z13) * mm256_F_1_4142;
z5 = (z10 + z12) * Vector256.Create(1.847759065f); // mm256_F_1_8477
tmp10 = Vector256_.MultiplyAdd(z5, z12, Vector256.Create(-1.082392200f)); // mm256_F_n1_0823
tmp12 = Vector256_.MultiplyAdd(z5, z10, Vector256.Create(-2.613125930f)); // mm256_F_n2_6131
tmp6 = tmp12 - tmp7;
tmp5 = tmp11 - tmp6;
tmp4 = tmp10 - tmp5;
block.V256_0 = tmp0 + tmp7;
block.V256_7 = tmp0 - tmp7;
block.V256_1 = tmp1 + tmp6;
block.V256_6 = tmp1 - tmp6;
block.V256_2 = tmp2 + tmp5;
block.V256_5 = tmp2 - tmp5;
block.V256_3 = tmp3 + tmp4;
block.V256_4 = tmp3 - tmp4;
}
}
}
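Because every Vector256 lane carries one column, the vector code above is easiest to read next to its scalar form. The following is a scalar transcription of the same 1D AAN forward-DCT butterfly, operating on a single 8-sample row in place; it uses the same constants and omits final scaling (a sketch, not library API).

using System;

internal static class FdctSketch
{
    public static void Fdct8(Span<float> s)
    {
        float tmp0 = s[0] + s[7], tmp7 = s[0] - s[7];
        float tmp1 = s[1] + s[6], tmp6 = s[1] - s[6];
        float tmp2 = s[2] + s[5], tmp5 = s[2] - s[5];
        float tmp3 = s[3] + s[4], tmp4 = s[3] - s[4];

        // Even part
        float tmp10 = tmp0 + tmp3, tmp13 = tmp0 - tmp3;
        float tmp11 = tmp1 + tmp2, tmp12 = tmp1 - tmp2;
        s[0] = tmp10 + tmp11;
        s[4] = tmp10 - tmp11;
        float z1 = (tmp12 + tmp13) * 0.707106781f;
        s[2] = tmp13 + z1;
        s[6] = tmp13 - z1;

        // Odd part
        tmp10 = tmp4 + tmp5;
        tmp11 = tmp5 + tmp6;
        tmp12 = tmp6 + tmp7;
        float z5 = (tmp10 - tmp12) * 0.382683433f;
        float z2 = z5 + (0.541196100f * tmp10);
        float z4 = z5 + (1.306562965f * tmp12);
        float z3 = tmp11 * 0.707106781f;
        float z11 = tmp7 + z3;
        float z13 = tmp7 - z3;
        s[5] = z13 + z2;
        s[3] = z13 - z2;
        s[1] = z11 + z4;
        s[7] = z11 - z4;
    }
}

Running Fdct8 over the eight rows, transposing, and running it again mirrors the two passes of FDCT8x8_Vector256.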

22
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.cs

@ -4,7 +4,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@ -77,7 +77,7 @@ internal static partial class FloatingPointDCT
// Spectral macroblocks are transposed before quantization
// so we must transpose quantization table
quantTable.TransposeInplace();
quantTable.TransposeInPlace();
}
/// <summary>
@ -97,11 +97,11 @@ internal static partial class FloatingPointDCT
// Spectral macroblocks are not transposed before quantization
// Transpose is done after quantization at zig-zag stage
// so we must transpose quantization table
quantTable.TransposeInplace();
quantTable.TransposeInPlace();
}
/// <summary>
/// Apply 2D floating point IDCT inplace.
/// Apply 2D floating point IDCT in place.
/// </summary>
/// <remarks>
/// Input block must be dequantized with quantization table
@ -110,9 +110,9 @@ internal static partial class FloatingPointDCT
/// <param name="block">Input block.</param>
public static void TransformIDCT(ref Block8x8F block)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
IDCT8x8_Avx(ref block);
IDCT8x8_Vector256(ref block);
}
else
{
@ -121,7 +121,7 @@ internal static partial class FloatingPointDCT
}
/// <summary>
/// Apply 2D floating point IDCT inplace.
/// Apply 2D floating point IDCT in place.
/// </summary>
/// <remarks>
/// Input block must be quantized after this method with quantization
@ -130,9 +130,9 @@ internal static partial class FloatingPointDCT
/// <param name="block">Input block.</param>
public static void TransformFDCT(ref Block8x8F block)
{
if (Avx.IsSupported)
if (Vector256.IsHardwareAccelerated)
{
FDCT8x8_Avx(ref block);
FDCT8x8_Vector256(ref block);
}
else
{
@ -155,7 +155,7 @@ internal static partial class FloatingPointDCT
IDCT8x4_Vector4(ref transposedBlock.V0R);
// Second pass - process rows
transposedBlock.TransposeInplace();
transposedBlock.TransposeInPlace();
IDCT8x4_Vector4(ref transposedBlock.V0L);
IDCT8x4_Vector4(ref transposedBlock.V0R);
@ -225,7 +225,7 @@ internal static partial class FloatingPointDCT
FDCT8x4_Vector4(ref block.V0R);
// Second pass - process rows
block.TransposeInplace();
block.TransposeInPlace();
FDCT8x4_Vector4(ref block.V0L);
FDCT8x4_Vector4(ref block.V0R);

2
src/ImageSharp/Formats/Jpeg/Components/ScaledFloatingPointDCT.cs

@ -48,7 +48,7 @@ internal static class ScaledFloatingPointDCT
// Spectral macroblocks are transposed before quantization
// so we must transpose quantization table
quantTable.TransposeInplace();
quantTable.TransposeInPlace();
}
/// <summary>

135
src/ImageSharp/Formats/Jpeg/Components/ZigZag.Intrinsic.cs

@ -1,6 +1,9 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
@ -17,11 +20,11 @@ internal static partial class ZigZag
#pragma warning restore SA1309
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingSsse3"/>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingVector128"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> SseShuffleMasks => new byte[]
{
private static ReadOnlySpan<byte> SseShuffleMasks =>
[
#pragma warning disable SA1515
/* row0 - A0 B0 A1 A2 B1 C0 D0 C1 */
// A
@ -83,14 +86,14 @@ internal static partial class ZigZag
// H
_, _, _, _, _, _, _, _, 10, 11, 12, 13, _, _, 14, 15,
#pragma warning restore SA1515
};
];
/// <summary>
/// Gets shuffle vectors for <see cref="ApplyTransposingZigZagOrderingAvx2"/>
/// zig zag implementation.
/// </summary>
private static ReadOnlySpan<byte> AvxShuffleMasks => new byte[]
{
private static ReadOnlySpan<byte> AvxShuffleMasks =>
[
#pragma warning disable SA1515
/* 01 */
// [cr] crln_01_AB_CD
@ -138,15 +141,15 @@ internal static partial class ZigZag
// (in) GH
_, _, _, _, _, _, _, _, 0, 1, 10, 11, 12, 13, 2, 3, _, _, _, _, _, _, 0, 1, 6, 7, 8, 9, 2, 3, 10, 11,
#pragma warning restore SA1515
};
];
/// <summary>
/// Applies zig zag ordering for given 8x8 matrix using SSE cpu intrinsics.
/// Applies zig zag ordering for the given 8x8 matrix using <see cref="Vector128{T}"/> cpu intrinsics.
/// </summary>
/// <param name="block">Input matrix.</param>
public static unsafe void ApplyTransposingZigZagOrderingSsse3(ref Block8x8 block)
public static unsafe void ApplyTransposingZigZagOrderingVector128(ref Block8x8 block)
{
DebugGuard.IsTrue(Ssse3.IsSupported, "Ssse3 support is required to run this operation!");
DebugGuard.IsTrue(Vector128.IsHardwareAccelerated, "Vector128 support is required to run this operation!");
fixed (byte* shuffleVectorsPtr = &MemoryMarshal.GetReference(SseShuffleMasks))
{
@ -160,68 +163,68 @@ internal static partial class ZigZag
Vector128<byte> rowH = block.V7.AsByte();
// row0 - A0 B0 A1 A2 B1 C0 D0 C1
Vector128<short> row0_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = Sse2.Or(Sse2.Or(row0_A, row0_B), row0_C);
row0 = Sse2.Insert(row0.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 0), 6).AsInt16();
Vector128<short> row0_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 0))).AsInt16();
Vector128<short> row0_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 1))).AsInt16();
Vector128<short> row0_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 2))).AsInt16();
Vector128<short> row0 = row0_A | row0_B | row0_C;
row0 = row0.AsUInt16().WithElement(6, rowD.AsUInt16().GetElement(0)).AsInt16();
// row1 - B2 A3 A4 B3 C2 D1 E0 F0
Vector128<short> row1_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = Sse2.Or(row1_A, row1_B);
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 2), 4).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 1), 5).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 0), 6).AsInt16();
row1 = Sse2.Insert(row1.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 0), 7).AsInt16();
Vector128<short> row1_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 3))).AsInt16();
Vector128<short> row1_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 4))).AsInt16();
Vector128<short> row1 = row1_A | row1_B;
row1 = row1.AsUInt16().WithElement(4, rowC.AsUInt16().GetElement(2)).AsInt16();
row1 = row1.AsUInt16().WithElement(5, rowD.AsUInt16().GetElement(1)).AsInt16();
row1 = row1.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(0)).AsInt16();
row1 = row1.AsUInt16().WithElement(7, rowF.AsUInt16().GetElement(0)).AsInt16();
// row2 - E1 D2 C3 B4 A5 A6 B5 C4
Vector128<short> row2_A = Ssse3.Shuffle(rowA, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = Sse2.Or(Sse2.Or(row2_A, row2_B), row2_C);
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 2), 1).AsInt16();
row2 = Sse2.Insert(row2.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 1), 0).AsInt16();
Vector128<short> row2_A = ZShuffle(rowA, Vector128.Load(shuffleVectorsPtr + (16 * 5))).AsInt16();
Vector128<short> row2_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 6))).AsInt16();
Vector128<short> row2_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 7))).AsInt16();
Vector128<short> row2 = row2_A | row2_B | row2_C;
row2 = row2.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(2)).AsInt16();
row2 = row2.AsUInt16().WithElement(0, rowE.AsUInt16().GetElement(1)).AsInt16();
// row3 - D3 E2 F1 G0 H0 G1 F2 E3
Vector128<short> row3_E = Ssse3.Shuffle(rowE, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = Sse2.Or(Sse2.Or(row3_E, row3_F), row3_G);
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 3), 0).AsInt16();
row3 = Sse2.Insert(row3.AsUInt16(), Sse2.Extract(rowH.AsUInt16(), 0), 4).AsInt16();
Vector128<short> row3_E = ZShuffle(rowE, Vector128.Load(shuffleVectorsPtr + (16 * 8))).AsInt16();
Vector128<short> row3_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 9))).AsInt16();
Vector128<short> row3_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 10))).AsInt16();
Vector128<short> row3 = row3_E | row3_F | row3_G;
row3 = row3.AsUInt16().WithElement(0, rowD.AsUInt16().GetElement(3)).AsInt16();
row3 = row3.AsUInt16().WithElement(4, rowH.AsUInt16().GetElement(0)).AsInt16();
// row4 - D4 C5 B6 A7 B7 C6 D5 E4
Vector128<short> row4_B = Ssse3.Shuffle(rowB, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = Ssse3.Shuffle(rowC, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = Ssse3.Shuffle(rowD, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = Sse2.Or(Sse2.Or(row4_B, row4_C), row4_D);
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowA.AsUInt16(), 7), 3).AsInt16();
row4 = Sse2.Insert(row4.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 4), 7).AsInt16();
Vector128<short> row4_B = ZShuffle(rowB, Vector128.Load(shuffleVectorsPtr + (16 * 11))).AsInt16();
Vector128<short> row4_C = ZShuffle(rowC, Vector128.Load(shuffleVectorsPtr + (16 * 12))).AsInt16();
Vector128<short> row4_D = ZShuffle(rowD, Vector128.Load(shuffleVectorsPtr + (16 * 13))).AsInt16();
Vector128<short> row4 = row4_B | row4_C | row4_D;
row4 = row4.AsUInt16().WithElement(3, rowA.AsUInt16().GetElement(7)).AsInt16();
row4 = row4.AsUInt16().WithElement(7, rowE.AsUInt16().GetElement(4)).AsInt16();
// row5 - F3 G2 H1 H2 G3 F4 E5 D6
Vector128<short> row5_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = Sse2.Or(Sse2.Or(row5_F, row5_G), row5_H);
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 6), 7).AsInt16();
row5 = Sse2.Insert(row5.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 5), 6).AsInt16();
Vector128<short> row5_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 14))).AsInt16();
Vector128<short> row5_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 15))).AsInt16();
Vector128<short> row5_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 16))).AsInt16();
Vector128<short> row5 = row5_F | row5_G | row5_H;
row5 = row5.AsUInt16().WithElement(7, rowD.AsUInt16().GetElement(6)).AsInt16();
row5 = row5.AsUInt16().WithElement(6, rowE.AsUInt16().GetElement(5)).AsInt16();
// row6 - C7 D7 E6 F5 G4 H3 H4 G5
Vector128<short> row6_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = Sse2.Or(row6_G, row6_H);
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowC.AsUInt16(), 7), 0).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowD.AsUInt16(), 7), 1).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 6), 2).AsInt16();
row6 = Sse2.Insert(row6.AsUInt16(), Sse2.Extract(rowF.AsUInt16(), 5), 3).AsInt16();
Vector128<short> row6_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 17))).AsInt16();
Vector128<short> row6_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 18))).AsInt16();
Vector128<short> row6 = row6_G | row6_H;
row6 = row6.AsUInt16().WithElement(0, rowC.AsUInt16().GetElement(7)).AsInt16();
row6 = row6.AsUInt16().WithElement(1, rowD.AsUInt16().GetElement(7)).AsInt16();
row6 = row6.AsUInt16().WithElement(2, rowE.AsUInt16().GetElement(6)).AsInt16();
row6 = row6.AsUInt16().WithElement(3, rowF.AsUInt16().GetElement(5)).AsInt16();
// row7 - F6 E7 F7 G6 H5 H6 G7 H7
Vector128<short> row7_F = Ssse3.Shuffle(rowF, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = Ssse3.Shuffle(rowG, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = Ssse3.Shuffle(rowH, Sse2.LoadVector128(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = Sse2.Or(Sse2.Or(row7_F, row7_G), row7_H);
row7 = Sse2.Insert(row7.AsUInt16(), Sse2.Extract(rowE.AsUInt16(), 7), 1).AsInt16();
Vector128<short> row7_F = ZShuffle(rowF, Vector128.Load(shuffleVectorsPtr + (16 * 19))).AsInt16();
Vector128<short> row7_G = ZShuffle(rowG, Vector128.Load(shuffleVectorsPtr + (16 * 20))).AsInt16();
Vector128<short> row7_H = ZShuffle(rowH, Vector128.Load(shuffleVectorsPtr + (16 * 21))).AsInt16();
Vector128<short> row7 = row7_F | row7_G | row7_H;
row7 = row7.AsUInt16().WithElement(1, rowE.AsUInt16().GetElement(7)).AsInt16();
block.V0 = row0;
block.V1 = row1;
@ -300,4 +303,20 @@ internal static partial class ZigZag
block.V67 = row67.AsInt16();
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector128<byte> ZShuffle(Vector128<byte> source, Vector128<byte> mask)
{
// For x64 we use the SSSE3 shuffle intrinsic to avoid additional instructions. 3 vs 1.
if (Ssse3.IsSupported)
{
return Ssse3.Shuffle(source, mask);
}
// For ARM and WASM, codegen will be optimal.
return Vector128.Shuffle(source, mask);
}
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
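ZShuffle keeps the single-instruction pshufb path on x64 while falling back to the portable Vector128.Shuffle on other platforms. A hedged example of the same dispatch applied to a simple lane-reversal mask (helper and mask are illustrative, not part of the zig-zag tables):

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class ShuffleSketch
{
    // Reverses the 8 Int16 lanes of a vector with a byte-level shuffle mask.
    public static Vector128<short> ReverseLanes(Vector128<short> value)
    {
        Vector128<byte> mask = Vector128.Create(
            (byte)14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);

        Vector128<byte> source = value.AsByte();
        Vector128<byte> shuffled = Ssse3.IsSupported
            ? Ssse3.Shuffle(source, mask)       // single pshufb on x64
            : Vector128.Shuffle(source, mask);  // portable path for ARM/WASM

        return shuffled.AsInt16();
    }
}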

6
src/ImageSharp/Formats/Webp/AlphaDecoder.cs

@ -326,11 +326,11 @@ internal class AlphaDecoder : IDisposable
{
Vector128<long> a0 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, i)), 0);
Vector128<byte> a1 = a0.AsByte() + last.AsByte();
Vector128<byte> a2 = Vector128Utilities.ShiftLeftBytesInVector(a1, 1);
Vector128<byte> a2 = Vector128_.ShiftLeftBytesInVector(a1, 1);
Vector128<byte> a3 = a1 + a2;
Vector128<byte> a4 = Vector128Utilities.ShiftLeftBytesInVector(a3, 2);
Vector128<byte> a4 = Vector128_.ShiftLeftBytesInVector(a3, 2);
Vector128<byte> a5 = a3 + a4;
Vector128<byte> a6 = Vector128Utilities.ShiftLeftBytesInVector(a5, 4);
Vector128<byte> a6 = Vector128_.ShiftLeftBytesInVector(a5, 4);
Vector128<byte> a7 = a5 + a6;
ref byte outputRef = ref Unsafe.Add(ref dstRef, i);

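The three shift-and-add steps above turn 8 alpha deltas into byte-wise prefix (cumulative) sums in logarithmic time: after shifting by 1, 2 and then 4 bytes, each lane holds the sum of itself and all lower lanes. A hedged sketch of the same idea using raw SSE2 byte shifts instead of the cross-platform Vector128_ helper:

using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

internal static class PrefixSumSketch
{
    // Byte-wise prefix sums over the low 8 bytes of a vector (wrapping byte addition).
    public static Vector128<byte> PrefixSum8(Vector128<byte> a)
    {
        a += Sse2.ShiftLeftLogical128BitLane(a, 1); // each lane: sum of 2 consecutive inputs
        a += Sse2.ShiftLeftLogical128BitLane(a, 2); // each lane: sum of 4
        a += Sse2.ShiftLeftLogical128BitLane(a, 4); // each lane: sum of 8
        return a;
    }
}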
18
src/ImageSharp/ImageSharp.csproj

@ -61,16 +61,6 @@
<AutoGen>True</AutoGen>
<DependentUpon>ImageMetadataExtensions.tt</DependentUpon>
</Compile>
<Compile Update="Formats\Jpeg\Components\Block8x8F.Generated.cs">
<DesignTime>True</DesignTime>
<AutoGen>True</AutoGen>
<DependentUpon>Block8x8F.Generated.tt</DependentUpon>
</Compile>
<Compile Update="Formats\Jpeg\Components\Block8x8F.Generated.cs">
<DesignTime>True</DesignTime>
<AutoGen>True</AutoGen>
<DependentUpon>Block8x8F.Generated.tt</DependentUpon>
</Compile>
<Compile Update="PixelFormats\PixelImplementations\PixelOperations\Generated\Abgr32.PixelOperations.Generated.cs">
<DesignTime>True</DesignTime>
<AutoGen>True</AutoGen>
@ -167,14 +157,6 @@
<LastGenOutput>ImageMetadataExtensions.cs</LastGenOutput>
<Generator>TextTemplatingFileGenerator</Generator>
</None>
<None Update="Formats\Jpeg\Components\Block8x8F.Generated.tt">
<Generator>TextTemplatingFileGenerator</Generator>
<LastGenOutput>Block8x8F.Generated.cs</LastGenOutput>
</None>
<None Update="Formats\Jpeg\Components\Block8x8F.Generated.tt">
<Generator>TextTemplatingFileGenerator</Generator>
<LastGenOutput>Block8x8F.Generated.cs</LastGenOutput>
</None>
<None Update="PixelFormats\PixelImplementations\PixelOperations\Generated\Abgr32.PixelOperations.Generated.tt">
<Generator>TextTemplatingFileGenerator</Generator>
<LastGenOutput>Abgr32.PixelOperations.Generated.cs</LastGenOutput>

2
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_LoadFromInt16.cs

@ -32,7 +32,7 @@ public class Block8x8F_LoadFromInt16
public void Scalar() => this.destination.LoadFromInt16Scalar(ref this.source);
[Benchmark]
public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedAvx2(ref this.source);
public void ExtendedAvx2() => this.destination.LoadFromInt16ExtendedVector256(ref this.source);
// RESULT:
// Method | Mean | Error | StdDev | Scaled |

2
tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_Transpose.cs

@ -14,7 +14,7 @@ public class Block8x8F_Transpose
[Benchmark]
public float TransposeInplace()
{
this.source.TransposeInplace();
this.source.TransposeInPlace();
return this.source[0];
}

11
tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs

@ -8,6 +8,7 @@ using SixLabors.ImageSharp.Tests;
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg;
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class DecodeJpeg
{
private JpegDecoder decoder;
@ -21,7 +22,7 @@ public class DecodeJpeg
this.preloadedImageStream = new MemoryStream(bytes);
}
private void GenericBechmark()
private void GenericBenchmark()
{
this.preloadedImageStream.Position = 0;
using Image img = this.decoder.Decode(DecoderOptions.Default, this.preloadedImageStream);
@ -51,16 +52,16 @@ public class DecodeJpeg
}
[Benchmark(Description = "Baseline 4:4:4 Interleaved")]
public void JpegBaselineInterleaved444() => this.GenericBechmark();
public void JpegBaselineInterleaved444() => this.GenericBenchmark();
[Benchmark(Description = "Baseline 4:2:0 Interleaved")]
public void JpegBaselineInterleaved420() => this.GenericBechmark();
public void JpegBaselineInterleaved420() => this.GenericBenchmark();
[Benchmark(Description = "Baseline 4:0:0 (grayscale)")]
public void JpegBaseline400() => this.GenericBechmark();
public void JpegBaseline400() => this.GenericBenchmark();
[Benchmark(Description = "Progressive 4:2:0 Non-Interleaved")]
public void JpegProgressiveNonInterleaved420() => this.GenericBechmark();
public void JpegProgressiveNonInterleaved420() => this.GenericBenchmark();
}
/*

33
tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

@ -34,6 +34,7 @@ public partial class Config
// like `LZCNT`, `BMI1`, or `BMI2`
// `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3`
private const string EnableAES = "DOTNET_EnableAES";
private const string EnableAVX512F = "DOTNET_EnableAVX512F";
private const string EnableAVX = "DOTNET_EnableAVX";
private const string EnableAVX2 = "DOTNET_EnableAVX2";
private const string EnableBMI1 = "DOTNET_EnableBMI1";
@ -76,4 +77,36 @@ public partial class Config
}
}
}
public class HwIntrinsics_SSE_AVX_AVX512F : Config
{
public HwIntrinsics_SSE_AVX_AVX512F()
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
.WithEnvironmentVariables(
new EnvironmentVariable(EnableHWIntrinsic, Off),
new EnvironmentVariable(FeatureSIMD, Off))
.WithId("1. No HwIntrinsics").AsBaseline());
if (Sse.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
.WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
.WithId("2. SSE"));
}
if (Avx.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
.WithEnvironmentVariables(new EnvironmentVariable(EnableAVX512F, Off))
.WithId("3. AVX"));
}
if (Avx512F.IsSupported)
{
this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
.WithId("4. AVX512F"));
}
}
}
}
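A hedged usage sketch of the new job set; the benchmark class and body below are hypothetical, and only the attribute wiring mirrors the existing DecodeJpeg benchmark (the benchmarks project already compiles against the internal Block8x8F type):

using BenchmarkDotNet.Attributes;
using SixLabors.ImageSharp.Formats.Jpeg.Components;

[Config(typeof(Config.HwIntrinsics_SSE_AVX_AVX512F))]
public class TransposeBenchmark
{
    private Block8x8F block;

    [Benchmark]
    public void Transpose() => this.block.TransposeInPlace();
}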

95
tests/ImageSharp.Tests/Formats/Jpg/Block8x8FTests.cs

@ -3,6 +3,7 @@
// Uncomment this to turn unit tests into benchmarks:
// #define BENCHMARKING
using System.Runtime.Intrinsics;
using SixLabors.ImageSharp.Formats.Jpeg.Components;
using SixLabors.ImageSharp.Tests.Formats.Jpg.Utils;
using SixLabors.ImageSharp.Tests.TestUtilities;
@ -24,11 +25,22 @@ public partial class Block8x8FTests : JpegFixture
{
}
private bool SkipOnNonAvx2Runner()
private bool SkipOnNonVector256Runner()
{
if (!SimdUtils.HasVector8)
if (!Vector256.IsHardwareAccelerated)
{
this.Output.WriteLine("AVX2 not supported, skipping!");
this.Output.WriteLine("Vector256 not supported, skipping!");
return true;
}
return false;
}
private bool SkipOnNonVector128Runner()
{
if (!Vector128.IsHardwareAccelerated)
{
this.Output.WriteLine("Vector128 not supported, skipping!");
return true;
}
@ -43,7 +55,7 @@ public partial class Block8x8FTests : JpegFixture
Times,
() =>
{
var block = default(Block8x8F);
Block8x8F block = default;
for (int i = 0; i < Block8x8F.Size; i++)
{
@ -56,7 +68,7 @@ public partial class Block8x8FTests : JpegFixture
sum += block[i];
}
});
Assert.Equal(sum, 64f * 63f * 0.5f);
Assert.Equal(64f * 63f * 0.5f, sum);
}
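The asserted constant is simply the arithmetic series over the 64 indices written into the block; a tiny sanity check:

// Sum of 0..63, matching the asserted 64f * 63f * 0.5f.
int expectedSum = 63 * 64 / 2; // 2016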
[Fact]
@ -81,7 +93,7 @@ public partial class Block8x8FTests : JpegFixture
sum += block[i];
}
});
Assert.Equal(sum, 64f * 63f * 0.5f);
Assert.Equal(64f * 63f * 0.5f, sum);
}
[Fact]
@ -109,7 +121,7 @@ public partial class Block8x8FTests : JpegFixture
}
[Fact]
public void TransposeInplace()
public void TransposeInPlace()
{
static void RunTest()
{
@ -118,7 +130,7 @@ public partial class Block8x8FTests : JpegFixture
Block8x8F block8x8 = Block8x8F.Load(Create8x8FloatData());
block8x8.TransposeInplace();
block8x8.TransposeInPlace();
float[] actual = new float[64];
block8x8.ScaledCopyTo(actual);
@ -172,9 +184,33 @@ public partial class Block8x8FTests : JpegFixture
[Theory]
[InlineData(1)]
[InlineData(2)]
public void NormalizeColorsAndRoundAvx2(int seed)
public void NormalizeColorsAndRoundVector256(int seed)
{
if (this.SkipOnNonVector256Runner())
{
return;
}
Block8x8F source = CreateRandomFloatBlock(-200, 200, seed);
Block8x8F expected = source;
expected.NormalizeColorsInPlace(255);
expected.RoundInPlace();
Block8x8F actual = source;
actual.NormalizeColorsAndRoundInPlaceVector256(255);
this.Output.WriteLine(expected.ToString());
this.Output.WriteLine(actual.ToString());
this.CompareBlocks(expected, actual, 0);
}
[Theory]
[InlineData(1)]
[InlineData(2)]
public void NormalizeColorsAndRoundVector128(int seed)
{
if (this.SkipOnNonAvx2Runner())
if (this.SkipOnNonVector128Runner())
{
return;
}
@ -186,7 +222,7 @@ public partial class Block8x8FTests : JpegFixture
expected.RoundInPlace();
Block8x8F actual = source;
actual.NormalizeColorsAndRoundInPlaceVector8(255);
actual.NormalizeColorsAndRoundInPlaceVector128(255);
this.Output.WriteLine(expected.ToString());
this.Output.WriteLine(actual.ToString());
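For reference, the scalar behaviour both vector tests compare against can be sketched as a single-value helper. This is an assumption-laden sketch: it presumes NormalizeColorsInPlace(255) applies the usual JPEG midpoint shift of 128 and clamps to [0, 255] before RoundInPlace rounds to the nearest integer; the exact constants live in Block8x8F and are not part of this diff:

// Hypothetical scalar reference for one value (assumed midpoint shift + clamp, then round).
static float NormalizeAndRound(float value, float maximum)
    => MathF.Round(Math.Clamp(value + 128f, 0f, maximum));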
@ -206,7 +242,7 @@ public partial class Block8x8FTests : JpegFixture
Block8x8F source = CreateRandomFloatBlock(-2000, 2000, srcSeed);
// Quantization code is used only in jpeg where it's guaranteed that
// qunatization valus are greater than 1
// quantization values are greater than 1
// Quantize method supports negative numbers, but very small values can cause trouble
Block8x8F quant = CreateRandomFloatBlock(1, 2000, qtSeed);
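Conceptually, the operation under test divides each coefficient by the matching quantization-table entry and rounds to the nearest integer. The following is a minimal scalar sketch of that idea only; the library's actual Quantize path is vectorized and may also reorder coefficients, which is not shown here:

// Scalar divide-and-round sketch, not the vectorized Block8x8F implementation.
static void QuantizeScalar(ReadOnlySpan<float> source, Span<float> dest, ReadOnlySpan<float> quant)
{
    for (int i = 0; i < source.Length; i++)
    {
        dest[i] = MathF.Round(source[i] / quant[i]);
    }
}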
@ -240,7 +276,7 @@ public partial class Block8x8FTests : JpegFixture
float[] data = Create8x8RandomFloatData(-1000, 1000);
Block8x8F source = Block8x8F.Load(data);
var dest = default(Block8x8);
Block8x8 dest = default;
source.RoundInto(ref dest);
@ -345,14 +381,14 @@ public partial class Block8x8FTests : JpegFixture
[Fact]
public void LoadFromUInt16Scalar()
{
if (this.SkipOnNonAvx2Runner())
if (this.SkipOnNonVector256Runner())
{
return;
}
short[] data = Create8x8ShortData();
var source = Block8x8.Load(data);
Block8x8 source = Block8x8.Load(data);
Block8x8F dest = default;
dest.LoadFromInt16Scalar(ref source);
@ -363,20 +399,41 @@ public partial class Block8x8FTests : JpegFixture
}
}
[Fact]
public void LoadFromUInt16ExtendedVector128()
{
if (this.SkipOnNonVector128Runner())
{
return;
}
short[] data = Create8x8ShortData();
Block8x8 source = Block8x8.Load(data);
Block8x8F dest = default;
dest.LoadFromInt16ExtendedVector128(ref source);
for (int i = 0; i < Block8x8F.Size; i++)
{
Assert.Equal(data[i], dest[i]);
}
}
[Fact]
public void LoadFromUInt16ExtendedAvx2()
{
if (this.SkipOnNonAvx2Runner())
if (this.SkipOnNonVector256Runner())
{
return;
}
short[] data = Create8x8ShortData();
var source = Block8x8.Load(data);
Block8x8 source = Block8x8.Load(data);
Block8x8F dest = default;
dest.LoadFromInt16ExtendedAvx2(ref source);
dest.LoadFromInt16ExtendedVector256(ref source);
for (int i = 0; i < Block8x8F.Size; i++)
{
@ -405,7 +462,7 @@ public partial class Block8x8FTests : JpegFixture
// 3. DisableAVX + DisableSSE - call fallback code of float implementation
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE);
}
[Theory]

2
tests/ImageSharp.Tests/Formats/Jpg/Block8x8Tests.cs

@ -271,7 +271,7 @@ public class Block8x8Tests : JpegFixture
Block8x8 block8x8 = Block8x8.Load(Create8x8ShortData());
block8x8.TransposeInplace();
block8x8.TransposeInPlace();
short[] actual = new short[64];
block8x8.CopyTo(actual);

12
tests/ImageSharp.Tests/Formats/Jpg/DCTTests.cs

@ -62,7 +62,7 @@ public static class DCTTests
FloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.TransposeInPlace();
srcBlock.MultiplyInPlace(ref dequantMatrix);
// IDCT calculation
@ -95,7 +95,7 @@ public static class DCTTests
FloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.TransposeInPlace();
srcBlock.MultiplyInPlace(ref dequantMatrix);
// IDCT calculation
@ -136,7 +136,7 @@ public static class DCTTests
// testee
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.TransposeInPlace();
FloatingPointDCT.TransformIDCT(ref srcBlock);
float[] actualDest = srcBlock.ToArray();
@ -182,7 +182,7 @@ public static class DCTTests
// testee
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.TransposeInPlace();
ScaledFloatingPointDCT.TransformIDCT_4x4(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue);
Span<float> expectedSpan = expectedDest.AsSpan();
@ -243,7 +243,7 @@ public static class DCTTests
// testee
// IDCT implementation transforms blocks after transposition
srcBlock.TransposeInplace();
srcBlock.TransposeInPlace();
ScaledFloatingPointDCT.TransformIDCT_2x2(ref srcBlock, ref dequantMatrix, NormalizationValue, MaxOutputValue);
Span<float> expectedSpan = expectedDest.AsSpan();
@ -338,7 +338,7 @@ public static class DCTTests
// Second transpose call is done by Quantize step
// Do this manually here just to be compliant with the reference implementation
FloatingPointDCT.TransformFDCT(ref block);
block.TransposeInplace();
block.TransposeInPlace();
// Part of the IDCT calculations is fused into the quantization step
// We must multiply the input block by the adjusted no-quantization matrix
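Taken together, the steps visible in these hunks form the following round-trip order; this is a sketch assembled from the calls shown above, with the block and dequantization-matrix setup assumed:

// FDCT side: transform, then the transpose that Quantize would normally perform.
FloatingPointDCT.TransformFDCT(ref block);
block.TransposeInPlace();

// IDCT side: fold part of the IDCT scaling into the dequantization matrix, multiply, then transform back.
FloatingPointDCT.AdjustToIDCT(ref dequantMatrix);
block.MultiplyInPlace(ref dequantMatrix);
FloatingPointDCT.TransformIDCT(ref block);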

2
tests/ImageSharp.Tests/Formats/Jpg/Utils/LibJpegTools.ComponentData.cs

@ -60,7 +60,7 @@ internal static partial class LibJpegTools
internal void MakeBlock(Block8x8 block, int y, int x)
{
block.TransposeInplace();
block.TransposeInPlace();
this.MakeBlock(block.ToArray(), y, x);
}
