Browse Source

Rename utils, organize Block8x8F

pull/2918/head
James Jackson-South 1 year ago
parent
commit
5125a0480f
  1. 194
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  2. 8
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  3. 16
      src/ImageSharp/Common/Helpers/Vector256Utilities.cs
  4. 30
      src/ImageSharp/Common/Helpers/Vector512Utilities.cs
  5. 145
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs
  6. 183
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs
  7. 66
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs
  8. 191
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs
  9. 221
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  10. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs
  11. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs
  12. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs
  13. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs
  14. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs
  15. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs
  16. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs
  17. 4
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs
  18. 2
      src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs
  19. 64
      src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs
  20. 6
      src/ImageSharp/Formats/Webp/AlphaDecoder.cs
  21. 11
      tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs
  22. 33
      tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

194
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -66,9 +66,9 @@ internal static partial class SimdUtils
ref Span<float> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) ||
(Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) ||
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat))
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat))
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@ -112,9 +112,9 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) ||
(Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) ||
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte))
if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte) ||
(Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte) ||
(Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte))
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@ -158,7 +158,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@ -190,7 +190,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
{
int remainder = source.Length % (Vector128<byte>.Count * 3);
@ -223,7 +223,7 @@ internal static partial class SimdUtils
ref Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
{
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
@ -249,7 +249,7 @@ internal static partial class SimdUtils
Span<float> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat)
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleFloat)
{
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@ -263,21 +263,21 @@ internal static partial class SimdUtils
ref Vector512<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector512<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector512Utilities.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector512_.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat)
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleFloat)
{
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@ -291,21 +291,21 @@ internal static partial class SimdUtils
ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256Utilities.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector256_.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat)
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleFloat)
{
ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
@ -319,17 +319,17 @@ internal static partial class SimdUtils
ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector128<float> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector128Utilities.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
vd0 = Vector128_.Shuffle(vs0, control);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control);
Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), control);
}
}
}
@ -341,7 +341,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte)
if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleByte)
{
Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -359,21 +359,21 @@ internal static partial class SimdUtils
ref Vector512<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector512<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector512Utilities.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector512_.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector512_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector512_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte)
else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleByte)
{
Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -391,21 +391,21 @@ internal static partial class SimdUtils
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector256Utilities.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector256_.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector256_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector256_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte)
else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte)
{
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -423,17 +423,17 @@ internal static partial class SimdUtils
ref Vector128<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector128<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);
vd0 = Vector128Utilities.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
vd0 = Vector128_.Shuffle(vs0, mask);
Unsafe.Add(ref vd0, (nuint)1) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
Unsafe.Add(ref vd0, (nuint)2) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
Unsafe.Add(ref vd0, (nuint)3) = Vector128_.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
Unsafe.Add(ref destinationBase, i) = Vector128_.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
@ -445,11 +445,11 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsRightAlign)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref bytes, control);
@ -467,28 +467,28 @@ internal static partial class SimdUtils
Vector128<byte> v0 = vs;
Vector128<byte> v1 = Unsafe.Add(ref vs, (nuint)1);
Vector128<byte> v2 = Unsafe.Add(ref vs, (nuint)2);
Vector128<byte> v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
Vector128<byte> v3 = Vector128_.ShiftRightBytesInVector(v2, 4);
v2 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128Utilities.AlignRight(v1, v0, 12);
v2 = Vector128_.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v1, v0, 12);
v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16), mask);
v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16), mask);
v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16), mask);
v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16), mask);
v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16), mask);
v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16), mask);
v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16), mask);
v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16), mask);
v0 = Vector128Utilities.Shuffle(v0, maskE);
v1 = Vector128Utilities.Shuffle(v1, maskSlice4Nx16);
v2 = Vector128Utilities.Shuffle(v2, maskE);
v3 = Vector128Utilities.Shuffle(v3, maskSlice4Nx16);
v0 = Vector128_.Shuffle(v0, maskE);
v1 = Vector128_.Shuffle(v1, maskSlice4Nx16);
v2 = Vector128_.Shuffle(v2, maskE);
v3 = Vector128_.Shuffle(v3, maskSlice4Nx16);
v0 = Vector128Utilities.AlignRight(v1, v0, 4);
v3 = Vector128Utilities.AlignRight(v3, v2, 12);
v0 = Vector128_.AlignRight(v1, v0, 4);
v3 = Vector128_.AlignRight(v3, v2, 12);
v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
v1 = Vector128_.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128_.ShiftRightBytesInVector(v2, 4);
v1 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v2, v1, 8);
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, i);
@ -505,7 +505,7 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
{
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@ -527,17 +527,17 @@ internal static partial class SimdUtils
ref Vector128<byte> v0 = ref Unsafe.Add(ref sourceBase, i);
Vector128<byte> v1 = Unsafe.Add(ref v0, 1);
Vector128<byte> v2 = Unsafe.Add(ref v0, 2);
Vector128<byte> v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
Vector128<byte> v3 = Vector128_.ShiftRightBytesInVector(v2, 4);
v2 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128Utilities.AlignRight(v1, v0, 12);
v2 = Vector128_.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v1, v0, 12);
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j);
vd = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 1) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 2) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 3) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16) | fill, mask);
vd = Vector128_.Shuffle(Vector128_.Shuffle(v0, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 1) = Vector128_.Shuffle(Vector128_.Shuffle(v1, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 2) = Vector128_.Shuffle(Vector128_.Shuffle(v2, maskPad4Nx16) | fill, mask);
Unsafe.Add(ref vd, 3) = Vector128_.Shuffle(Vector128_.Shuffle(v3, maskPad4Nx16) | fill, mask);
}
}
}
@ -548,10 +548,10 @@ internal static partial class SimdUtils
Span<byte> destination,
[ConstantExpected] byte control)
{
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte)
if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleByte && Vector128_.SupportsShiftByte)
{
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128<byte> maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@ -574,18 +574,18 @@ internal static partial class SimdUtils
Vector128<byte> v2 = Unsafe.Add(ref vs, 2);
Vector128<byte> v3 = Unsafe.Add(ref vs, 3);
v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, mask), maskE);
v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, mask), maskSlice4Nx16);
v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, mask), maskE);
v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, mask), maskSlice4Nx16);
v0 = Vector128_.Shuffle(Vector128_.Shuffle(v0, mask), maskE);
v1 = Vector128_.Shuffle(Vector128_.Shuffle(v1, mask), maskSlice4Nx16);
v2 = Vector128_.Shuffle(Vector128_.Shuffle(v2, mask), maskE);
v3 = Vector128_.Shuffle(Vector128_.Shuffle(v3, mask), maskSlice4Nx16);
v0 = Vector128Utilities.AlignRight(v1, v0, 4);
v3 = Vector128Utilities.AlignRight(v3, v2, 12);
v0 = Vector128_.AlignRight(v1, v0, 4);
v3 = Vector128_.AlignRight(v3, v2, 12);
v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4);
v1 = Vector128_.ShiftLeftBytesInVector(v1, 4);
v2 = Vector128_.ShiftRightBytesInVector(v2, 4);
v1 = Vector128Utilities.AlignRight(v2, v1, 8);
v1 = Vector128_.AlignRight(v2, v1, 8);
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j);
@ -965,10 +965,10 @@ internal static partial class SimdUtils
Vector512<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector512<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector512<int> w0 = Vector512Utilities.ConvertToInt32RoundToEven(f0);
Vector512<int> w1 = Vector512Utilities.ConvertToInt32RoundToEven(f1);
Vector512<int> w2 = Vector512Utilities.ConvertToInt32RoundToEven(f2);
Vector512<int> w3 = Vector512Utilities.ConvertToInt32RoundToEven(f3);
Vector512<int> w0 = Vector512_.ConvertToInt32RoundToEven(f0);
Vector512<int> w1 = Vector512_.ConvertToInt32RoundToEven(f1);
Vector512<int> w2 = Vector512_.ConvertToInt32RoundToEven(f2);
Vector512<int> w3 = Vector512_.ConvertToInt32RoundToEven(f3);
Vector512<short> u0 = Avx512BW.PackSignedSaturate(w0, w1);
Vector512<short> u1 = Avx512BW.PackSignedSaturate(w2, w3);
@ -999,10 +999,10 @@ internal static partial class SimdUtils
Vector256<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector256<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector256<int> w0 = Vector256Utilities.ConvertToInt32RoundToEven(f0);
Vector256<int> w1 = Vector256Utilities.ConvertToInt32RoundToEven(f1);
Vector256<int> w2 = Vector256Utilities.ConvertToInt32RoundToEven(f2);
Vector256<int> w3 = Vector256Utilities.ConvertToInt32RoundToEven(f3);
Vector256<int> w0 = Vector256_.ConvertToInt32RoundToEven(f0);
Vector256<int> w1 = Vector256_.ConvertToInt32RoundToEven(f1);
Vector256<int> w2 = Vector256_.ConvertToInt32RoundToEven(f2);
Vector256<int> w3 = Vector256_.ConvertToInt32RoundToEven(f3);
Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
@ -1033,15 +1033,15 @@ internal static partial class SimdUtils
Vector128<float> f2 = scale * Unsafe.Add(ref s, 2);
Vector128<float> f3 = scale * Unsafe.Add(ref s, 3);
Vector128<int> w0 = Vector128Utilities.ConvertToInt32RoundToEven(f0);
Vector128<int> w1 = Vector128Utilities.ConvertToInt32RoundToEven(f1);
Vector128<int> w2 = Vector128Utilities.ConvertToInt32RoundToEven(f2);
Vector128<int> w3 = Vector128Utilities.ConvertToInt32RoundToEven(f3);
Vector128<int> w0 = Vector128_.ConvertToInt32RoundToEven(f0);
Vector128<int> w1 = Vector128_.ConvertToInt32RoundToEven(f1);
Vector128<int> w2 = Vector128_.ConvertToInt32RoundToEven(f2);
Vector128<int> w3 = Vector128_.ConvertToInt32RoundToEven(f3);
Vector128<short> u0 = Vector128Utilities.PackSignedSaturate(w0, w1);
Vector128<short> u1 = Vector128Utilities.PackSignedSaturate(w2, w3);
Vector128<short> u0 = Vector128_.PackSignedSaturate(w0, w1);
Vector128<short> u1 = Vector128_.PackSignedSaturate(w2, w3);
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.PackUnsignedSaturate(u0, u1);
Unsafe.Add(ref destinationBase, i) = Vector128_.PackUnsignedSaturate(u0, u1);
}
}
}

8
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -19,7 +19,9 @@ namespace SixLabors.ImageSharp.Common.Helpers;
/// </list>
/// Should only be used if the intrinsics are available.
/// </summary>
internal static class Vector128Utilities
#pragma warning disable SA1649 // File name should match first type name
internal static class Vector128_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle operations are supported.
@ -314,8 +316,8 @@ internal static class Vector128Utilities
return Vector128.Narrow(lefClamped, rightClamped);
}
/// <summary
/// >Restricts a vector between a minimum and a maximum value.
/// <summary>
/// Restricts a vector between a minimum and a maximum value.
/// </summary>
/// <typeparam name="T">The type of the elements in the vector.</typeparam>
/// <param name="value">The vector to restrict.</param>

16
src/ImageSharp/Common/Helpers/Vector256Utilities.cs

@ -17,7 +17,9 @@ namespace SixLabors.ImageSharp.Common.Helpers;
/// </list>
/// Should only be used if the intrinsics are available.
/// </summary>
internal static class Vector256Utilities
#pragma warning disable SA1649 // File name should match first type name
internal static class Vector256_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle byte operations are supported.
@ -152,6 +154,18 @@ internal static class Vector256Utilities
return va + (vm0 * vm1);
}
/// <summary>
/// Limits every element of a vector to the inclusive range described by <paramref name="min"/> and <paramref name="max"/>.
/// </summary>
/// <typeparam name="T">The element type of the vector.</typeparam>
/// <param name="value">The vector whose elements are limited.</param>
/// <param name="min">The per-element lower bound.</param>
/// <param name="max">The per-element upper bound.</param>
/// <returns>The limited <see cref="Vector256{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256<T> Clamp<T>(Vector256<T> value, Vector256<T> min, Vector256<T> max)
{
    // Raise to the lower bound first, then cap at the upper bound.
    Vector256<T> raised = Vector256.Max(value, min);
    return Vector256.Min(raised, max);
}
// Throw helper: always raises UnreachableException and never returns to the caller.
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

30
src/ImageSharp/Common/Helpers/Vector512Utilities.cs

@ -17,7 +17,9 @@ namespace SixLabors.ImageSharp.Common.Helpers;
/// </list>
/// Should only be used if the intrinsics are available.
/// </summary>
internal static class Vector512Utilities
#pragma warning disable SA1649 // File name should match first type name
internal static class Vector512_
#pragma warning restore SA1649 // File name should match first type name
{
/// <summary>
/// Gets a value indicating whether shuffle float operations are supported.
@ -126,6 +128,13 @@ internal static class Vector512Utilities
return Avx512F.RoundScale(vector, 0b0000_1000);
}
if (Avx.IsSupported)
{
Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
return Vector512.Create(lower, upper);
}
Vector512<float> sign = vector & Vector512.Create(-0F);
Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);
@ -152,9 +161,28 @@ internal static class Vector512Utilities
return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
}
if (Fma.IsSupported)
{
Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
return Vector512.Create(lower, upper);
}
return va + (vm0 * vm1);
}
/// <summary>
/// Limits every element of a vector to the inclusive range described by <paramref name="min"/> and <paramref name="max"/>.
/// </summary>
/// <typeparam name="T">The element type of the vector.</typeparam>
/// <param name="value">The vector whose elements are limited.</param>
/// <param name="min">The per-element lower bound.</param>
/// <param name="max">The per-element upper bound.</param>
/// <returns>The limited <see cref="Vector512{T}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector512<T> Clamp<T>(Vector512<T> value, Vector512<T> min, Vector512<T> max)
{
    // Raise to the lower bound first, then cap at the upper bound.
    Vector512<T> raised = Vector512.Max(value, min);
    return Vector512.Min(raised, max);
}
// Throw helper: always raises UnreachableException and never returns to the caller.
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

145
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Intrinsic.cs

@ -1,145 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal partial struct Block8x8F
{
/// <summary>
/// The number of rows, each holding 8 scalar coefficients, in a <see cref="Block8x8F"/>.
/// </summary>
public const int RowCount = 8;
// Eight Vector256<float> fields at explicit byte offsets 0, 32, ..., 224.
// A Vector256<float> is 32 bytes (8 floats), so the fields are laid out back to back
// and together span 256 bytes.
// NOTE(review): these presumably alias the Vector4 V{n}L/V{n}R fields declared in another
// part of this partial struct - confirm against the struct's layout declaration.
[FieldOffset(0)]
public Vector256<float> V0;
[FieldOffset(32)]
public Vector256<float> V1;
[FieldOffset(64)]
public Vector256<float> V2;
[FieldOffset(96)]
public Vector256<float> V3;
[FieldOffset(128)]
public Vector256<float> V4;
[FieldOffset(160)]
public Vector256<float> V5;
[FieldOffset(192)]
public Vector256<float> V6;
[FieldOffset(224)]
public Vector256<float> V7;
// Multiplies blocks a and b element-wise, converts the products to 32-bit integers,
// saturates them to 16 bits and writes the result into dest, two Vector256<float>
// inputs per Vector256<short> output.
private static unsafe void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
    DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

    ref Vector256<float> lhs = ref a.V0;
    ref Vector256<float> rhs = ref b.V0;
    ref Vector256<short> target = ref dest.V01;

    // The pack below works on each 128-bit lane separately; this permutation of the
    // 32-bit elements puts the packed values back into sequential order.
    Vector256<int> laneOrder = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

    // Each iteration consumes two input vectors and produces one output vector.
    for (nuint pair = 0; pair < 4; pair++)
    {
        nuint first = pair * 2;
        nuint second = first + 1;

        Vector256<float> productFirst = Avx.Multiply(Unsafe.Add(ref lhs, first), Unsafe.Add(ref rhs, first));
        Vector256<float> productSecond = Avx.Multiply(Unsafe.Add(ref lhs, second), Unsafe.Add(ref rhs, second));

        Vector256<short> packed = Avx2.PackSignedSaturate(
            Avx.ConvertToVector256Int32(productFirst),
            Avx.ConvertToVector256Int32(productSecond));

        Unsafe.Add(ref target, pair) = Avx2.PermuteVar8x32(packed.AsInt32(), laneOrder).AsInt16();
    }
}
// Multiplies blocks a and b element-wise, converts the products to 32-bit integers,
// saturates them to 16 bits and writes the result into dest, two Vector128<float>
// inputs per Vector128<short> output.
private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
    DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");

    // Reinterpret the blocks as sequences of 128-bit vectors.
    ref Vector128<float> lhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
    ref Vector128<float> rhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
    ref Vector128<short> target = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);

    // TODO: We can use the v128 utilities for this.
    // Sixteen input vectors are consumed in pairs, giving eight output vectors.
    for (nuint row = 0; row < 8; row++)
    {
        nuint first = row * 2;
        nuint second = first + 1;

        Vector128<float> productFirst = Sse.Multiply(Unsafe.Add(ref lhs, first), Unsafe.Add(ref rhs, first));
        Vector128<float> productSecond = Sse.Multiply(Unsafe.Add(ref lhs, second), Unsafe.Add(ref rhs, second));

        Unsafe.Add(ref target, row) = Sse2.PackSignedSaturate(
            Sse2.ConvertToVector128Int32(productFirst),
            Sse2.ConvertToVector128Int32(productSecond));
    }
}
// Transposes this block in place with AVX intrinsics, following the approach in the
// Stack Overflow answer linked below.
// NOTE(review): unlike the MultiplyIntoInt16_* methods there is no Avx.IsSupported
// guard here - presumably the caller checks; confirm.
private void TransposeInplace_Avx()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
// Gather phase: every source value is read into r0..r7 before any of V0..V7 is
// overwritten further down, so the in-place writes cannot clobber unread data.
// r0..r3 start from V0..V3 and get V4L..V7L inserted at 128-bit lane index 1.
Vector256<float> r0 = Avx.InsertVector128(
this.V0,
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r1 = Avx.InsertVector128(
this.V1,
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);
Vector256<float> r2 = Avx.InsertVector128(
this.V2,
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);
Vector256<float> r3 = Avx.InsertVector128(
this.V3,
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);
// r4..r7 start from V0R..V3R widened to 256 bits and get V4R..V7R inserted at
// 128-bit lane index 1.
Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);
Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);
Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);
Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);
// Combine phase: each group of four gathered vectors is unpacked pairwise, one
// shuffle (0x4E) mixes the two unpack results, and two blends (0xCC / 0x33)
// produce a pair of output fields.
// UnpackLow of r0..r3 -> V0, V1.
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
this.V0 = Avx.Blend(t0, v, 0xCC);
this.V1 = Avx.Blend(t2, v, 0x33);
// UnpackLow of r4..r7 -> V4, V5.
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
this.V4 = Avx.Blend(t4, v, 0xCC);
this.V5 = Avx.Blend(t6, v, 0x33);
// UnpackHigh of r0..r3 -> V2, V3.
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
this.V2 = Avx.Blend(t1, v, 0xCC);
this.V3 = Avx.Blend(t3, v, 0x33);
// UnpackHigh of r4..r7 -> V6, V7.
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
this.V6 = Avx.Blend(t5, v, 0xCC);
this.V7 = Avx.Blend(t7, v, 0x33);
}
}

183
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Round.cs

@ -1,183 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
internal partial struct Block8x8F
{
/// <summary>
/// Shifts every coefficient up by <c>ceil(maximum / 2)</c> and then clamps it to [0, maximum].
/// </summary>
/// <param name="maximum">The upper bound of the normalized range.</param>
public void NormalizeColorsInPlace(float maximum)
{
    Vector4 shift = new(MathF.Ceiling(maximum * 0.5F));
    Vector4 lower = Vector4.Zero;
    Vector4 upper = new(maximum);

    // Applies the level shift followed by the clamp to one group of four coefficients.
    static Vector4 Normalize(Vector4 coefficients, Vector4 shift, Vector4 lower, Vector4 upper)
        => Vector4.Clamp(coefficients + shift, lower, upper);

    this.V0L = Normalize(this.V0L, shift, lower, upper);
    this.V0R = Normalize(this.V0R, shift, lower, upper);

    this.V1L = Normalize(this.V1L, shift, lower, upper);
    this.V1R = Normalize(this.V1R, shift, lower, upper);

    this.V2L = Normalize(this.V2L, shift, lower, upper);
    this.V2R = Normalize(this.V2R, shift, lower, upper);

    this.V3L = Normalize(this.V3L, shift, lower, upper);
    this.V3R = Normalize(this.V3R, shift, lower, upper);

    this.V4L = Normalize(this.V4L, shift, lower, upper);
    this.V4R = Normalize(this.V4R, shift, lower, upper);

    this.V5L = Normalize(this.V5L, shift, lower, upper);
    this.V5R = Normalize(this.V5R, shift, lower, upper);

    this.V6L = Normalize(this.V6L, shift, lower, upper);
    this.V6R = Normalize(this.V6R, shift, lower, upper);

    this.V7L = Normalize(this.V7L, shift, lower, upper);
    this.V7R = Normalize(this.V7R, shift, lower, upper);
}
/// <summary>
/// <see cref="Vector256{Single}"/> variant that combines <see cref="NormalizeColorsInPlace(float)"/>
/// with <see cref="RoundInPlace()"/>, processing one 8-float row per step.
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
{
    Vector256<float> shift = Vector256.Create(MathF.Ceiling(maximum * 0.5F));
    Vector256<float> upper = Vector256.Create(maximum);

    // Reinterpret the storage starting at V0L as eight consecutive Vector256<float> rows.
    ref Vector256<float> rows = ref Unsafe.As<Vector4, Vector256<float>>(ref this.V0L);
    for (nuint i = 0; i < 8; i++)
    {
        ref Vector256<float> row = ref Unsafe.Add(ref rows, i);
        row = NormalizeAndRoundVector256(row, shift, upper);
    }
}
/// <summary>
/// <see cref="Vector128{Single}"/> variant that combines <see cref="NormalizeColorsInPlace(float)"/>
/// with <see cref="RoundInPlace()"/>, processing one 4-float half row per step.
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector128(float maximum)
{
    Vector128<float> shift = Vector128.Create(MathF.Ceiling(maximum * 0.5F));
    Vector128<float> upper = Vector128.Create(maximum);

    // Walk the sixteen Vector4 fields (V0L, V0R, ..., V7L, V7R) by reference, starting at V0L.
    ref Vector4 first = ref this.V0L;
    for (nuint i = 0; i < 16; i++)
    {
        ref Vector4 half = ref Unsafe.Add(ref first, i);
        half = NormalizeAndRoundVector128(half.AsVector128(), shift, upper).AsVector4();
    }
}
/// <summary>
/// Copies all 64 values of <paramref name="source"/> into this block, widening each short to a float.
/// </summary>
/// <param name="source">The source block</param>
public void LoadFromInt16Scalar(ref Block8x8 source)
{
    ref short from = ref Unsafe.As<Block8x8, short>(ref source);

    // View this block as 64 consecutive floats (V0L.X, V0L.Y, ..., V7R.W) so element i maps to element i.
    ref float to = ref Unsafe.As<Block8x8F, float>(ref this);

    for (nuint i = 0; i < Size; i++)
    {
        Unsafe.Add(ref to, i) = Unsafe.Add(ref from, i);
    }
}
}

66
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector128.cs

@ -0,0 +1,66 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
/// <content>
/// <see cref="Vector128{Single}"/> version of <see cref="Block8x8F"/>.
/// </content>
internal partial struct Block8x8F
{
/// <summary>
/// <see cref="Vector128{Single}"/> variant that combines <see cref="NormalizeColorsInPlace(float)"/>
/// with <see cref="RoundInPlace()"/>, processing one 4-float half row per step.
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector128(float maximum)
{
    Vector128<float> shift = Vector128.Create(MathF.Ceiling(maximum * 0.5F));
    Vector128<float> upper = Vector128.Create(maximum);

    // View the block as sixteen consecutive Vector128<float> values (V0L, V0R, ..., V7L, V7R).
    ref Vector128<float> first = ref Unsafe.As<Block8x8F, Vector128<float>>(ref this);
    for (nuint i = 0; i < 16; i++)
    {
        ref Vector128<float> half = ref Unsafe.Add(ref first, i);
        half = NormalizeAndRoundVector128(half, shift, upper);
    }
}
/// <summary>
/// Adds <paramref name="off"/> to <paramref name="value"/>, clamps the sum between zero and
/// <paramref name="max"/> via <c>Vector128_.Clamp</c>, then rounds it via <c>Vector128_.RoundToNearestInteger</c>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<float> NormalizeAndRoundVector128(Vector128<float> value, Vector128<float> off, Vector128<float> max)
{
    Vector128<float> shifted = value + off;
    Vector128<float> clamped = Vector128_.Clamp(shifted, Vector128<float>.Zero, max);
    return Vector128_.RoundToNearestInteger(clamped);
}
/// <summary>
/// Multiplies <paramref name="a"/> by <paramref name="b"/> element-wise, converts the products to 32-bit
/// integers and packs them with signed saturation into the 16-bit values of <paramref name="dest"/>.
/// </summary>
private static void MultiplyIntoInt16_Sse2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
    DebugGuard.IsTrue(Sse2.IsSupported, "Sse2 support is required to run this operation!");

    ref Vector128<float> lhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref a);
    ref Vector128<float> rhs = ref Unsafe.As<Block8x8F, Vector128<float>>(ref b);
    ref Vector128<short> output = ref Unsafe.As<Block8x8, Vector128<short>>(ref dest);

    // TODO: We can use the v128 utilities for this.
    // Each destination vector holds 8 shorts, built from two consecutive 4-float source vectors.
    for (nuint row = 0; row < 8; row++)
    {
        nuint lo = row * 2;
        nuint hi = lo + 1;

        Vector128<float> loProduct = Sse.Multiply(Unsafe.Add(ref lhs, lo), Unsafe.Add(ref rhs, lo));
        Vector128<float> hiProduct = Sse.Multiply(Unsafe.Add(ref lhs, hi), Unsafe.Add(ref rhs, hi));

        Unsafe.Add(ref output, row) = Sse2.PackSignedSaturate(
            Sse2.ConvertToVector128Int32(loProduct),
            Sse2.ConvertToVector128Int32(hiProduct));
    }
}
}

191
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.Vector256.cs

@ -0,0 +1,191 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
/// <content>
/// <see cref="Vector256{Single}"/> version of <see cref="Block8x8F"/>.
/// </content>
internal partial struct Block8x8F
{
/// <summary>
/// A number of rows of 8 scalar coefficients each in <see cref="Block8x8F"/>
/// </summary>
public const int RowCount = 8;
// The eight fields below expose the block as Vector256<float> values at explicit byte offsets
// 0, 32, ..., 224, i.e. one field per 32 bytes (8 floats).
// NOTE(review): they presumably alias the Vector4 VxL/VxR fields declared in Block8x8F.cs
// (V0L is at offset 0 and V7R at offset 240 there) — confirm against the full field list.
#pragma warning disable SA1310 // Field names should not contain underscore
[FieldOffset(0)]
public Vector256<float> V256_0;
[FieldOffset(32)]
public Vector256<float> V256_1;
[FieldOffset(64)]
public Vector256<float> V256_2;
[FieldOffset(96)]
public Vector256<float> V256_3;
[FieldOffset(128)]
public Vector256<float> V256_4;
[FieldOffset(160)]
public Vector256<float> V256_5;
[FieldOffset(192)]
public Vector256<float> V256_6;
[FieldOffset(224)]
public Vector256<float> V256_7;
#pragma warning restore SA1310 // Field names should not contain underscore
/// <summary>
/// <see cref="Vector256{Single}"/> variant that combines <see cref="NormalizeColorsInPlace(float)"/>
/// with <see cref="RoundInPlace()"/>, processing one 8-float row per step.
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public void NormalizeColorsAndRoundInPlaceVector256(float maximum)
{
    Vector256<float> shift = Vector256.Create(MathF.Ceiling(maximum * 0.5F));
    Vector256<float> upper = Vector256.Create(maximum);

    // V256_0..V256_7 sit at consecutive 32-byte offsets, so they can be walked by reference.
    ref Vector256<float> rows = ref this.V256_0;
    for (nuint i = 0; i < RowCount; i++)
    {
        ref Vector256<float> row = ref Unsafe.Add(ref rows, i);
        row = NormalizeAndRoundVector256(row, shift, upper);
    }
}
/// <summary>
/// Loads values from <paramref name="source"/> using extended AVX2 intrinsics,
/// widening each 16-bit value to a 32-bit integer and then converting it to a float.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
{
    DebugGuard.IsTrue(
        Avx2.IsSupported,
        "LoadFromInt16ExtendedAvx2 only works on AVX2 compatible architecture!");

    ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
    ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);

    // Each load reads one block row as a Vector128<short> (8 values); the element offset of row n is
    // n * Vector256<int>.Count because a widened row fills exactly one Vector256<int>.
    // Rows are handled in pairs (top/bottom) purely to keep the unrolled code compact.
    Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
    Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
    dRef = Avx.ConvertToVector256Single(top);
    Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);

    top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
    bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
    Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
    Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);

    top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
    bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
    Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
    Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);

    top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
    bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
    Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
    Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
}
/// <summary>
/// Adds <paramref name="off"/> to <paramref name="value"/>, clamps the sum between zero and
/// <paramref name="max"/> via <c>Vector256_.Clamp</c>, then rounds it via <c>Vector256_.RoundToNearestInteger</c>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> value, Vector256<float> off, Vector256<float> max)
{
    Vector256<float> shifted = value + off;
    Vector256<float> clamped = Vector256_.Clamp(shifted, Vector256<float>.Zero, max);
    return Vector256_.RoundToNearestInteger(clamped);
}
/// <summary>
/// Multiplies <paramref name="a"/> by <paramref name="b"/> element-wise, converts the products to 32-bit
/// integers and packs them with signed saturation into the 16-bit rows of <paramref name="dest"/>.
/// </summary>
/// <param name="a">The first factor block.</param>
/// <param name="b">The second factor block.</param>
/// <param name="dest">The block receiving the packed 16-bit results.</param>
private static void MultiplyIntoInt16_Avx2(ref Block8x8F a, ref Block8x8F b, ref Block8x8 dest)
{
    DebugGuard.IsTrue(Avx2.IsSupported, "Avx2 support is required to run this operation!");

    ref Vector256<float> aBase = ref a.V256_0;
    ref Vector256<float> bBase = ref b.V256_0;
    ref Vector256<short> destRef = ref dest.V01;

    // The 256-bit PackSignedSaturate packs within each 128-bit lane, so the two packed rows come out
    // interleaved in 64-bit chunks. This permutation of 32-bit elements restores row order.
    Vector256<int> multiplyIntoInt16ShuffleMask = Vector256.Create(0, 1, 4, 5, 2, 3, 6, 7);

    // Two float rows are consumed per iteration and produce one 16-element destination vector.
    for (nuint i = 0; i < 8; i += 2)
    {
        Vector256<int> row0 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 0), Unsafe.Add(ref bBase, i + 0)));
        Vector256<int> row1 = Avx.ConvertToVector256Int32(Avx.Multiply(Unsafe.Add(ref aBase, i + 1), Unsafe.Add(ref bBase, i + 1)));

        Vector256<short> row = Avx2.PackSignedSaturate(row0, row1);
        row = Avx2.PermuteVar8x32(row.AsInt32(), multiplyIntoInt16ShuffleMask).AsInt16();

        Unsafe.Add(ref destRef, i / 2) = row;
    }
}
/// <summary>
/// Transposes the block in place using AVX insert, unpack, shuffle and blend operations,
/// following the approach described in the Stack Overflow answer linked below.
/// </summary>
private void TransposeInplace_Avx()
{
// https://stackoverflow.com/questions/25622745/transpose-an-8x8-float-using-avx-avx2/25627536#25627536
//
// Stage 1 - gather: all eight inputs are read into locals before the first store below,
// so overwriting the V256_n fields afterwards cannot disturb values that are still needed.
// InsertVector128(x, y, 1) replaces the upper 128 bits of x with y, so
// r0..r3 = [lower half of V256_0..V256_3 | V4L..V7L].
Vector256<float> r0 = Avx.InsertVector128(
this.V256_0,
Unsafe.As<Vector4, Vector128<float>>(ref this.V4L),
1);
Vector256<float> r1 = Avx.InsertVector128(
this.V256_1,
Unsafe.As<Vector4, Vector128<float>>(ref this.V5L),
1);
Vector256<float> r2 = Avx.InsertVector128(
this.V256_2,
Unsafe.As<Vector4, Vector128<float>>(ref this.V6L),
1);
Vector256<float> r3 = Avx.InsertVector128(
this.V256_3,
Unsafe.As<Vector4, Vector128<float>>(ref this.V7L),
1);
// r4..r7 = [V0R..V3R | V4R..V7R]: the 128-bit value is widened first, then its upper half is filled in.
Vector256<float> r4 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V0R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V4R),
1);
Vector256<float> r5 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V1R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V5R),
1);
Vector256<float> r6 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V2R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V6R),
1);
Vector256<float> r7 = Avx.InsertVector128(
Unsafe.As<Vector4, Vector128<float>>(ref this.V3R).ToVector256(),
Unsafe.As<Vector4, Vector128<float>>(ref this.V7R),
1);
// Stage 2 - interleave and store: each group unpacks two pairs of gathered vectors, builds a
// shuffled mix of them (control 0x4E), and blends that mix back in with complementary masks
// (0xCC / 0x33) to produce two output rows.
// UnpackLow of r0..r3 yields V256_0 and V256_1.
Vector256<float> t0 = Avx.UnpackLow(r0, r1);
Vector256<float> t2 = Avx.UnpackLow(r2, r3);
Vector256<float> v = Avx.Shuffle(t0, t2, 0x4E);
this.V256_0 = Avx.Blend(t0, v, 0xCC);
this.V256_1 = Avx.Blend(t2, v, 0x33);
// UnpackLow of r4..r7 yields V256_4 and V256_5.
Vector256<float> t4 = Avx.UnpackLow(r4, r5);
Vector256<float> t6 = Avx.UnpackLow(r6, r7);
v = Avx.Shuffle(t4, t6, 0x4E);
this.V256_4 = Avx.Blend(t4, v, 0xCC);
this.V256_5 = Avx.Blend(t6, v, 0x33);
// UnpackHigh of r0..r3 yields V256_2 and V256_3.
Vector256<float> t1 = Avx.UnpackHigh(r0, r1);
Vector256<float> t3 = Avx.UnpackHigh(r2, r3);
v = Avx.Shuffle(t1, t3, 0x4E);
this.V256_2 = Avx.Blend(t1, v, 0xCC);
this.V256_3 = Avx.Blend(t3, v, 0x33);
// UnpackHigh of r4..r7 yields V256_6 and V256_7.
Vector256<float> t5 = Avx.UnpackHigh(r4, r5);
Vector256<float> t7 = Avx.UnpackHigh(r6, r7);
v = Avx.Shuffle(t5, t7, 0x4E);
this.V256_6 = Avx.Blend(t5, v, 0xCC);
this.V256_7 = Avx.Blend(t7, v, 0x33);
}
}

221
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -8,8 +8,6 @@ using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text;
using SixLabors.ImageSharp.Common.Helpers;
using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
@ -25,7 +23,6 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
/// </summary>
public const int Size = 64;
#pragma warning disable SA1600 // ElementsMustBeDocumented
[FieldOffset(0)]
public Vector4 V0L;
[FieldOffset(16)]
@ -65,7 +62,6 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
public Vector4 V7L;
[FieldOffset(240)]
public Vector4 V7R;
#pragma warning restore SA1600 // ElementsMustBeDocumented
/// <summary>
/// Get/Set scalar elements at a given index
@ -159,18 +155,17 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void MultiplyInPlace(float value)
{
// TODO: Vector512
if (Vector256.IsHardwareAccelerated)
{
Vector256<float> valueVec = Vector256.Create(value);
this.V0 *= valueVec;
this.V1 *= valueVec;
this.V2 *= valueVec;
this.V3 *= valueVec;
this.V4 *= valueVec;
this.V5 *= valueVec;
this.V6 *= valueVec;
this.V7 *= valueVec;
this.V256_0 *= valueVec;
this.V256_1 *= valueVec;
this.V256_2 *= valueVec;
this.V256_3 *= valueVec;
this.V256_4 *= valueVec;
this.V256_5 *= valueVec;
this.V256_6 *= valueVec;
this.V256_7 *= valueVec;
}
else
{
@ -201,17 +196,16 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public unsafe void MultiplyInPlace(ref Block8x8F other)
{
// TODO: Vector512
if (Vector256.IsHardwareAccelerated)
{
this.V0 *= other.V0;
this.V1 *= other.V1;
this.V2 *= other.V2;
this.V3 *= other.V3;
this.V4 *= other.V4;
this.V5 *= other.V5;
this.V6 *= other.V6;
this.V7 *= other.V7;
this.V256_0 *= other.V256_0;
this.V256_1 *= other.V256_1;
this.V256_2 *= other.V256_2;
this.V256_3 *= other.V256_3;
this.V256_4 *= other.V256_4;
this.V256_5 *= other.V256_5;
this.V256_6 *= other.V256_6;
this.V256_7 *= other.V256_7;
}
else
{
@ -241,18 +235,17 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
[MethodImpl(InliningOptions.ShortMethod)]
public void AddInPlace(float value)
{
// TODO: Vector512
if (Vector256.IsHardwareAccelerated)
{
Vector256<float> valueVec = Vector256.Create(value);
this.V0 += valueVec;
this.V1 += valueVec;
this.V2 += valueVec;
this.V3 += valueVec;
this.V4 += valueVec;
this.V5 += valueVec;
this.V6 += valueVec;
this.V7 += valueVec;
this.V256_0 += valueVec;
this.V256_1 += valueVec;
this.V256_2 += valueVec;
this.V256_3 += valueVec;
this.V256_4 += valueVec;
this.V256_5 += valueVec;
this.V256_6 += valueVec;
this.V256_7 += valueVec;
}
else
{
@ -352,6 +345,34 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
}
}
/// <summary>
/// Shifts every value up by <c>ceil(maximum / 2)</c> and clamps the result to [0, maximum].
/// </summary>
/// <param name="maximum">The maximum value to normalize to.</param>
public void NormalizeColorsInPlace(float maximum)
{
    Vector4 lower = Vector4.Zero;
    Vector4 upper = new(maximum);
    Vector4 shift = new(MathF.Ceiling(maximum * 0.5F));

    // Walk the sixteen Vector4 fields (V0L, V0R, ..., V7L, V7R) by reference, starting at V0L.
    ref Vector4 first = ref this.V0L;
    for (nuint i = 0; i < 16; i++)
    {
        ref Vector4 half = ref Unsafe.Add(ref first, i);
        half = Vector4.Clamp(half + shift, lower, upper);
    }
}
/// <summary>
/// Rounds all values in the block.
/// </summary>
@ -376,39 +397,84 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
}
/// <summary>
/// Loads values from <paramref name="source"/> using extended AVX2 intrinsics.
/// Fill the block from <paramref name="source"/> doing short -&gt; float conversion.
/// </summary>
/// <param name="source">The source <see cref="Block8x8"/></param>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
/// <param name="source">The source block</param>
public void LoadFromInt16Scalar(ref Block8x8 source)
{
DebugGuard.IsTrue(
Avx2.IsSupported,
"LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector256<ushort>.Count == 16 on AVX2
// We can process 2 block rows in a single step
Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
ref short selfRef = ref Unsafe.As<Block8x8, short>(ref source);
this.V0L.X = Unsafe.Add(ref selfRef, 0);
this.V0L.Y = Unsafe.Add(ref selfRef, 1);
this.V0L.Z = Unsafe.Add(ref selfRef, 2);
this.V0L.W = Unsafe.Add(ref selfRef, 3);
this.V0R.X = Unsafe.Add(ref selfRef, 4);
this.V0R.Y = Unsafe.Add(ref selfRef, 5);
this.V0R.Z = Unsafe.Add(ref selfRef, 6);
this.V0R.W = Unsafe.Add(ref selfRef, 7);
this.V1L.X = Unsafe.Add(ref selfRef, 8);
this.V1L.Y = Unsafe.Add(ref selfRef, 9);
this.V1L.Z = Unsafe.Add(ref selfRef, 10);
this.V1L.W = Unsafe.Add(ref selfRef, 11);
this.V1R.X = Unsafe.Add(ref selfRef, 12);
this.V1R.Y = Unsafe.Add(ref selfRef, 13);
this.V1R.Z = Unsafe.Add(ref selfRef, 14);
this.V1R.W = Unsafe.Add(ref selfRef, 15);
this.V2L.X = Unsafe.Add(ref selfRef, 16);
this.V2L.Y = Unsafe.Add(ref selfRef, 17);
this.V2L.Z = Unsafe.Add(ref selfRef, 18);
this.V2L.W = Unsafe.Add(ref selfRef, 19);
this.V2R.X = Unsafe.Add(ref selfRef, 20);
this.V2R.Y = Unsafe.Add(ref selfRef, 21);
this.V2R.Z = Unsafe.Add(ref selfRef, 22);
this.V2R.W = Unsafe.Add(ref selfRef, 23);
this.V3L.X = Unsafe.Add(ref selfRef, 24);
this.V3L.Y = Unsafe.Add(ref selfRef, 25);
this.V3L.Z = Unsafe.Add(ref selfRef, 26);
this.V3L.W = Unsafe.Add(ref selfRef, 27);
this.V3R.X = Unsafe.Add(ref selfRef, 28);
this.V3R.Y = Unsafe.Add(ref selfRef, 29);
this.V3R.Z = Unsafe.Add(ref selfRef, 30);
this.V3R.W = Unsafe.Add(ref selfRef, 31);
this.V4L.X = Unsafe.Add(ref selfRef, 32);
this.V4L.Y = Unsafe.Add(ref selfRef, 33);
this.V4L.Z = Unsafe.Add(ref selfRef, 34);
this.V4L.W = Unsafe.Add(ref selfRef, 35);
this.V4R.X = Unsafe.Add(ref selfRef, 36);
this.V4R.Y = Unsafe.Add(ref selfRef, 37);
this.V4R.Z = Unsafe.Add(ref selfRef, 38);
this.V4R.W = Unsafe.Add(ref selfRef, 39);
this.V5L.X = Unsafe.Add(ref selfRef, 40);
this.V5L.Y = Unsafe.Add(ref selfRef, 41);
this.V5L.Z = Unsafe.Add(ref selfRef, 42);
this.V5L.W = Unsafe.Add(ref selfRef, 43);
this.V5R.X = Unsafe.Add(ref selfRef, 44);
this.V5R.Y = Unsafe.Add(ref selfRef, 45);
this.V5R.Z = Unsafe.Add(ref selfRef, 46);
this.V5R.W = Unsafe.Add(ref selfRef, 47);
this.V6L.X = Unsafe.Add(ref selfRef, 48);
this.V6L.Y = Unsafe.Add(ref selfRef, 49);
this.V6L.Z = Unsafe.Add(ref selfRef, 50);
this.V6L.W = Unsafe.Add(ref selfRef, 51);
this.V6R.X = Unsafe.Add(ref selfRef, 52);
this.V6R.Y = Unsafe.Add(ref selfRef, 53);
this.V6R.Z = Unsafe.Add(ref selfRef, 54);
this.V6R.W = Unsafe.Add(ref selfRef, 55);
this.V7L.X = Unsafe.Add(ref selfRef, 56);
this.V7L.Y = Unsafe.Add(ref selfRef, 57);
this.V7L.Z = Unsafe.Add(ref selfRef, 58);
this.V7L.W = Unsafe.Add(ref selfRef, 59);
this.V7R.X = Unsafe.Add(ref selfRef, 60);
this.V7R.Y = Unsafe.Add(ref selfRef, 61);
this.V7R.Z = Unsafe.Add(ref selfRef, 62);
this.V7R.W = Unsafe.Add(ref selfRef, 63);
}
/// <summary>
@ -422,11 +488,11 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
const int equalityMask = unchecked((int)0b1111_1111_1111_1111_1111_1111_1111_1111);
Vector256<int> targetVector = Vector256.Create(value);
ref Vector256<float> blockStride = ref this.V0;
ref Vector256<float> blockStride = ref this.V256_0;
for (nuint i = 0; i < RowCount; i++)
{
Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V0, i)), targetVector);
Vector256<int> areEqual = Avx2.CompareEqual(Avx.ConvertToVector256Int32WithTruncation(Unsafe.Add(ref this.V256_0, i)), targetVector);
if (Avx2.MoveMask(areEqual.AsByte()) != equalityMask)
{
return false;
@ -577,31 +643,4 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
// row #6
RuntimeUtility.Swap(ref Unsafe.Add(ref elemRef, 55), ref Unsafe.Add(ref elemRef, 62));
}
/// <summary>
/// Adds <paramref name="off"/> to <paramref name="row"/>, limits the sum to the range from zero to
/// <paramref name="max"/>, then rounds it with <c>FastRound</c>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector<float> NormalizeAndRound(Vector<float> row, Vector<float> off, Vector<float> max)
    => Vector.Min(Vector.Max(row + off, Vector<float>.Zero), max).FastRound();
/// <summary>
/// Adds <paramref name="off"/> to <paramref name="row"/>, limits the sum to the range from zero to
/// <paramref name="max"/>, then rounds it via <c>Vector256_.RoundToNearestInteger</c>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector256<float> NormalizeAndRoundVector256(Vector256<float> row, Vector256<float> off, Vector256<float> max)
    => Vector256_.RoundToNearestInteger(
        Vector256.Min(Vector256.Max(row + off, Vector256<float>.Zero), max));
/// <summary>
/// Adds <paramref name="off"/> to <paramref name="row"/>, limits the sum to the range from zero to
/// <paramref name="max"/>, then rounds it via <c>Vector128_.RoundToNearestInteger</c>.
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<float> NormalizeAndRoundVector128(Vector128<float> row, Vector128<float> off, Vector128<float> max)
    => Vector128_.RoundToNearestInteger(
        Vector128.Min(Vector128.Max(row + off, Vector128<float>.Zero), max));
}

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector128.cs

@ -1,4 +1,4 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
@ -60,7 +60,7 @@ internal abstract partial class JpegColorConverterBase
ref Vector128<float> b = ref Unsafe.Add(ref srcBlue, i);
// luminosity = (0.299 * r) + (0.587 * g) + (0.114 * b)
Unsafe.Add(ref destLuminance, i) = Vector128Utilities.MultiplyAdd(Vector128Utilities.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
Unsafe.Add(ref destLuminance, i) = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
}
}
}

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector256.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.GrayScaleVector512.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector128.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector256.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YCbCrVector512.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector128.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector128_ = SixLabors.ImageSharp.Common.Helpers.Vector128Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

4
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector256.cs

@ -1,10 +1,10 @@
// Copyright (c) Six Labors.
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector256_ = SixLabors.ImageSharp.Common.Helpers.Vector256Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

2
src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs

@ -4,7 +4,7 @@
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using Vector512_ = SixLabors.ImageSharp.Common.Helpers.Vector512Utilities;
using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Jpeg.Components;

64
src/ImageSharp/Formats/Jpeg/Components/FloatingPointDCT.Intrinsic.cs

@ -26,14 +26,14 @@ internal static partial class FloatingPointDCT
// Applies 1D floating point FDCT inplace
static void FDCT8x8_1D_Avx(ref Block8x8F block)
{
Vector256<float> tmp0 = Avx.Add(block.V0, block.V7);
Vector256<float> tmp7 = Avx.Subtract(block.V0, block.V7);
Vector256<float> tmp1 = Avx.Add(block.V1, block.V6);
Vector256<float> tmp6 = Avx.Subtract(block.V1, block.V6);
Vector256<float> tmp2 = Avx.Add(block.V2, block.V5);
Vector256<float> tmp5 = Avx.Subtract(block.V2, block.V5);
Vector256<float> tmp3 = Avx.Add(block.V3, block.V4);
Vector256<float> tmp4 = Avx.Subtract(block.V3, block.V4);
Vector256<float> tmp0 = Avx.Add(block.V256_0, block.V256_7);
Vector256<float> tmp7 = Avx.Subtract(block.V256_0, block.V256_7);
Vector256<float> tmp1 = Avx.Add(block.V256_1, block.V256_6);
Vector256<float> tmp6 = Avx.Subtract(block.V256_1, block.V256_6);
Vector256<float> tmp2 = Avx.Add(block.V256_2, block.V256_5);
Vector256<float> tmp5 = Avx.Subtract(block.V256_2, block.V256_5);
Vector256<float> tmp3 = Avx.Add(block.V256_3, block.V256_4);
Vector256<float> tmp4 = Avx.Subtract(block.V256_3, block.V256_4);
// Even part
Vector256<float> tmp10 = Avx.Add(tmp0, tmp3);
@ -41,13 +41,13 @@ internal static partial class FloatingPointDCT
Vector256<float> tmp11 = Avx.Add(tmp1, tmp2);
Vector256<float> tmp12 = Avx.Subtract(tmp1, tmp2);
block.V0 = Avx.Add(tmp10, tmp11);
block.V4 = Avx.Subtract(tmp10, tmp11);
block.V256_0 = Avx.Add(tmp10, tmp11);
block.V256_4 = Avx.Subtract(tmp10, tmp11);
var mm256_F_0_7071 = Vector256.Create(0.707106781f);
Vector256<float> z1 = Avx.Multiply(Avx.Add(tmp12, tmp13), mm256_F_0_7071);
block.V2 = Avx.Add(tmp13, z1);
block.V6 = Avx.Subtract(tmp13, z1);
block.V256_2 = Avx.Add(tmp13, z1);
block.V256_6 = Avx.Subtract(tmp13, z1);
// Odd part
tmp10 = Avx.Add(tmp4, tmp5);
@ -62,10 +62,10 @@ internal static partial class FloatingPointDCT
Vector256<float> z11 = Avx.Add(tmp7, z3);
Vector256<float> z13 = Avx.Subtract(tmp7, z3);
block.V5 = Avx.Add(z13, z2);
block.V3 = Avx.Subtract(z13, z2);
block.V1 = Avx.Add(z11, z4);
block.V7 = Avx.Subtract(z11, z4);
block.V256_5 = Avx.Add(z13, z2);
block.V256_3 = Avx.Subtract(z13, z2);
block.V256_1 = Avx.Add(z11, z4);
block.V256_7 = Avx.Subtract(z11, z4);
}
}
@ -88,10 +88,10 @@ internal static partial class FloatingPointDCT
static void IDCT8x8_1D_Avx(ref Block8x8F block)
{
// Even part
Vector256<float> tmp0 = block.V0;
Vector256<float> tmp1 = block.V2;
Vector256<float> tmp2 = block.V4;
Vector256<float> tmp3 = block.V6;
Vector256<float> tmp0 = block.V256_0;
Vector256<float> tmp1 = block.V256_2;
Vector256<float> tmp2 = block.V256_4;
Vector256<float> tmp3 = block.V256_6;
Vector256<float> z5 = tmp0;
Vector256<float> tmp10 = Avx.Add(z5, tmp2);
@ -107,10 +107,10 @@ internal static partial class FloatingPointDCT
tmp2 = Avx.Subtract(tmp11, tmp12);
// Odd part
Vector256<float> tmp4 = block.V1;
Vector256<float> tmp5 = block.V3;
Vector256<float> tmp6 = block.V5;
Vector256<float> tmp7 = block.V7;
Vector256<float> tmp4 = block.V256_1;
Vector256<float> tmp5 = block.V256_3;
Vector256<float> tmp6 = block.V256_5;
Vector256<float> tmp7 = block.V256_7;
Vector256<float> z13 = Avx.Add(tmp6, tmp5);
Vector256<float> z10 = Avx.Subtract(tmp6, tmp5);
@ -129,14 +129,14 @@ internal static partial class FloatingPointDCT
tmp5 = Avx.Subtract(tmp11, tmp6);
tmp4 = Avx.Subtract(tmp10, tmp5);
block.V0 = Avx.Add(tmp0, tmp7);
block.V7 = Avx.Subtract(tmp0, tmp7);
block.V1 = Avx.Add(tmp1, tmp6);
block.V6 = Avx.Subtract(tmp1, tmp6);
block.V2 = Avx.Add(tmp2, tmp5);
block.V5 = Avx.Subtract(tmp2, tmp5);
block.V3 = Avx.Add(tmp3, tmp4);
block.V4 = Avx.Subtract(tmp3, tmp4);
block.V256_0 = Avx.Add(tmp0, tmp7);
block.V256_7 = Avx.Subtract(tmp0, tmp7);
block.V256_1 = Avx.Add(tmp1, tmp6);
block.V256_6 = Avx.Subtract(tmp1, tmp6);
block.V256_2 = Avx.Add(tmp2, tmp5);
block.V256_5 = Avx.Subtract(tmp2, tmp5);
block.V256_3 = Avx.Add(tmp3, tmp4);
block.V256_4 = Avx.Subtract(tmp3, tmp4);
}
}
}

6
src/ImageSharp/Formats/Webp/AlphaDecoder.cs

@ -326,11 +326,11 @@ internal class AlphaDecoder : IDisposable
{
Vector128<long> a0 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref srcRef, i)), 0);
Vector128<byte> a1 = a0.AsByte() + last.AsByte();
Vector128<byte> a2 = Vector128Utilities.ShiftLeftBytesInVector(a1, 1);
Vector128<byte> a2 = Vector128_.ShiftLeftBytesInVector(a1, 1);
Vector128<byte> a3 = a1 + a2;
Vector128<byte> a4 = Vector128Utilities.ShiftLeftBytesInVector(a3, 2);
Vector128<byte> a4 = Vector128_.ShiftLeftBytesInVector(a3, 2);
Vector128<byte> a5 = a3 + a4;
Vector128<byte> a6 = Vector128Utilities.ShiftLeftBytesInVector(a5, 4);
Vector128<byte> a6 = Vector128_.ShiftLeftBytesInVector(a5, 4);
Vector128<byte> a7 = a5 + a6;
ref byte outputRef = ref Unsafe.Add(ref dstRef, i);

11
tests/ImageSharp.Benchmarks/Codecs/Jpeg/DecodeJpeg.cs

@ -8,6 +8,7 @@ using SixLabors.ImageSharp.Tests;
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg;
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class DecodeJpeg
{
private JpegDecoder decoder;
@ -21,7 +22,7 @@ public class DecodeJpeg
this.preloadedImageStream = new MemoryStream(bytes);
}
private void GenericBechmark()
private void GenericBenchmark()
{
this.preloadedImageStream.Position = 0;
using Image img = this.decoder.Decode(DecoderOptions.Default, this.preloadedImageStream);
@ -51,16 +52,16 @@ public class DecodeJpeg
}
[Benchmark(Description = "Baseline 4:4:4 Interleaved")]
public void JpegBaselineInterleaved444() => this.GenericBechmark();
public void JpegBaselineInterleaved444() => this.GenericBenchmark();
[Benchmark(Description = "Baseline 4:2:0 Interleaved")]
public void JpegBaselineInterleaved420() => this.GenericBechmark();
public void JpegBaselineInterleaved420() => this.GenericBenchmark();
[Benchmark(Description = "Baseline 4:0:0 (grayscale)")]
public void JpegBaseline400() => this.GenericBechmark();
public void JpegBaseline400() => this.GenericBenchmark();
[Benchmark(Description = "Progressive 4:2:0 Non-Interleaved")]
public void JpegProgressiveNonInterleaved420() => this.GenericBechmark();
public void JpegProgressiveNonInterleaved420() => this.GenericBenchmark();
}
/*

33
tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs

@ -34,6 +34,7 @@ public partial class Config
// like `LZCNT`, `BMI1`, or `BMI2`
// `EnableSSE3_4` is a legacy switch that exists for compat and is basically the same as `EnableSSE3`
private const string EnableAES = "DOTNET_EnableAES";
private const string EnableAVX512F = "DOTNET_EnableAVX512F";
private const string EnableAVX = "DOTNET_EnableAVX";
private const string EnableAVX2 = "DOTNET_EnableAVX2";
private const string EnableBMI1 = "DOTNET_EnableBMI1";
@ -76,4 +77,36 @@ public partial class Config
}
}
}
/// <summary>
/// Benchmark configuration that registers up to four jobs so the same benchmark
/// can be compared with hardware intrinsics off, and then with the SSE, AVX and
/// AVX-512F paths enabled in turn. Jobs for an instruction set are only added
/// when the executing machine reports support for it.
/// </summary>
public class HwIntrinsics_SSE_AVX_AVX512F : Config
{
    /// <summary>
    /// Initializes a new instance of the <see cref="HwIntrinsics_SSE_AVX_AVX512F"/> class
    /// and registers the jobs. The numeric prefix in each job id gives every job
    /// a distinct ordinal (1-4).
    /// </summary>
    public HwIntrinsics_SSE_AVX_AVX512F()
    {
        // Baseline: both the hardware intrinsic and the SIMD feature switches are set to Off.
        this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
            .WithEnvironmentVariables(
                new EnvironmentVariable(EnableHWIntrinsic, Off),
                new EnvironmentVariable(FeatureSIMD, Off))
            .WithId("1. No HwIntrinsics").AsBaseline());

        if (Sse.IsSupported)
        {
            // AVX is switched off for this job.
            // NOTE(review): presumably this also disables the wider ISAs that build on AVX - confirm.
            this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
                .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off))
                .WithId("2. SSE"));
        }

        if (Avx.IsSupported)
        {
            // Only AVX-512F is switched off for this job.
            this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
                .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX512F, Off))
                .WithId("3. AVX"));
        }

        if (Avx512F.IsSupported)
        {
            // No overrides: the job runs with the runtime's default intrinsic settings.
            // The id uses ordinal 4; it previously reused "3." and collided with the AVX job's prefix.
            this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core80)
                .WithId("4. AVX512F"));
        }
    }
}
}

Loading…
Cancel
Save