Browse Source

Merge pull request #1881 from SixLabors/bp/Vp8_Sse16xN

Add SSE2 and AVX2 versions of Vp8_Sse16X16 and Vp8_Sse16X8
pull/1892/head
Brian Popow 4 years ago
committed by GitHub
parent
commit
bf7362c983
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
  1. 134
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
  2. 142
      tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

134
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -40,11 +40,43 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
// Note: method name in libwebp reference implementation is called VP8SSE16x16.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 16);
public static int Vp8_Sse16X16(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported)
{
return Vp8_Sse16xN_Avx2(a, b, 4);
}
if (Sse2.IsSupported)
{
return Vp8_Sse16xN_Sse2(a, b, 8);
}
#endif
{
return Vp8_SseNxN(a, b, 16, 16);
}
}
// Note: method name in libwebp reference implementation is called VP8SSE16x8.
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b) => Vp8_SseNxN(a, b, 16, 8);
public static int Vp8_Sse16X8(Span<byte> a, Span<byte> b)
{
#if SUPPORTS_RUNTIME_INTRINSICS
if (Avx2.IsSupported)
{
return Vp8_Sse16xN_Avx2(a, b, 2);
}
if (Sse2.IsSupported)
{
return Vp8_Sse16xN_Sse2(a, b, 4);
}
#endif
{
return Vp8_SseNxN(a, b, 16, 8);
}
}
// Note: method name in libwebp reference implementation is called VP8SSE4x4.
[MethodImpl(InliningOptions.ShortMethod)]
@ -146,6 +178,104 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
return count;
}
#if SUPPORTS_RUNTIME_INTRINSICS
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse16xN_Sse2(Span<byte> a, Span<byte> b, int numPairs)
{
Vector128<int> sum = Vector128<int>.Zero;
nint offset = 0;
ref byte aRef = ref MemoryMarshal.GetReference(a);
ref byte bRef = ref MemoryMarshal.GetReference(b);
for (int i = 0; i < numPairs; i++)
{
// Load values.
Vector128<byte> a0 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset));
Vector128<byte> b0 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset));
Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps));
Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps));
Vector128<int> sum1 = SubtractAndAccumulate(a0, b0);
Vector128<int> sum2 = SubtractAndAccumulate(a1, b1);
sum = Sse2.Add(sum, Sse2.Add(sum1, sum2));
offset += 2 * WebpConstants.Bps;
}
return Numerics.ReduceSum(sum);
}
[MethodImpl(InliningOptions.ShortMethod)]
private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
{
Vector256<int> sum = Vector256<int>.Zero;
nint offset = 0;
ref byte aRef = ref MemoryMarshal.GetReference(a);
ref byte bRef = ref MemoryMarshal.GetReference(b);
for (int i = 0; i < numPairs; i++)
{
// Load values.
var a0 = Vector256.Create(
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset)),
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)));
var b0 = Vector256.Create(
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset)),
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps)));
var a1 = Vector256.Create(
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + (2 * WebpConstants.Bps))),
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + (3 * WebpConstants.Bps))));
var b1 = Vector256.Create(
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))),
Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps))));
Vector256<int> sum1 = SubtractAndAccumulate(a0, b0);
Vector256<int> sum2 = SubtractAndAccumulate(a1, b1);
sum = Avx2.Add(sum, Avx2.Add(sum1, sum2));
offset += 4 * WebpConstants.Bps;
}
return Numerics.ReduceSum(sum);
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
{
// Take abs(a-b) in 8b.
Vector128<byte> ab = Sse2.SubtractSaturate(a, b);
Vector128<byte> ba = Sse2.SubtractSaturate(b, a);
Vector128<byte> absAb = Sse2.Or(ab, ba);
// Zero-extend to 16b.
Vector128<byte> c0 = Sse2.UnpackLow(absAb, Vector128<byte>.Zero);
Vector128<byte> c1 = Sse2.UnpackHigh(absAb, Vector128<byte>.Zero);
// Multiply with self.
Vector128<int> sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
Vector128<int> sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
return Sse2.Add(sum1, sum2);
}
[MethodImpl(InliningOptions.ShortMethod)]
private static Vector256<int> SubtractAndAccumulate(Vector256<byte> a, Vector256<byte> b)
{
// Take abs(a-b) in 8b.
Vector256<byte> ab = Avx2.SubtractSaturate(a, b);
Vector256<byte> ba = Avx2.SubtractSaturate(b, a);
Vector256<byte> absAb = Avx2.Or(ab, ba);
// Zero-extend to 16b.
Vector256<byte> c0 = Avx2.UnpackLow(absAb, Vector256<byte>.Zero);
Vector256<byte> c1 = Avx2.UnpackHigh(absAb, Vector256<byte>.Zero);
// Multiply with self.
Vector256<int> sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
Vector256<int> sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
return Avx2.Add(sum1, sum2);
}
#endif
[MethodImpl(InliningOptions.ShortMethod)]
public static void Vp8Copy4X4(Span<byte> src, Span<byte> dst) => Copy(src, dst, 4, 4);

142
tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs

@ -76,6 +76,124 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
Assert.True(expected.SequenceEqual(dst));
}
private static void RunVp8Sse16X16Test()
{
// arrange
byte[] a =
{
154, 154, 151, 151, 149, 148, 151, 157, 163, 163, 154, 132, 102, 98, 104, 108, 107, 104, 104, 103,
101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150, 147, 147, 146, 159,
164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117, 172, 172, 172, 168,
170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126, 93, 90, 102, 107,
104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175, 150, 149, 152, 151,
148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100, 102, 102, 121, 117,
170, 170, 169, 171, 171, 179, 173, 175, 149, 151, 152, 151, 148, 154, 162, 157, 154, 154, 151, 132,
92, 89, 101, 108, 104, 102, 101, 101, 103, 103, 123, 118, 171, 168, 177, 173, 171, 178, 172, 176,
152, 152, 152, 151, 154, 162, 161, 155, 149, 157, 156, 129, 92, 87, 101, 107, 102, 100, 107, 100,
101, 102, 123, 118, 170, 175, 182, 172, 171, 179, 173, 175, 152, 151, 154, 155, 160, 162, 161, 153,
150, 156, 153, 129, 92, 91, 102, 106, 100, 109, 115, 99, 101, 102, 124, 120, 171, 179, 178, 172,
171, 181, 171, 173, 154, 154, 154, 162, 160, 158, 156, 152, 153, 157, 151, 128, 86, 86, 102, 105,
102, 122, 114, 99, 101, 102, 125, 120, 178, 173, 177, 172, 171, 180, 172, 173, 154, 152, 158, 163,
150, 148, 148, 156, 151, 158, 152, 129, 87, 87, 101, 105, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 154, 151, 165, 156, 141, 137, 146, 158, 152, 159, 152, 133,
90, 88, 99, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
154, 160, 164, 150, 126, 127, 149, 159, 155, 161, 153, 131, 84, 86, 97, 103, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 157, 167, 157, 137, 102, 128, 155, 161,
157, 159, 154, 134, 84, 82, 97, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 163, 163, 150, 113, 78, 132, 156, 162, 159, 160, 154, 132, 83, 78, 91, 97, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 163, 157, 137, 80, 78,
131, 154, 163, 157, 159, 149, 131, 82, 77, 94, 100, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 159, 151, 108, 72, 88, 132, 156, 162, 159, 157, 151, 130, 79, 78,
95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 151, 130,
82, 82, 89, 134, 154, 161, 161, 157, 152, 129, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204
};
byte[] b =
{
150, 150, 150, 150, 146, 149, 152, 154, 164, 166, 154, 132, 99, 92, 106, 112, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 150, 150, 150, 150, 146, 149, 152, 154,
161, 164, 151, 130, 93, 86, 100, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 150, 150, 150, 150, 146, 149, 152, 154, 158, 161, 148, 127, 93, 86, 100, 106,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 150, 150, 150, 150,
146, 149, 152, 154, 156, 159, 146, 125, 99, 92, 106, 112, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 148, 148, 148, 148, 149, 158, 162, 159, 155, 155, 153, 129,
94, 87, 101, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
151, 151, 151, 151, 152, 159, 161, 156, 155, 155, 153, 129, 94, 87, 101, 106, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 154, 154, 154, 154, 156, 161, 159, 152,
155, 155, 153, 129, 94, 87, 101, 106, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 156, 156, 156, 156, 159, 162, 158, 149, 155, 155, 153, 129, 94, 87, 101, 106,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 152, 153, 157, 162,
150, 149, 149, 151, 155, 160, 150, 131, 91, 90, 104, 104, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 152, 156, 158, 157, 140, 137, 145, 159, 155, 160, 150, 131,
89, 88, 102, 101, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
153, 161, 160, 149, 118, 128, 147, 162, 155, 160, 150, 131, 86, 85, 99, 98, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 154, 165, 161, 144, 96, 128, 154, 159, 155,
160, 150, 131, 83, 82, 97, 96, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 161, 160, 149, 105, 78, 127, 156, 170, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 160, 160, 133, 85, 81, 129, 155,
167, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 156, 147, 109, 76, 85, 130, 153, 163, 156, 156, 154, 130, 81, 77, 95, 102,
204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 204, 152, 128, 87, 83,
88, 132, 152, 159, 156, 156, 154, 130, 81, 77, 95, 102, 204, 204, 204, 204, 204, 204, 204, 204, 204,
204, 204, 204, 204, 204, 204, 204
};
int expected = 2063;
// act
int actual = LossyUtils.Vp8_Sse16X16(a, b);
// assert
Assert.Equal(expected, actual);
}
private static void RunVp8Sse16X8Test()
{
// arrange
byte[] a =
{
107, 104, 104, 103, 101, 106, 123, 119, 170, 171, 172, 171, 168, 175, 171, 173, 151, 151, 149, 150,
147, 147, 146, 159, 164, 165, 154, 129, 92, 90, 101, 105, 104, 103, 104, 101, 100, 105, 123, 117,
172, 172, 172, 168, 170, 177, 170, 175, 151, 149, 150, 150, 147, 147, 156, 161, 161, 161, 151, 126,
93, 90, 102, 107, 104, 103, 104, 101, 104, 104, 122, 117, 172, 172, 170, 168, 170, 177, 172, 175,
150, 149, 152, 151, 148, 151, 160, 159, 157, 157, 148, 133, 96, 90, 103, 107, 104, 104, 101, 100,
102, 102, 121, 117, 170, 170, 169, 171, 171, 179, 173, 175, 149, 151, 152, 151, 148, 154, 162, 157,
154, 154, 151, 132, 92, 89, 101, 108, 104, 102, 101, 101, 103, 103, 123, 118, 171, 168, 177, 173,
171, 178, 172, 176, 152, 152, 152, 151, 154, 162, 161, 155, 149, 157, 156, 129, 92, 87, 101, 107,
102, 100, 107, 100, 101, 102, 123, 118, 170, 175, 182, 172, 171, 179, 173, 175, 152, 151, 154, 155,
160, 162, 161, 153, 150, 156, 153, 129, 92, 91, 102, 106, 100, 109, 115, 99, 101, 102, 124, 120,
171, 179, 178, 172, 171, 181, 171, 173, 154, 154, 154, 162, 160, 158, 156, 152, 153, 157, 151, 128,
86, 86, 102, 105, 102, 122, 114, 99, 101, 102, 125, 120, 178, 173, 177, 172, 171, 180, 172, 173,
154, 152, 158, 163, 150, 148, 148, 156, 151, 158, 152, 129, 87, 87, 101, 105
};
byte[] b =
{
103, 103, 103, 103, 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, 150, 150, 150, 150,
146, 149, 152, 154, 161, 164, 151, 130, 93, 86, 100, 106, 103, 103, 103, 103, 101, 106, 122, 114,
171, 171, 171, 171, 171, 177, 169, 175, 150, 150, 150, 150, 146, 149, 152, 154, 158, 161, 148, 127,
93, 86, 100, 106, 103, 103, 103, 103, 101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175,
150, 150, 150, 150, 146, 149, 152, 154, 156, 159, 146, 125, 99, 92, 106, 112, 103, 103, 103, 103,
101, 106, 122, 114, 171, 171, 171, 171, 171, 177, 169, 175, 148, 148, 148, 148, 149, 158, 162, 159,
155, 155, 153, 129, 94, 87, 101, 106, 102, 100, 100, 102, 100, 101, 120, 122, 170, 176, 176, 170,
174, 180, 171, 177, 151, 151, 151, 151, 152, 159, 161, 156, 155, 155, 153, 129, 94, 87, 101, 106,
102, 105, 105, 102, 100, 101, 120, 122, 170, 176, 176, 170, 174, 180, 171, 177, 154, 154, 154, 154,
156, 161, 159, 152, 155, 155, 153, 129, 94, 87, 101, 106, 102, 112, 112, 102, 100, 101, 120, 122,
170, 176, 176, 170, 174, 180, 171, 177, 156, 156, 156, 156, 159, 162, 158, 149, 155, 155, 153, 129,
94, 87, 101, 106, 102, 117, 117, 102, 100, 101, 120, 122, 170, 176, 176, 170, 174, 180, 171, 177,
152, 153, 157, 162, 150, 149, 149, 151, 155, 160, 150, 131, 91, 90, 104, 104
};
int expected = 749;
// act
int actual = LossyUtils.Vp8_Sse16X8(a, b);
// assert
Assert.Equal(expected, actual);
}
private static void RunVp8Sse4X4Test()
{
// arrange
@ -168,6 +286,12 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
[Fact]
public void RunTransformOne_Works() => RunTransformOneTest();
[Fact]
public void Vp8Sse16X16_Works() => RunVp8Sse16X16Test();
[Fact]
public void Vp8Sse16X8_Works() => RunVp8Sse16X8Test();
[Fact]
public void Vp8Sse4X4_Works() => RunVp8Sse4X4Test();
@ -190,6 +314,24 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp
[Fact]
public void TransformOne_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTransformOneTest, HwIntrinsics.DisableHWIntrinsic);
[Fact]
public void Vp8Sse16X16_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.AllowAll);
[Fact]
public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2);
[Fact]
public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2);
[Fact]
public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll);
[Fact]
public void Vp8Sse16X8_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2);
[Fact]
public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2);
[Fact]
public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);

Loading…
Cancel
Save