diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 69ce6c7f98..abaf720892 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -43,6 +43,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static int Vp8_Sse16X16(Span a, Span b) { #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + return Vp8_Sse16xN_Avx2(a, b, 4); + } + if (Sse2.IsSupported) { return Vp8_Sse16xN_Sse2(a, b, 8); @@ -58,6 +63,11 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static int Vp8_Sse16X8(Span a, Span b) { #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + return Vp8_Sse16xN_Avx2(a, b, 2); + } + if (Sse2.IsSupported) { return Vp8_Sse16xN_Sse2(a, b, 4); @@ -194,6 +204,39 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return Numerics.ReduceSum(sum); } + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs) + { + Vector256 sum = Vector256.Zero; + int offset = 0; + for (int i = 0; i < numPairs; i++) + { + // Load values. + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); + var a0 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref aRef, offset)), + Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps))); + var b0 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref bRef, offset)), + Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps))); + var a1 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (2 * WebpConstants.Bps))), + Unsafe.As>(ref Unsafe.Add(ref aRef, offset + (3 * WebpConstants.Bps)))); + var b1 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))), + Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps)))); + + Vector256 sum1 = SubtractAndAccumulate(a0, b0); + Vector256 sum2 = SubtractAndAccumulate(a1, b1); + sum = Avx2.Add(sum, Avx2.Add(sum1, sum2)); + + offset += 4 * WebpConstants.Bps; + } + + return Numerics.ReduceSum(sum); + } + [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) { @@ -212,6 +255,25 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return Sse2.Add(sum1, sum2); } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector256 SubtractAndAccumulate(Vector256 a, Vector256 b) + { + // Take abs(a-b) in 8b. + Vector256 ab = Avx2.SubtractSaturate(a, b); + Vector256 ba = Avx2.SubtractSaturate(b, a); + Vector256 absAb = Avx2.Or(ab, ba); + + // Zero-extend to 16b. + Vector256 c0 = Avx2.UnpackLow(absAb, Vector256.Zero); + Vector256 c1 = Avx2.UnpackHigh(absAb, Vector256.Zero); + + // Multiply with self. + Vector256 sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector256 sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + + return Avx2.Add(sum1, sum2); + } #endif [MethodImpl(InliningOptions.ShortMethod)] diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 9625008ca8..cc5f1b4c27 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -320,12 +320,18 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp [Fact] public void Vp8Sse16X16_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableSSE2); + [Fact] + public void Vp8Sse16X16_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X16Test, HwIntrinsics.DisableAVX2); + [Fact] public void Vp8Sse16X8_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.AllowAll); [Fact] public void Vp8Sse16X8_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableSSE2); + [Fact] + public void Vp8Sse16X8_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse16X8Test, HwIntrinsics.DisableAVX2); + [Fact] public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll);