diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index b8986f66f..8fa4ab7a1 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -32,15 +32,48 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static int Vp8_Sse4X4(Span a, Span b) { #if SUPPORTS_RUNTIME_INTRINSICS + if (Avx2.IsSupported) + { + // Load values. + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); + var a0 = Vector256.Create( + Unsafe.As>(ref aRef), + Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps))); + var a1 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2)), + Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3))); + var b0 = Vector256.Create( + Unsafe.As>(ref bRef), + Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps))); + var b1 = Vector256.Create( + Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2)), + Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3))); + + // Combine pair of lines. + Vector256 a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32()); + Vector256 b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32()); + + // Convert to 16b. + Vector256 a01s = Avx2.UnpackLow(a01.AsByte(), Vector256.Zero); + Vector256 b01s = Avx2.UnpackLow(b01.AsByte(), Vector256.Zero); + + // subtract, square and accumulate. + Vector256 d0 = Avx2.SubtractSaturate(a01s, b01s); + Vector256 e0 = Avx2.MultiplyAddAdjacent(d0.AsInt16(), d0.AsInt16()); + + return Numerics.ReduceSum(e0); + } + if (Sse2.IsSupported) { // Load values. ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); Vector128 a0 = Unsafe.As>(ref aRef); Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps)); Vector128 a2 = Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 2)); Vector128 a3 = Unsafe.As>(ref Unsafe.Add(ref aRef, WebpConstants.Bps * 3)); - ref byte bRef = ref MemoryMarshal.GetReference(b); Vector128 b0 = Unsafe.As>(ref bRef); Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps)); Vector128 b2 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 2)); @@ -67,7 +100,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return Numerics.ReduceSum(sum); } - else #endif { return Vp8_SseNxN(a, b, 4, 4); diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs index fcd61f2c0..eb8d92110 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8EncIterator.cs @@ -358,7 +358,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int kThreshold = 8 + ((17 - 8) * q / 100); int k; Span dc = stackalloc uint[16]; - Span tmp = stackalloc ushort[16]; uint m; uint m2; for (k = 0; k < 16; k += 4) diff --git a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs index 907b18300..defd5af6c 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/LossyUtilsTests.cs @@ -104,7 +104,10 @@ namespace SixLabors.ImageSharp.Tests.Formats.Webp public void Vp8Sse4X4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.AllowAll); [Fact] - public void Vp8Sse4X4_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableHWIntrinsic); + public void Vp8Sse4X4_WithoutSSE2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableSSE2); + + [Fact] + public void Vp8Sse4X4_WithoutAVX2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunVp8Sse4X4Test, HwIntrinsics.DisableAVX2); [Fact] public void Mean16x4_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunMean16x4Test, HwIntrinsics.AllowAll);