diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index d5db3dffa5..82e2214701 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -4,11 +4,16 @@
 using System;
 using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Numerics;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Formats.Webp.Lossy
 {
-    internal static class LossyUtils
+    internal static unsafe class LossyUtils
     {
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int Vp8Sse16X16(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 16);
@@ -17,7 +22,67 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
         public static int Vp8Sse16X8(Span<byte> a, Span<byte> b) => GetSse(a, b, 16, 8);
 
+        /// <summary>
+        /// Computes the sum of squared differences between two 4x4 blocks of 8-bit samples.
+        /// </summary>
+        /// <param name="a">The first block. The vectorized path reads its rows <see cref="WebpConstants.Bps"/> bytes apart.</param>
+        /// <param name="b">The second block, read with the same row stride as <paramref name="a"/>.</param>
+        /// <returns>The sum of the squared per-sample differences.</returns>
         [MethodImpl(InliningOptions.ShortMethod)]
-        public static int Vp8Sse4X4(Span<byte> a, Span<byte> b) => GetSse(a, b, 4, 4);
+        public static int Vp8Sse4X4(Span<byte> a, Span<byte> b)
+        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            // The vector path reads four bytes from each of four rows spaced Bps bytes apart.
+            // Inputs too short for that are handed to GetSse instead of being read through raw pointers.
+            int requiredLength = (WebpConstants.Bps * 3) + 4;
+            if (Sse2.IsSupported && a.Length >= requiredLength && b.Length >= requiredLength)
+            {
+#pragma warning disable SA1503 // Braces should not be omitted
+                fixed (byte* aPtr = a)
+                fixed (byte* bPtr = b)
+                {
+                    // Load exactly one row (four pixels) per vector; the remaining lanes are zero.
+                    Vector128<int> a0 = Sse2.LoadScalarVector128((int*)aPtr);
+                    Vector128<int> a1 = Sse2.LoadScalarVector128((int*)(aPtr + WebpConstants.Bps));
+                    Vector128<int> a2 = Sse2.LoadScalarVector128((int*)(aPtr + (WebpConstants.Bps * 2)));
+                    Vector128<int> a3 = Sse2.LoadScalarVector128((int*)(aPtr + (WebpConstants.Bps * 3)));
+                    Vector128<int> b0 = Sse2.LoadScalarVector128((int*)bPtr);
+                    Vector128<int> b1 = Sse2.LoadScalarVector128((int*)(bPtr + WebpConstants.Bps));
+                    Vector128<int> b2 = Sse2.LoadScalarVector128((int*)(bPtr + (WebpConstants.Bps * 2)));
+                    Vector128<int> b3 = Sse2.LoadScalarVector128((int*)(bPtr + (WebpConstants.Bps * 3)));
+
+                    // Combine pair of lines.
+                    Vector128<int> a01 = Sse2.UnpackLow(a0, a1);
+                    Vector128<int> a23 = Sse2.UnpackLow(a2, a3);
+                    Vector128<int> b01 = Sse2.UnpackLow(b0, b1);
+                    Vector128<int> b23 = Sse2.UnpackLow(b2, b3);
+
+                    // Convert to 16b.
+                    Vector128<short> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero).AsInt16();
+                    Vector128<short> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero).AsInt16();
+                    Vector128<short> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero).AsInt16();
+                    Vector128<short> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero).AsInt16();
+
+                    // Subtract as signed 16-bit lanes so that negative differences are kept
+                    // (an unsigned 8-bit saturating subtract would clamp them to zero), then square and accumulate.
+                    Vector128<short> d0 = Sse2.SubtractSaturate(a01s, b01s);
+                    Vector128<short> d1 = Sse2.SubtractSaturate(a23s, b23s);
+                    Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
+                    Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
+                    Vector128<int> sum = Sse2.Add(e0, e1);
+
+                    // Fold the four 32-bit partial sums into lane 0.
+                    sum = Sse2.Add(sum, Sse2.Shuffle(sum, 0x4E)); // 0x4E swaps the two 64-bit halves.
+                    sum = Sse2.Add(sum, Sse2.Shuffle(sum, 0xB1)); // 0xB1 swaps adjacent 32-bit lanes.
+                    return Sse2.ConvertToInt32(sum);
+                }
+#pragma warning restore SA1503 // Braces should not be omitted
+            }
+            else
+#endif
+            {
+                return GetSse(a, b, 4, 4);
+            }
+        }
 
         [MethodImpl(InliningOptions.ShortMethod)]
         public static int GetSse(Span<byte> a, Span<byte> b, int w, int h)