From 1f6a503b9b3e57480cb73ae9f1218143228802bd Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Wed, 8 Dec 2021 13:05:47 +0100 Subject: [PATCH] Add SSE2 version of Vp8_Sse16X16 and Vp8_Sse16X8 --- .../Formats/Webp/Lossy/LossyUtils.cs | 72 ++++++++++++++++++- 1 file changed, 70 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index af5f136fa9..69ce6c7f98 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -40,11 +40,33 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Note: method name in libwebp reference implementation is called VP8SSE16x16. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse16X16(Span a, Span b) => Vp8_SseNxN(a, b, 16, 16); + public static int Vp8_Sse16X16(Span a, Span b) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + return Vp8_Sse16xN_Sse2(a, b, 8); + } +#endif + { + return Vp8_SseNxN(a, b, 16, 16); + } + } // Note: method name in libwebp reference implementation is called VP8SSE16x8. [MethodImpl(InliningOptions.ShortMethod)] - public static int Vp8_Sse16X8(Span a, Span b) => Vp8_SseNxN(a, b, 16, 8); + public static int Vp8_Sse16X8(Span a, Span b) + { +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { + return Vp8_Sse16xN_Sse2(a, b, 4); + } +#endif + { + return Vp8_SseNxN(a, b, 16, 8); + } + } // Note: method name in libwebp reference implementation is called VP8SSE4x4. [MethodImpl(InliningOptions.ShortMethod)] @@ -146,6 +168,52 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy return count; } +#if SUPPORTS_RUNTIME_INTRINSICS + [MethodImpl(InliningOptions.ShortMethod)] + private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs) + { + Vector128 sum = Vector128.Zero; + int offset = 0; + for (int i = 0; i < numPairs; i++) + { + // Load values. + ref byte aRef = ref MemoryMarshal.GetReference(a); + ref byte bRef = ref MemoryMarshal.GetReference(b); + Vector128 a0 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset)); + Vector128 b0 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset)); + Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps)); + Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps)); + + Vector128 sum1 = SubtractAndAccumulate(a0, b0); + Vector128 sum2 = SubtractAndAccumulate(a1, b1); + sum = Sse2.Add(sum, Sse2.Add(sum1, sum2)); + + offset += 2 * WebpConstants.Bps; + } + + return Numerics.ReduceSum(sum); + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b) + { + // Take abs(a-b) in 8b. + Vector128 ab = Sse2.SubtractSaturate(a, b); + Vector128 ba = Sse2.SubtractSaturate(b, a); + Vector128 absAb = Sse2.Or(ab, ba); + + // Zero-extend to 16b. + Vector128 c0 = Sse2.UnpackLow(absAb, Vector128.Zero); + Vector128 c1 = Sse2.UnpackHigh(absAb, Vector128.Zero); + + // Multiply with self. + Vector128 sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16()); + Vector128 sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16()); + + return Sse2.Add(sum1, sum2); + } +#endif + [MethodImpl(InliningOptions.ShortMethod)] public static void Vp8Copy4X4(Span src, Span dst) => Copy(src, dst, 4, 4);