From 798e9c3ad6e77e3bda0770a16e2e283c7bc45ff1 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 26 Nov 2021 14:09:31 +0100 Subject: [PATCH] Add SSE2 version of FTransform2 --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 64 ++++++++++++++++++- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index a3a9c924c..f657d3252 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -404,8 +404,66 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { - FTransform(src, reference, output, scratch); - FTransform(src.Slice(4), reference.Slice(4), output2, scratch); +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) + { +#pragma warning disable SA1503 // Braces should not be omitted + fixed (byte* srcRef = src) + fixed (byte* referenceRef = reference) + { + // Load src. + Vector128 src0 = Sse2.LoadScalarVector128((ulong*)srcRef); + Vector128 src1 = Sse2.LoadScalarVector128((ulong*)(srcRef + WebpConstants.Bps)); + Vector128 src2 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 2))); + Vector128 src3 = Sse2.LoadScalarVector128((ulong*)(srcRef + (WebpConstants.Bps * 3))); + + // Load ref. + Vector128 ref0 = Sse2.LoadScalarVector128((ulong*)referenceRef); + Vector128 ref1 = Sse2.LoadScalarVector128((ulong*)(referenceRef + WebpConstants.Bps)); + Vector128 ref2 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (WebpConstants.Bps * 2))); + Vector128 ref3 = Sse2.LoadScalarVector128((ulong*)(referenceRef + (+WebpConstants.Bps * 3))); + + // Convert both to 16 bit. + Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero); + Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero); + Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero); + Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero); + Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero); + Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero); + Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero); + Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero); + + // Compute difference. -> 00 01 02 03 00' 01' 02' 03' + Vector128 diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16()); + Vector128 diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16()); + Vector128 diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16()); + Vector128 diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16()); + + // Unpack and shuffle. + // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + Vector128 shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32()); + Vector128 shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32()); + Vector128 shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32()); + + // First pass. + FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128 v01l, out Vector128 v32l); + FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128 v01h, out Vector128 v32h); + + // Second pass. + FTransformPass2SSE2(v01l, v32l, output); + FTransformPass2SSE2(v01h, v32h, output2); + } + } + else +#endif + { + FTransform(src, reference, output, scratch); + FTransform(src.Slice(4), reference.Slice(4), output2, scratch); + } } public static void FTransform(Span src, Span reference, Span output, Span scratch) @@ -567,7 +625,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a0 = v0 + v3 // a1 = v1 + v2 - Vector128 a01 = Sse2.Add(v01, v32); + Vector128 a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16()); Vector128 a01Plus7 = Sse2.Add(a01.AsInt16(), Seven); Vector128 a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16(); Vector128 c0 = Sse2.Add(a01Plus7, a11);