From 6e8def1cc87808965cd0fc5a6f141161cc02de27 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 00:11:44 +0100 Subject: [PATCH 01/12] Add sse2 version of inverse transform --- .../Formats/Webp/Lossy/LossyUtils.cs | 60 ++++-- .../Formats/Webp/Lossy/Vp8Encoding.cs | 204 +++++++++++++++++- 2 files changed, 239 insertions(+), 25 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 3064ccc03..cfac273c4 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -661,28 +661,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - Vector128 transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); - - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); - - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - Vector128 output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - Vector128 output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - Vector128 output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -728,6 +707,43 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Unsafe.As>(ref outputRef) = result.AsInt32(); return sum[3] + sum[2] + sum[1] + sum[0]; } + + // Transpose two 4x4 16b matrices horizontally stored in registers. + public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + { + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + Vector128 transpose00 = Sse2.UnpackLow(b0, b1); + Vector128 transpose01 = Sse2.UnpackLow(b2, b3); + Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); + Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); + output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } #endif public static void TransformTwo(Span src, Span dst, Span scratch) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 0567a0f27..cb149bec7 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -4,6 +4,11 @@ using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; +#if SUPPORTS_RUNTIME_INTRINSICS +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; +#endif namespace SixLabors.ImageSharp.Formats.Webp.Lossy { @@ -60,6 +65,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static readonly int[] Vp8I4ModeOffsets = { I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4 }; +#if SUPPORTS_RUNTIME_INTRINSICS + public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); + + public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); +#endif + static Vp8Encoding() { for (int i = -255; i <= 255 + 255; i++) @@ -68,12 +79,199 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } + // Transforms (Paragraph 14.4) + // Does one or two inverse transforms. 
public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) { - ITransformOne(reference, input, dst, scratch); - if (doTwo) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85267 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant become the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); + + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (doTwo) + { + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. 
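(Aside, not part of the patch: a minimal scalar sketch of the fixed-point "trick" described in the comment block above. The helper name MulFixed and the sample values are illustrative only; Sse2.MultiplyHigh returns the high 16 bits of each signed 16-bit product, i.e. (x * k) >> 16 per lane, and adding x back recovers (x * K) >> 16.)

    // Scalar equivalent of one "trick" multiplication, assuming K = k + (1 << 16)
    // with k fitting in a signed 16-bit integer.
    static int MulFixed(int x, short k) => ((x * k) >> 16) + x;

    // Worked example with x = 100 and K1 = 85627 (so k1 = 20091):
    //   (100 * 85627) >> 16         = 130
    //   ((100 * 20091) >> 16) + 100 = 30 + 100 = 130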
+ LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + var four = Vector128.Create((short)4); + Vector128 dc = Sse2.Add(t0.AsInt16(), four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1, t3); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3.AsInt16(), c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1, t3); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3.AsInt16(), d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + if (doTwo) + { + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); + } + else + { + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + } + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). + Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + + // Unsigned saturate to 8b. 
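(Aside, not part of the patch: per pixel, the add-and-saturate step below performs the same clamp as the scalar fallback. A minimal sketch with an illustrative helper name; the SSE2 path does this for eight lanes at once via Sse2.PackUnsignedSaturate.)

    // Scalar form of "reference pixel + inverse-transformed residual, saturated to a byte".
    static byte Clip255(int v) => (byte)(v < 0 ? 0 : (v > 255 ? 255 : v));

    // dst[x] = Clip255(reference[x] + ((t + 4) >> 3));
    // where t is the raw horizontal-pass output; the "+ 4" added to the DC term and the
    // arithmetic shift right by 3 above provide the rounding.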
+ ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + + // Unsigned saturate to 8b. + if (doTwo) + { + // Store eight bytes/pixels per line. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As>(ref outputRef) = ref0; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + } + else + { + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + } + else +#endif + { + ITransformOne(reference, input, dst, scratch); + if (doTwo) + { + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); + } } } From 835ecead49cd0e98b223d4e4cb9b32d11190b8b2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 13:58:02 +0100 Subject: [PATCH 02/12] Store only eight bytes per line --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index c0f81b49f..55fa2593c 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -15,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. /// - internal static class Vp8Encoding + internal static unsafe class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -69,6 +69,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy public static readonly Vector128 K1 = Vector128.Create((short)20091).AsInt16(); public static readonly Vector128 K2 = Vector128.Create((short)-30068).AsInt16(); + + public static readonly Vector128 Four = Vector128.Create((short)4); #endif static Vp8Encoding() @@ -85,6 +87,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) + //if (false) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -165,8 +168,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - var four = Vector128.Create((short)4); - Vector128 dc = Sse2.Add(t0.AsInt16(), four); + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); a = Sse2.Add(dc, t2.AsInt16()); b = Sse2.Subtract(dc, t2.AsInt16()); @@ -243,11 +245,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy if (doTwo) { // Store eight bytes/pixels per line. 
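(Aside, not part of the patch: each PackUnsignedSaturate call above packs the same eight lanes into both halves of the result, so one row of pixels is duplicated within the vector. The StoreScalar calls introduced below write only the lower 64 bits, which is exactly the eight saturated pixels of that row; the duplicated upper half is discarded.)

    // Layout of one row after Sse2.PackUnsignedSaturate(r, r):
    //   p0 p1 p2 p3 p4 p5 p6 p7 | p0 p1 p2 p3 p4 p5 p6 p7
    // Sse2.StoreScalar((long*)dstPtr, row.AsInt64()) stores the first eight bytes only.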
- ref byte outputRef = ref MemoryMarshal.GetReference(dst); - Unsafe.As>(ref outputRef) = ref0; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2; - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3; + // TODO: avoid pinning, if possible. + fixed (byte* dstPtr = dst) + { + Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64()); + } } else { From 5968de8f779c21d46facd7b088ec8ae05ecb4a7b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 13:58:37 +0100 Subject: [PATCH 03/12] Add sse tests for inverse transform --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 1 - .../Formats/WebP/Vp8EncodingTests.cs | 57 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 55fa2593c..8f8cf7643 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -87,7 +87,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) - //if (false) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs new file mode 100644 index 000000000..cd5a24d8c --- /dev/null +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -0,0 +1,57 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System.Linq; +using SixLabors.ImageSharp.Formats.Webp.Lossy; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Formats.WebP +{ + [Trait("Format", "Webp")] + public class Vp8EncodingTests + { + private static void RunInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransform(reference, input, dst, true, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + [Fact] + public void InverseTransform_Works() => RunInverseTransformTest(); + +#if SUPPORTS_RUNTIME_INTRINSICS + [Fact] + public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); +#endif + } +} From 5c0b598ece1dd8ca63664c93c01591310a98a16c Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 15:03:37 +0100 Subject: [PATCH 04/12] Fix copy paste mistake --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 8f8cf7643..dab466b9a 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -248,9 +248,9 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy fixed (byte* dstPtr = dst) { Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref0.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref1.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref2.AsInt64()); + Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref3.AsInt64()); } } else From 6039d2a8719a263d9a71a4cffb8b1325d6384947 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 
Nov 2021 16:57:40 +0100 Subject: [PATCH 05/12] Better test case --- .../Formats/WebP/Vp8EncodingTests.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index cd5a24d8c..053496389 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -24,15 +24,15 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129 }; - short[] input = { 177, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 177, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; byte[] dst = new byte[128]; byte[] expected = { - 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 150, 150, 150, 150, 146, 149, 152, 154, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + 161, 160, 149, 105, 78, 127, 156, 170, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 160, 160, 133, 85, 81, 129, 155, 167, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 156, 147, 109, 76, 85, 130, 153, 163, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 152, 128, 87, 83, 88, 132, 152, 159, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; int[] scratch = new int[16]; From abcbc4c48d6bce5543a45003742f98ccd0b7ef9d Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:01:56 +0100 Subject: [PATCH 06/12] Fix issue: vectors need to be short type --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index dab466b9a..6ec191baa 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -3,6 +3,7 @@ using System; using System.Buffers.Binary; +using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -145,14 +146,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); - Vector128 c3 = Sse2.Subtract(in1, in3); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); Vector128 c4 = Sse2.Subtract(c1, c2); Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); - Vector128 d3 = Sse2.Add(in1, in3); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); Vector128 d4 = Sse2.Add(d1, d2); Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); @@ -174,14 +175,14 @@ namespace 
SixLabors.ImageSharp.Formats.Webp.Lossy // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1, t3); + c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); c4 = Sse2.Subtract(c1, c2); c = Sse2.Add(c3.AsInt16(), c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1, t3); + d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); d4 = Sse2.Add(d1, d2); d = Sse2.Add(d3.AsInt16(), d4); @@ -229,10 +230,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); // Add the inverse transform(s). - Vector128 ref0InvAdded = Sse2.Add(ref0.AsUInt16(), t0.AsUInt16()); - Vector128 ref1InvAdded = Sse2.Add(ref1.AsUInt16(), t1.AsUInt16()); - Vector128 ref2InvAdded = Sse2.Add(ref2.AsUInt16(), t2.AsUInt16()); - Vector128 ref3InvAdded = Sse2.Add(ref3.AsUInt16(), t3.AsUInt16()); + Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); // Unsigned saturate to 8b. ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); From 18ecb065a313601b5d81329f99677bc1357ce8d2 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:13:23 +0100 Subject: [PATCH 07/12] Add tests for executing only one transform --- .../Formats/WebP/Vp8EncodingTests.cs | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index 053496389..c4f8601b1 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -11,7 +11,39 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP [Trait("Format", "Webp")] public class Vp8EncodingTests { - private static void RunInverseTransformTest() + private static void RunOneInverseTransformTest() + { + // arrange + byte[] reference = + { + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, + 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 129, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 129, 129, 129, 129, 129, 129 + }; + short[] input = { 1, 216, -48, 0, 96, -24, -48, 24, 0, -24, 24, 0, 0, 0, 0, 0, 38, -240, -72, -24, 0, -24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + byte[] dst = new byte[128]; + byte[] expected = + { + 161, 160, 149, 105, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 160, 160, 133, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 156, 147, 109, 76, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 152, 128, 87, 83, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0 
+ }; + int[] scratch = new int[16]; + + // act + Vp8Encoding.ITransform(reference, input, dst, false, scratch); + + // assert + Assert.True(dst.SequenceEqual(expected)); + } + + private static void RunTwoInverseTransformTest() { // arrange byte[] reference = @@ -44,14 +76,23 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP } [Fact] - public void InverseTransform_Works() => RunInverseTransformTest(); + public void OneInverseTransform_Works() => RunOneInverseTransformTest(); + + [Fact] + public void TwoInverseTransform_Works() => RunTwoInverseTransformTest(); #if SUPPORTS_RUNTIME_INTRINSICS [Fact] - public void InverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.AllowAll); + public void OneInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.AllowAll); + + [Fact] + public void OneInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunOneInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + + [Fact] + public void TwoInverseTransform_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.AllowAll); [Fact] - public void InverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); + public void TwoInverseTransform_WithoutHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunTwoInverseTransformTest, HwIntrinsics.DisableHWIntrinsic); #endif } } From 6e548b5e5bace5fa4c58529d616ff14438fc89bf Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 17:25:00 +0100 Subject: [PATCH 08/12] Remove unnecessary casts --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 6ec191baa..70500566f 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -144,18 +144,18 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2.AsInt16()); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1.AsInt16()); + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4.AsInt16()); + Vector128 c = Sse2.Add(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1.AsInt16()); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2.AsInt16()); + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3.AsInt16(), d4.AsInt16()); + Vector128 d = Sse2.Add(d3, d4); // Second pass. 
Vector128 tmp0 = Sse2.Add(a, d); @@ -177,14 +177,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3.AsInt16(), c4); + c = Sse2.Add(c3, c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3.AsInt16(), d4); + d = Sse2.Add(d3, d4); // Second pass. tmp0 = Sse2.Add(a, d); @@ -236,10 +236,10 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); // Unsigned saturate to 8b. - ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded.AsInt16(), ref0InvAdded.AsInt16()); - ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded.AsInt16(), ref1InvAdded.AsInt16()); - ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded.AsInt16(), ref2InvAdded.AsInt16()); - ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded.AsInt16(), ref3InvAdded.AsInt16()); + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. if (doTwo) From a201e8a1427976addf71974adfffc742cf8ab888 Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Thu, 11 Nov 2021 18:09:38 +0100 Subject: [PATCH 09/12] Avoid pinning --- src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 70500566f..34a3a5f17 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -242,17 +242,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); // Unsigned saturate to 8b. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); if (doTwo) { // Store eight bytes/pixels per line. - // TODO: avoid pinning, if possible. 
- fixed (byte* dstPtr = dst) - { - Sse2.StoreScalar((long*)dstPtr, ref0.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + WebpConstants.Bps), ref1.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 2)), ref2.AsInt64()); - Sse2.StoreScalar((long*)(dstPtr + (WebpConstants.Bps * 3)), ref3.AsInt64()); - } + Unsafe.As>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); } else { @@ -262,7 +259,6 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); - ref byte outputRef = ref MemoryMarshal.GetReference(dst); Unsafe.As(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; From b7059ae23a72f62ae760f453b21d64fbd63057b0 Mon Sep 17 00:00:00 2001 From: Brian Popow <38701097+brianpopow@users.noreply.github.com> Date: Fri, 12 Nov 2021 12:58:58 +0100 Subject: [PATCH 10/12] Add [MethodImpl(InliningOptions.ShortMethod)] Co-authored-by: Anton Firszov --- src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index c80fd5817..b8986f66f 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -750,6 +750,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Transpose two 4x4 16b matrices horizontally stored in registers. + [MethodImpl(InliningOptions.ShortMethod)] public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) { // Transpose the two 4x4. 
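(Aside, not part of the patch series: the pattern PATCH 09 switches to, shown in isolation. It writes the low eight bytes of a vector through a managed reference, so no fixed statement or pinning is needed. A minimal sketch assuming the usual System, System.Runtime.CompilerServices, System.Runtime.InteropServices and System.Runtime.Intrinsics usings; the method name is illustrative.)

    // Store the lower 64 bits (eight packed pixels) of a Vector128 into a span without pinning.
    static void StoreLow64(Span<byte> dst, Vector128<byte> v)
    {
        ref byte d = ref MemoryMarshal.GetReference(dst);
        Unsafe.As<byte, Vector64<byte>>(ref d) = v.GetLower();
    }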
From 544319e9ea8689e6f257c03e7990136bbfaad53e Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 13:18:41 +0100 Subject: [PATCH 11/12] ITransform now always does two transforms --- src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs | 6 +- .../Formats/Webp/Lossy/Vp8Encoding.cs | 277 ++++++++++++------ .../Formats/WebP/Vp8EncodingTests.cs | 4 +- 3 files changed, 192 insertions(+), 95 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs index 38ed80590..2fcea8cee 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/QuantEnc.cs @@ -329,7 +329,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy LossyUtils.TransformWht(dcTmp, tmp, scratch); for (n = 0; n < 16; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8Scan[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8Scan[n]), scratch); } return nz; @@ -342,7 +342,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Span scratch = it.Scratch3.AsSpan(0, 16); Vp8Encoding.FTransform(src, reference, tmp, scratch); int nz = QuantizeBlock(tmp, levels, ref dqm.Y1); - Vp8Encoding.ITransform(reference, tmp, yuvOut, false, scratch); + Vp8Encoding.ITransformOne(reference, tmp, yuvOut, scratch); return nz; } @@ -375,7 +375,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy for (n = 0; n < 8; n += 2) { - Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), true, scratch); + Vp8Encoding.ITransform(reference.Slice(WebpLookupTables.Vp8ScanUv[n]), tmp.Slice(n * 16, 32), yuvOut.Slice(WebpLookupTables.Vp8ScanUv[n]), scratch); } return nz << 16; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index 34a3a5f17..bcecdcd75 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -3,7 +3,6 @@ using System; using System.Buffers.Binary; -using System.Linq; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; #if SUPPORTS_RUNTIME_INTRINSICS @@ -16,7 +15,7 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy /// /// Methods for encoding a VP8 frame. /// - internal static unsafe class Vp8Encoding + internal static class Vp8Encoding { private const int KC1 = 20091 + (1 << 16); @@ -83,8 +82,8 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } // Transforms (Paragraph 14.4) - // Does one or two inverse transforms. - public static void ITransform(Span reference, Span input, Span dst, bool doTwo, Span scratch) + // Does two inverse transforms. 
+ public static void ITransform(Span reference, Span input, Span dst, Span scratch) { #if SUPPORTS_RUNTIME_INTRINSICS if (Sse2.IsSupported) @@ -120,23 +119,20 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x - if (doTwo) - { - var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); - var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); - var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); - var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); - - in0 = Sse2.UnpackLow(in0, inb0); - in1 = Sse2.UnpackLow(in1, inb1); - in2 = Sse2.UnpackLow(in2, inb2); - in3 = Sse2.UnpackLow(in3, inb3); - - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 - } + var inb0 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 16)), 0); + var inb1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 20)), 0); + var inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0); + var inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0); + + in0 = Sse2.UnpackLow(in0, inb0); + in1 = Sse2.UnpackLow(in1, inb1); + in2 = Sse2.UnpackLow(in2, inb2); + in3 = Sse2.UnpackLow(in3, inb3); + + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. @@ -206,22 +202,12 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy Vector128 ref2 = Vector128.Zero; Vector128 ref3 = Vector128.Zero; ref byte referenceRef = ref MemoryMarshal.GetReference(reference); - if (doTwo) - { - // Load eight bytes/pixels per line. - ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); - ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); - ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); - ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); - } - else - { - // Load four bytes/pixels per line. - ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); - ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); - ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); - ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); - } + + // Load eight bytes/pixels per line. + ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte(); + ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte(); + ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte(); + ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); @@ -243,72 +229,183 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Unsigned saturate to 8b. ref byte outputRef = ref MemoryMarshal.GetReference(dst); - if (doTwo) - { - // Store eight bytes/pixels per line. 
- Unsafe.As>(ref outputRef) = ref0.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); - Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); - } - else - { - // Store four bytes/pixels per line. - int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); - int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); - int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); - int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); - - Unsafe.As(ref outputRef) = output0; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; - Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; - } + + // Store eight bytes/pixels per line. + Unsafe.As>(ref outputRef) = ref0.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = ref1.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = ref2.GetLower(); + Unsafe.As>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = ref3.GetLower(); } else #endif { ITransformOne(reference, input, dst, scratch); - if (doTwo) - { - ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); - } + ITransformOne(reference.Slice(4), input.Slice(16), dst.Slice(4), scratch); } } public static void ITransformOne(Span reference, Span input, Span dst, Span scratch) { - int i; - Span tmp = scratch.Slice(0, 16); - for (i = 0; i < 4; i++) +#if SUPPORTS_RUNTIME_INTRINSICS + if (Sse2.IsSupported) { - // vertical pass. - int a = input[0] + input[8]; - int b = input[0] - input[8]; - int c = Mul(input[4], KC2) - Mul(input[12], KC1); - int d = Mul(input[4], KC1) + Mul(input[12], KC2); - tmp[0] = a + d; - tmp[1] = b + c; - tmp[2] = b - c; - tmp[3] = a - d; - tmp = tmp.Slice(4); - input = input.Slice(1); - } + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. + ref short inputRef = ref MemoryMarshal.GetReference(input); + var in0 = Vector128.Create(Unsafe.As(ref inputRef), 0); + var in1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 4)), 0); + var in2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 8)), 0); + var in3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 12)), 0); - tmp = scratch; - for (i = 0; i < 4; i++) + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + + // Vertical pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. 
+ Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + + // Horizontal pass and subsequent transpose. + // First pass, c and d calculations are longer because of the "trick" multiplications. + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + a = Sse2.Add(dc, t2.AsInt16()); + b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + c4 = Sse2.Subtract(c1, c2); + c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + d4 = Sse2.Add(d1, d2); + d = Sse2.Add(d3, d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + + // Transpose the two 4x4. + LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + + // Add inverse transform to 'ref' and store. + // Load the reference(s). + Vector128 ref0 = Vector128.Zero; + Vector128 ref1 = Vector128.Zero; + Vector128 ref2 = Vector128.Zero; + Vector128 ref3 = Vector128.Zero; + ref byte referenceRef = ref MemoryMarshal.GetReference(reference); + + // Load four bytes/pixels per line. + ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte(); + ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte(); + ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte(); + ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte(); + + // Convert to 16b. + ref0 = Sse2.UnpackLow(ref0, Vector128.Zero); + ref1 = Sse2.UnpackLow(ref1, Vector128.Zero); + ref2 = Sse2.UnpackLow(ref2, Vector128.Zero); + ref3 = Sse2.UnpackLow(ref3, Vector128.Zero); + + // Add the inverse transform(s). 
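(Aside, not part of the patch: the ConvertScalarToVector128Int32 loads above pull one row of four reference pixels in as a single 32-bit value, and UnpackLow against a zero vector widens those bytes to 16-bit lanes so they can be added to the 16-bit transform output. Illustrative lane values below.)

    // One row of bytes 0x01 0x02 0x03 0x04 after the 32-bit scalar load (remaining lanes zero):
    //   01 02 03 04 00 00 00 00 00 00 00 00 00 00 00 00
    // After Sse2.UnpackLow(row, Vector128<byte>.Zero), read as Int16 lanes:
    //   0x0001 0x0002 0x0003 0x0004 0x0000 0x0000 0x0000 0x0000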
+ Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16()); + Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16()); + Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16()); + Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16()); + + // Unsigned saturate to 8b. + ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded); + ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded); + ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded); + ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded); + + // Unsigned saturate to 8b. + ref byte outputRef = ref MemoryMarshal.GetReference(dst); + + // Store four bytes/pixels per line. + int output0 = Sse2.ConvertToInt32(ref0.AsInt32()); + int output1 = Sse2.ConvertToInt32(ref1.AsInt32()); + int output2 = Sse2.ConvertToInt32(ref2.AsInt32()); + int output3 = Sse2.ConvertToInt32(ref3.AsInt32()); + + Unsafe.As(ref outputRef) = output0; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; + Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3; + } + else +#endif { - // horizontal pass. - int dc = tmp[0] + 4; - int a = dc + tmp[8]; - int b = dc - tmp[8]; - int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); - int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); - Store(dst, reference, 0, i, a + d); - Store(dst, reference, 1, i, b + c); - Store(dst, reference, 2, i, b - c); - Store(dst, reference, 3, i, a - d); - tmp = tmp.Slice(1); + int i; + Span tmp = scratch.Slice(0, 16); + for (i = 0; i < 4; i++) + { + // vertical pass. + int a = input[0] + input[8]; + int b = input[0] - input[8]; + int c = Mul(input[4], KC2) - Mul(input[12], KC1); + int d = Mul(input[4], KC1) + Mul(input[12], KC2); + tmp[0] = a + d; + tmp[1] = b + c; + tmp[2] = b - c; + tmp[3] = a - d; + tmp = tmp.Slice(4); + input = input.Slice(1); + } + + tmp = scratch; + for (i = 0; i < 4; i++) + { + // horizontal pass. 
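(Aside, not part of the patch: the scalar fallback path uses the existing Mul helper of this class, which the diff does not show. Given the full-width KC1/KC2 constants declared at the top of the file, it is presumably the plain fixed-point product sketched below; treat that shape as an assumption.)

    // Presumed behaviour of the scalar fixed-point multiply used by the fallback path:
    //   Mul(a, b) ~ (a * b) >> 16   // full 32-bit math, so the 16-bit "trick" is not needed here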
+ int dc = tmp[0] + 4; + int a = dc + tmp[8]; + int b = dc - tmp[8]; + int c = Mul(tmp[4], KC2) - Mul(tmp[12], KC1); + int d = Mul(tmp[4], KC1) + Mul(tmp[12], KC2); + Store(dst, reference, 0, i, a + d); + Store(dst, reference, 1, i, b + c); + Store(dst, reference, 2, i, b - c); + Store(dst, reference, 3, i, a - d); + tmp = tmp.Slice(1); + } } } diff --git a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs index c4f8601b1..17c9beb9b 100644 --- a/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs +++ b/tests/ImageSharp.Tests/Formats/WebP/Vp8EncodingTests.cs @@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP int[] scratch = new int[16]; // act - Vp8Encoding.ITransform(reference, input, dst, false, scratch); + Vp8Encoding.ITransformOne(reference, input, dst, scratch); // assert Assert.True(dst.SequenceEqual(expected)); @@ -69,7 +69,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.WebP int[] scratch = new int[16]; // act - Vp8Encoding.ITransform(reference, input, dst, true, scratch); + Vp8Encoding.ITransform(reference, input, dst, scratch); // assert Assert.True(dst.SequenceEqual(expected)); From 5074ee6204f7c33875ee40988f1dc9bb20211a3b Mon Sep 17 00:00:00 2001 From: Brian Popow Date: Fri, 12 Nov 2021 13:33:30 +0100 Subject: [PATCH 12/12] Refactor: extract horizontal and vertical pass into methods --- .../Formats/Webp/Lossy/Vp8Encoding.cs | 161 +++++++----------- 1 file changed, 63 insertions(+), 98 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index bcecdcd75..aa4ab5767 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -136,61 +136,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - - // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); - - // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); - - // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. 
- Vector128 dc = Sse2.Add(t0.AsInt16(), Four); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); - - // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); - - // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); - - // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -266,61 +219,14 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); - - // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3, c4); - - // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); - - // Second pass. - Vector128 tmp0 = Sse2.Add(a, d); - Vector128 tmp1 = Sse2.Add(b, c); - Vector128 tmp2 = Sse2.Subtract(b, c); - Vector128 tmp3 = Sse2.Subtract(a, d); + InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Four); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); - - // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); - - // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); - - // Second pass. 
- tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -409,6 +315,65 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy } } +#if SUPPORTS_RUNTIME_INTRINSICS + private static void InverseTransformVerticalPass(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3) + { + Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); + Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + tmp0 = Sse2.Add(a, d); + tmp1 = Sse2.Add(b, c); + tmp2 = Sse2.Subtract(b, c); + tmp3 = Sse2.Subtract(a, d); + } + + private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3) + { + Vector128 dc = Sse2.Add(t0.AsInt16(), Four); + Vector128 a = Sse2.Add(dc, t2.AsInt16()); + Vector128 b = Sse2.Subtract(dc, t2.AsInt16()); + + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), K2); + Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), K1); + Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); + Vector128 c4 = Sse2.Subtract(c1, c2); + Vector128 c = Sse2.Add(c3, c4); + + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), K1); + Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), K2); + Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); + Vector128 d4 = Sse2.Add(d1, d2); + Vector128 d = Sse2.Add(d3, d4); + + // Second pass. + Vector128 tmp0 = Sse2.Add(a, d); + Vector128 tmp1 = Sse2.Add(b, c); + Vector128 tmp2 = Sse2.Subtract(b, c); + Vector128 tmp3 = Sse2.Subtract(a, d); + shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); + shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); + shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); + shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + } +#endif + public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch) { FTransform(src, reference, output, scratch);