Add SSE41 version of UpSample

5 years ago · 59a11bf901
1 changed files with 332 additions and 0 deletions
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@ -4,6 +4,11 @@
 using System;
 using System.Buffers;
 using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+#if SUPPORTS_RUNTIME_INTRINSICS
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+#endif
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -18,8 +23,66 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy

        private const int YuvHalf = 1 << (YuvFix - 1);

+#if SUPPORTS_RUNTIME_INTRINSICS
+        private static readonly Vector128<byte> One = Vector128.Create((byte)1);
+
+        // These constants are 14b fixed-point version of ITU-R BT.601 constants.
+        // R = (19077 * y             + 26149 * v - 14234) >> 6
+        // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
+        // B = (19077 * y + 33050 * u             - 17685) >> 6
+        private static readonly Vector128<byte> K19077 = Vector128.Create((short)19077).AsByte();
+
+        private static readonly Vector128<byte> K26149 = Vector128.Create((short)26149).AsByte();
+
+        private static readonly Vector128<byte> K14234 = Vector128.Create((short)14234).AsByte();
+
+        // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
+        private static readonly Vector128<byte> K33050 = Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129);
+
+        private static readonly Vector128<byte> K17685 = Vector128.Create((short)17685).AsByte();
+
+        private static readonly Vector128<byte> K6419 = Vector128.Create((short)6419).AsByte();
+
+        private static readonly Vector128<byte> K13320 = Vector128.Create((short)13320).AsByte();
+
+        private static readonly Vector128<byte> K8708 = Vector128.Create((short)8708).AsByte();
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle0 = Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle1 = Vector128.Create(255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle2 = Vector128.Create(255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 14, 255, 255, 15, 255, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle3 = Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle4 = Vector128.Create(5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255, 10);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle5 = Vector128.Create(255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle6 = Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle7 = Vector128.Create(255, 5, 255, 255, 6, 255, 255, 7, 255, 255, 8, 255, 255, 9, 255, 255);
+
+        private static readonly Vector128<byte> PlanarTo24Shuffle8 = Vector128.Create(10, 255, 255, 11, 255, 255, 12, 255, 255, 13, 255, 255, 24, 255, 255, 15);
+#endif
+
+        // UpSample from YUV to RGB.
        public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len)
        {
+#if SUPPORTS_RUNTIME_INTRINSICS
+            if (Sse41.IsSupported)
+            {
+                UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len);
+            }
+            else
+#endif
+            {
+                UpSampleScalar(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len);
+            }
+        }
+
+        public static void UpSampleScalar(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len)
+        {
            int xStep = 3;
            int lastPixelPair = (len - 1) >> 1;
            uint tluv = LoadUv(topU[0], topV[0]); // top-left sample
@ -72,6 +135,106 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            }
        }

+#if SUPPORTS_RUNTIME_INTRINSICS
+        // We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+        // u = (9*a + 3*b + 3*c + d + 8) / 16
+        //   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+        //   = (a + m + 1) / 2
+        // where m = (a + 3*b + 3*c + d) / 8
+        //         = ((a + b + c + d) / 2 + b + c) / 4
+        //
+        // Let's say  k = (a + b + c + d) / 4.
+        // We can compute k as
+        // k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+        // where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+        //
+        // Then m can be written as
+        // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+        public static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len)
+        {
+            const int xStep = 3;
+            byte[] uvBuffer = new byte[(14 * 32) + 15];
+            Span<byte> ru = uvBuffer.AsSpan(15);
+            Span<byte> rv = ru.Slice(32);
+
+            // Treat the first pixel in regular way.
+            int uDiag = ((topU[0] + curU[0]) >> 1) + 1;
+            int vDiag = ((topV[0] + curV[0]) >> 1) + 1;
+            int u0t = (topU[0] + uDiag) >> 1;
+            int v0t = (topV[0] + vDiag) >> 1;
+            YuvToBgr(topY[0], u0t, v0t, topDst);
+            if (bottomY != null)
+            {
+                int u0b = (curU[0] + uDiag) >> 1;
+                int v0b = (curV[0] + vDiag) >> 1;
+                YuvToBgr(bottomY[0], u0b, v0b, bottomDst);
+            }
+
+            // For UpSample32Pixels, 17 u/v values must be read-able for each block.
+            for (int pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
+            {
+                UpSample32Pixels(topU.Slice(uvPos), curU.Slice(uvPos), ru);
+                UpSample32Pixels(topV.Slice(uvPos), curV.Slice(uvPos), rv);
+                ConvertToBgrSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+            }
+        }
+
+        // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
+        public static void UpSample32Pixels(Span<byte> r1, Span<byte> r2, Span<byte> output)
+        {
+            // Load inputs.
+            Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(r1));
+            Vector128<byte> b = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r1), 1));
+            Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(r2));
+            Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref MemoryMarshal.GetReference(r2), 1));
+
+            Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
+            Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
+            Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
+
+            Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
+            Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
+
+            Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
+            Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
+            Vector128<byte> t3 = Sse2.And(t2, One); // (a^d) | (b^c) | (s^t) & 1
+            Vector128<byte> t4 = Sse2.Average(s, t);
+            Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+
+            Vector128<byte> diag1 = GetM(k, st, bc, t);
+            Vector128<byte> diag2 = GetM(k, st, ad, s);
+
+            // Pack the alternate pixels.
+            PackAndStore(a, b, diag1, diag2, output); // store top.
+            PackAndStore(c, d, diag2, diag1, output.Slice(2 * 32));
+        }
+
+        // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
+        private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
+        {
+            Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
+            Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
+            Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
+            Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
+            Vector128<byte> tmp4 = Sse2.And(tmp3, One); // & 1 -> lsb_correction
+
+            return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
+        }
+
+        private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
+        {
+            Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
+            Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
+            Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
+            Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
+
+            ref byte output0Ref = ref MemoryMarshal.GetReference(output);
+            ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
+            Unsafe.As<byte, Vector128<byte>>(ref output0Ref) = t1;
+            Unsafe.As<byte, Vector128<byte>>(ref output1Ref) = t2;
+        }
+#endif
+
        /// <summary>
        /// Converts the RGB values of the image to YUV.
        /// </summary>
@ -366,6 +529,175 @@ namespace SixLabors.ImageSharp.Formats.Webp.Lossy
            bgr[0] = (byte)YuvToB(y, u);
        }

+#if SUPPORTS_RUNTIME_INTRINSICS
+
+        private static void ConvertToBgrSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        {
+            YuvToBgrSse41(topY.Slice(curX), ru, rv, topDst.Slice(curX * step));
+
+            if (bottomY != null)
+            {
+                YuvToBgrSse41(bottomY.Slice(curX), ru.Slice(64), rv.Slice(64), bottomDst.Slice(curX * step));
+            }
+        }
+
+        public static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
+        {
+            ConvertYuv444ToRgbSse41(y, u, v, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
+            ConvertYuv444ToRgbSse41(y.Slice(8), u.Slice(8), v.Slice(8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
+            ConvertYuv444ToRgbSse41(y.Slice(16), u.Slice(16), v.Slice(16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
+            ConvertYuv444ToRgbSse41(y.Slice(24), u.Slice(24), v.Slice(24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
+
+            // Cast to 8b and store as BBBBGGGGRRRR.
+            Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
+            Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
+            Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
+            Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
+            Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
+            Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
+
+            // Pack as BGRBGRBGRBGR.
+            PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
+        }
+
+        // Pack the planar buffers
+        // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+        // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+        private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
+        {
+            // The input is 6 registers of sixteen 8b but for the sake of explanation,
+            // let's take 6 registers of four 8b values.
+            // To pack, we will keep taking one every two 8b integer and move it
+            // around as follows:
+            // Input:
+            //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+            // Split the 6 registers in two sets of 3 registers: the first set as the even
+            // 8b bytes, the second the odd ones:
+            //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+            // Repeat the same permutations twice more:
+            //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+            //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+
+            // Process R.
+            ChannelMixing(
+                input0,
+                input1,
+                PlanarTo24Shuffle0,
+                PlanarTo24Shuffle1,
+                PlanarTo24Shuffle2,
+                out Vector128<byte> r0,
+                out Vector128<byte> r1,
+                out Vector128<byte> r2,
+                out Vector128<byte> r3,
+                out Vector128<byte> r4,
+                out Vector128<byte> r5);
+
+            // Process G.
+            // Same as before, just shifted to the left by one and including the right padding.
+            ChannelMixing(
+                input2,
+                input3,
+                PlanarTo24Shuffle3,
+                PlanarTo24Shuffle4,
+                PlanarTo24Shuffle5,
+                out Vector128<byte> g0,
+                out Vector128<byte> g1,
+                out Vector128<byte> g2,
+                out Vector128<byte> g3,
+                out Vector128<byte> g4,
+                out Vector128<byte> g5);
+
+            // Process B.
+            ChannelMixing(
+                input4,
+                input5,
+                PlanarTo24Shuffle6,
+                PlanarTo24Shuffle7,
+                PlanarTo24Shuffle8,
+                out Vector128<byte> b0,
+                out Vector128<byte> b1,
+                out Vector128<byte> b2,
+                out Vector128<byte> b3,
+                out Vector128<byte> b4,
+                out Vector128<byte> b5);
+
+            // OR the different channels.
+            Vector128<byte> rg0 = Sse2.Or(r0, g0);
+            Vector128<byte> rg1 = Sse2.Or(r1, g1);
+            Vector128<byte> rg2 = Sse2.Or(r2, g2);
+            Vector128<byte> rg3 = Sse2.Or(r3, g3);
+            Vector128<byte> rg4 = Sse2.Or(r4, g4);
+            Vector128<byte> rg5 = Sse2.Or(r5, g5);
+
+            ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
+            Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
+            Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+        }
+
+        // Shuffles the input buffer as A0 0 0 A1 0 0 A2
+        private static void ChannelMixing(
+            Vector128<byte> input0,
+            Vector128<byte> input1,
+            Vector128<byte> shuffle0,
+            Vector128<byte> shuffle1,
+            Vector128<byte> shuffle2,
+            out Vector128<byte> output0,
+            out Vector128<byte> output1,
+            out Vector128<byte> output2,
+            out Vector128<byte> output3,
+            out Vector128<byte> output4,
+            out Vector128<byte> output5)
+        {
+            output0 = Ssse3.Shuffle(input0, shuffle0);
+            output1 = Ssse3.Shuffle(input0, shuffle1);
+            output2 = Ssse3.Shuffle(input0, shuffle2);
+            output3 = Ssse3.Shuffle(input1, shuffle0);
+            output4 = Ssse3.Shuffle(input1, shuffle1);
+            output5 = Ssse3.Shuffle(input1, shuffle2);
+        }
+
+        // Convert 32 samples of YUV444 to R/G/B
+        private static void ConvertYuv444ToRgbSse41(Span<byte> y, Span<byte> u, Span<byte> v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
+        {
+            Vector128<byte> y0 = LoadHigh(y);
+            Vector128<byte> u0 = LoadHigh(u);
+            Vector128<byte> v0 = LoadHigh(v);
+
+            Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), K19077.AsUInt16());
+
+            Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), K26149.AsUInt16());
+            Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), K14234.AsUInt16());
+            Vector128<ushort> r2 = Sse2.Add(r1, r0);
+
+            Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), K6419.AsUInt16());
+            Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), K13320.AsUInt16());
+            Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), K8708.AsUInt16());
+            Vector128<ushort> g3 = Sse2.Add(g0, g1);
+            Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+
+            Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), K33050.AsUInt16());
+            Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
+            Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, K17685.AsUInt16());
+
+            // use logical shift for B2, which can be larger than 32767
+            r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+            g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+            b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
+        }
+
+        // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
+        private static Vector128<byte> LoadHigh(Span<byte> src)
+        {
+            Vector64<byte> tmp = Unsafe.As<byte, Vector64<byte>>(ref MemoryMarshal.GetReference(src));
+            Vector128<byte> tmp2 = Unsafe.As<Vector64<byte>, Vector128<byte>>(ref tmp);
+            return Sse2.UnpackLow(Vector128<byte>.Zero, tmp2);
+        }
+#endif
+
        [MethodImpl(InliningOptions.ShortMethod)]
        public static int YuvToB(int y, int u) => Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685);