Port TransformTwo

1 year ago · 3be2b6a7fc
2 changed files with 82 additions and 50 deletions
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -677,6 +677,38 @@ internal static class Vector128_
        return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
    }

+    /// <summary>
+    /// Unpacks and interleaves the 8-bit integers from the low halves of <paramref name="left"/>
+    /// and <paramref name="right"/>, yielding <c>left[0], right[0], ..., left[7], right[7]</c>.
+    /// </summary>
+    /// <param name="left">The vector whose low 8 bytes fill the even lanes of the result.</param>
+    /// <param name="right">The vector whose low 8 bytes fill the odd lanes of the result.</param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the low
+    /// halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    public static Vector128<byte> UnpackLow(Vector128<byte> left, Vector128<byte> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.UnpackLow(left, right);
+        }
+
+        // ZipLow lives in the nested AdvSimd.Arm64 class (64-bit only instructions), so the guard
+        // checks that class's own IsSupported flag rather than the parent AdvSimd one.
+        if (AdvSimd.Arm64.IsSupported)
+        {
+            return AdvSimd.Arm64.ZipLow(left, right);
+        }
+
+        // Portable fallback: place both low halves side by side, then interleave them with a
+        // single shuffle. Index i selects left[i] and index 8 + i selects right[i]; shuffling
+        // a second time would wrongly treat the interleaved data bytes as indices.
+        Vector128<byte> unpacked = Vector128.Create(left.GetLower(), right.GetLower());
+        Vector128<byte> interleave = Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+        return Vector128.Shuffle(unpacked, interleave);
+    }
+
    [DoesNotReturn]
    private static void ThrowUnreachableException() => throw new UnreachableException();
 }
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -1035,7 +1035,7 @@ internal static class LossyUtils
    // Does two transforms.
    public static void TransformTwo(Span<short> src, Span<byte> dst, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            // This implementation makes use of 16-bit fixed point versions of two
            // multiply constants:
@@ -1083,64 +1083,64 @@ internal static class LossyUtils

            // Vertical pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-            Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+            Vector128<short> a = in0.AsInt16() + in2.AsInt16();
+            Vector128<short> b = in0.AsInt16() - in2.AsInt16();

            Vector128<short> k1 = Vector128.Create((short)20091);
            Vector128<short> k2 = Vector128.Create((short)-30068);

            // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-            Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
-            Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
-            Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-            Vector128<short> c4 = Sse2.Subtract(c1, c2);
-            Vector128<short> c = Sse2.Add(c3.AsInt16(), c4);
+            Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
+            Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
+            Vector128<short> c3 = in1.AsInt16() - in3.AsInt16();
+            Vector128<short> c4 = c1 - c2;
+            Vector128<short> c = c3.AsInt16() + c4;

            // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-            Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
-            Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
-            Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-            Vector128<short> d4 = Sse2.Add(d1, d2);
-            Vector128<short> d = Sse2.Add(d3, d4);
+            Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
+            Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
+            Vector128<short> d3 = in1.AsInt16() + in3.AsInt16();
+            Vector128<short> d4 = d1 + d2;
+            Vector128<short> d = d3 + d4;

            // Second pass.
-            Vector128<short> tmp0 = Sse2.Add(a.AsInt16(), d);
-            Vector128<short> tmp1 = Sse2.Add(b.AsInt16(), c);
-            Vector128<short> tmp2 = Sse2.Subtract(b.AsInt16(), c);
-            Vector128<short> tmp3 = Sse2.Subtract(a.AsInt16(), d);
+            Vector128<short> tmp0 = a.AsInt16() + d;
+            Vector128<short> tmp1 = b.AsInt16() + c;
+            Vector128<short> tmp2 = b.AsInt16() - c;
+            Vector128<short> tmp3 = a.AsInt16() - d;

            // Transpose the two 4x4.
            Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

            // Horizontal pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
-            a = Sse2.Add(dc, t2.AsInt16());
-            b = Sse2.Subtract(dc, t2.AsInt16());
+            Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4);
+            a = dc + t2.AsInt16();
+            b = dc - t2.AsInt16();

            // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-            c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
-            c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
-            c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-            c4 = Sse2.Subtract(c1, c2);
-            c = Sse2.Add(c3, c4);
+            c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
+            c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
+            c3 = t1.AsInt16() - t3.AsInt16();
+            c4 = c1 - c2;
+            c = c3 + c4;

            // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-            d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
-            d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
-            d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-            d4 = Sse2.Add(d1, d2);
-            d = Sse2.Add(d3, d4);
+            d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
+            d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
+            d3 = t1.AsInt16() + t3.AsInt16();
+            d4 = d1 + d2;
+            d = d3 + d4;

            // Second pass.
-            tmp0 = Sse2.Add(a, d);
-            tmp1 = Sse2.Add(b, c);
-            tmp2 = Sse2.Subtract(b, c);
-            tmp3 = Sse2.Subtract(a, d);
-            Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-            Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-            Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-            Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+            tmp0 = a + d;
+            tmp1 = b + c;
+            tmp2 = b - c;
+            tmp3 = a - d;
+            Vector128<short> shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
+            Vector128<short> shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
+            Vector128<short> shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
+            Vector128<short> shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);

            // Transpose the two 4x4.
            Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
@@ -1155,22 +1155,22 @@ internal static class LossyUtils
            Vector128<byte> dst3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3)), 0).AsByte();

            // Convert to 16b.
-            dst0 = Sse2.UnpackLow(dst0, Vector128<byte>.Zero);
-            dst1 = Sse2.UnpackLow(dst1, Vector128<byte>.Zero);
-            dst2 = Sse2.UnpackLow(dst2, Vector128<byte>.Zero);
-            dst3 = Sse2.UnpackLow(dst3, Vector128<byte>.Zero);
+            dst0 = Vector128_.UnpackLow(dst0, Vector128<byte>.Zero);
+            dst1 = Vector128_.UnpackLow(dst1, Vector128<byte>.Zero);
+            dst2 = Vector128_.UnpackLow(dst2, Vector128<byte>.Zero);
+            dst3 = Vector128_.UnpackLow(dst3, Vector128<byte>.Zero);

            // Add the inverse transform(s).
-            dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte();
-            dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte();
-            dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte();
-            dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte();
+            dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte();
+            dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte();
+            dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte();
+            dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte();

            // Unsigned saturate to 8b.
-            dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16());
-            dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16());
-            dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16());
-            dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16());
+            dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16());
+            dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16());
+            dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16());
+            dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16());

            // Store the results.
            // Store eight bytes/pixels per line.