From 3be2b6a7fc21da49bb0ca824d0cc36b945c1b479 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 May 2025 00:08:24 +1000 Subject: [PATCH] Port TransformTwo --- .../Common/Helpers/Vector128Utilities.cs | 32 ++++++ .../Formats/Webp/Lossy/LossyUtils.cs | 100 +++++++++--------- 2 files changed, 82 insertions(+), 50 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 2c37a493ea..3076788d1b 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -677,6 +677,38 @@ internal static class Vector128_ return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); } + /// <summary> + /// Unpack and interleave 8-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/> + /// and store the results in the result. + /// </summary> + /// <param name="left"> + /// The first vector containing packed 8-bit integers to unpack from the low half. + /// </param> + /// <param name="right"> + /// The second vector containing packed 8-bit integers to unpack from the low half. + /// </param> + /// <returns> + /// A vector containing the unpacked and interleaved 8-bit integers from the low + /// halves of <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns> + public static Vector128<byte> UnpackLow(Vector128<byte> left, Vector128<byte> right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.Arm64.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128<byte> unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle( + unpacked, + Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 7bc995030e..7d186cd651 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -1035,7 +1035,7 @@ internal static class LossyUtils // Does two transforms. public static void TransformTwo(Span src, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: @@ -1083,64 +1083,64 @@ internal static class LossyUtils // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications.
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091); Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3.AsInt16() + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a.AsInt16(), d); - Vector128 tmp1 = Sse2.Add(b.AsInt16(), c); - Vector128 tmp2 = Sse2.Subtract(b.AsInt16(), c); - Vector128 tmp3 = Sse2.Subtract(a.AsInt16(), d); + Vector128 tmp0 = a.AsInt16() + d; + Vector128 tmp1 = b.AsInt16() + c; + Vector128 tmp2 = b.AsInt16() - c; + Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. 
// First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + a = dc + t2.AsInt16(); + b = dc - t2.AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); + c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + c3 = t1.AsInt16() - t3.AsInt16(); + c4 = c1 - c2; + c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); + d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + d3 = t1.AsInt16() + t3.AsInt16(); + d4 = d1 + d2; + d = d3 + d4; // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; + Vector128 shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. 
Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); @@ -1155,22 +1155,22 @@ internal static class LossyUtils Vector128 dst3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3)), 0).AsByte(); // Convert to 16b. - dst0 = Sse2.UnpackLow(dst0, Vector128.Zero); - dst1 = Sse2.UnpackLow(dst1, Vector128.Zero); - dst2 = Sse2.UnpackLow(dst2, Vector128.Zero); - dst3 = Sse2.UnpackLow(dst3, Vector128.Zero); + dst0 = Vector128_.UnpackLow(dst0, Vector128.Zero); + dst1 = Vector128_.UnpackLow(dst1, Vector128.Zero); + dst2 = Vector128_.UnpackLow(dst2, Vector128.Zero); + dst3 = Vector128_.UnpackLow(dst3, Vector128.Zero); // Add the inverse transform(s). - dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); - dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); - dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); - dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); + dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); + dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); + dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); + dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); // Unsigned saturate to 8b. - dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); - dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); - dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); - dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); + dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); + dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); + dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); + dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); // Store the results. // Store eight bytes/pixels per line.