From c490bc6f66f9fda79456aef28094314ff0070b84 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Fri, 30 May 2025 23:37:07 +1000 Subject: [PATCH] Port TTransformSse41 --- src/ImageSharp/Common/Helpers/Numerics.cs | 17 -- .../Common/Helpers/Vector128Utilities.cs | 227 ++++++++++++++++++ .../Formats/Webp/Lossy/LossyUtils.cs | 145 ++++++----- 3 files changed, 308 insertions(+), 81 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs index ca14ae4c38..5f91dcd998 100644 --- a/src/ImageSharp/Common/Helpers/Numerics.cs +++ b/src/ImageSharp/Common/Helpers/Numerics.cs @@ -884,23 +884,6 @@ internal static class Numerics accumulator += intHigh; } - /// - /// Reduces elements of the vector into one sum. - /// - /// The accumulator to reduce. - /// The sum of all elements. - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static int ReduceSum(Vector128 accumulator) - { - // Add odd to even. - Vector128 vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01)); - - // Add high to low. - vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10)); - - return Sse2.ConvertToInt32(vsum); - } - /// /// Reduces elements of the vector into one sum. /// diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index f89900d7e5..2c37a493ea 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -88,6 +88,30 @@ internal static class Vector128_ return Vector128.Shuffle(vector, indices); } + /// + /// Creates a new vector by selecting values from an input vector using the control. + /// + /// The input vector from which values are selected. + /// The shuffle control byte. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.Shuffle(vector, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. + Vector128 indices = Vector128.Create( + control & 0x3, + (control >> 2) & 0x3, + (control >> 4) & 0x3, + (control >> 6) & 0x3); + + return Vector128.Shuffle(vector, indices); + } + /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -412,6 +436,31 @@ internal static class Vector128_ return Vector128.Narrow(prodLo, prodHi); } + public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyAddAdjacent(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); + (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLower * rightLower; + Vector128 prodHi = leftUpper * rightUpper; + + // Extract the low and high parts of the products shuffling them to form a result we can add together. + // Use out-of-bounds to zero out the unused lanes. + Vector128 v0 = Vector128.Shuffle(prodLo, Vector128.Create(0, 2, 8, 8)); + Vector128 v1 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 0, 2)); + Vector128 v2 = Vector128.Shuffle(prodLo, Vector128.Create(1, 3, 8, 8)); + Vector128 v3 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 1, 3)); + + return v0 + v1 + v2 + v3; + } + /// /// Multiply the packed 16-bit integers in and , producing /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. @@ -450,6 +499,184 @@ internal static class Vector128_ return Vector128.Narrow(prodLo, prodHi); } + /// + /// Unpack and interleave 64-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 64-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 64-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 64-bit integers from the high + /// halves of and . + /// + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + return Vector128.Create(left.GetUpper(), right.GetUpper()); + } + + /// + /// Unpack and interleave 64-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 64-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 64-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 64-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + return Vector128.Create(left.GetLower(), right.GetLower()); + } + + /// + /// Unpack and interleave 32-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the high + /// halves of and . + /// + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3)); + } + + /// + /// Unpack and interleave 32-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 32-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 32-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3)); + } + + /// + /// Unpack and interleave 16-bit integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 16-bit integers to unpack from the high half. + /// + /// + /// The second vector containing packed 16-bit integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 16-bit integers from the high + /// halves of and . + /// + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); + } + + /// + /// Unpack and interleave 16-bit integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 16-bit integers to unpack from the low half. + /// + /// + /// The second vector containing packed 16-bit integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 16-bit integers from the low + /// halves of and . + /// + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7)); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index aae4181ce0..9c73a32072 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -7,6 +7,7 @@ using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.Arm; using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Formats.Webp.Lossy; @@ -127,7 +128,7 @@ internal static class LossyUtils Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1); Vector128 sum = Sse2.Add(e0, e1); - return Numerics.ReduceSum(sum); + return ReduceSum(sum); } if (AdvSimd.IsSupported) @@ -174,12 +175,12 @@ internal static class LossyUtils Vector128 sum1 = SubtractAndAccumulate(a0, b0); Vector128 sum2 = SubtractAndAccumulate(a1, b1); - sum = Sse2.Add(sum, Sse2.Add(sum1, sum2)); + sum += sum1 + sum2; offset += 2 * WebpConstants.Bps; } - return Numerics.ReduceSum(sum); + return ReduceSum(sum); } [MethodImpl(InliningOptions.ShortMethod)] @@ -378,17 +379,16 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8Disto4X4(Span a, Span b, Span w, Span scratch) { - if (Sse41.IsSupported) + if (Vector128.IsHardwareAccelerated) { - int diffSum = TTransformSse41(a, b, w); + int diffSum = TTransformVector128(a, b, w); return Math.Abs(diffSum) >> 5; } - else - { - int sum1 = TTransform(a, w, scratch); - int sum2 = TTransform(b, w, scratch); - return Math.Abs(sum2 - sum1) >> 5; - } + + int sum1 = TTransform(a, w, scratch); + int sum2 = TTransform(b, w, scratch); + + return Math.Abs(sum2 - sum1) >> 5; } public static void DC16(Span dst, Span yuv, int offset) @@ -905,7 +905,7 @@ internal static class LossyUtils /// Returns the weighted sum of the absolute value of transformed coefficients. /// w[] contains a row-major 4 by 4 symmetric matrix. /// - public static int TTransformSse41(Span inputA, Span inputB, Span w) + public static int TTransformVector128(Span inputA, Span inputB, Span w) { // Load and combine inputs. Vector128 ina0 = Unsafe.As>(ref MemoryMarshal.GetReference(inputA)); @@ -918,14 +918,14 @@ internal static class LossyUtils Vector128 inb3 = Unsafe.As>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64(); // Combine inA and inB (we'll do two transforms in parallel). - Vector128 inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32()); - Vector128 inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32()); - Vector128 inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32()); - Vector128 inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32()); - Vector128 tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte()); - Vector128 tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte()); - Vector128 tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte()); - Vector128 tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte()); + Vector128 inab0 = Vector128_.UnpackLow(ina0.AsInt32(), inb0.AsInt32()); + Vector128 inab1 = Vector128_.UnpackLow(ina1.AsInt32(), inb1.AsInt32()); + Vector128 inab2 = Vector128_.UnpackLow(ina2.AsInt32(), inb2.AsInt32()); + Vector128 inab3 = Vector128_.UnpackLow(ina3.AsInt32(), inb3.AsInt32()); + Vector128 tmp0 = Vector128.WidenLower(inab0.AsByte()).AsInt16(); + Vector128 tmp1 = Vector128.WidenLower(inab1.AsByte()).AsInt16(); + Vector128 tmp2 = Vector128.WidenLower(inab2.AsByte()).AsInt16(); + Vector128 tmp3 = Vector128.WidenLower(inab3.AsByte()).AsInt16(); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 @@ -934,14 +934,14 @@ internal static class LossyUtils // Vertical pass first to avoid a transpose (vertical and horizontal passes // are commutative because w/kWeightY is symmetric) and subsequent transpose. // Calculate a and b (two 4x4 at once). - Vector128 a0 = Sse2.Add(tmp0, tmp2); - Vector128 a1 = Sse2.Add(tmp1, tmp3); - Vector128 a2 = Sse2.Subtract(tmp1, tmp3); - Vector128 a3 = Sse2.Subtract(tmp0, tmp2); - Vector128 b0 = Sse2.Add(a0, a1); - Vector128 b1 = Sse2.Add(a3, a2); - Vector128 b2 = Sse2.Subtract(a3, a2); - Vector128 b3 = Sse2.Subtract(a0, a1); + Vector128 a0 = tmp0 + tmp2; + Vector128 a1 = tmp1 + tmp3; + Vector128 a2 = tmp1 - tmp3; + Vector128 a3 = tmp0 - tmp2; + Vector128 b0 = a0 + a1; + Vector128 b1 = a3 + a2; + Vector128 b2 = a3 - a2; + Vector128 b3 = a0 - a1; // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 @@ -959,38 +959,38 @@ internal static class LossyUtils Vector128 w8 = Unsafe.As>(ref MemoryMarshal.GetReference(w.Slice(8, 8))); // Calculate a and b (two 4x4 at once). - a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16()); - a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16()); - a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16()); - a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16()); - b0 = Sse2.Add(a0, a1); - b1 = Sse2.Add(a3, a2); - b2 = Sse2.Subtract(a3, a2); - b3 = Sse2.Subtract(a0, a1); + a0 = output0.AsInt16() + output2.AsInt16(); + a1 = output1.AsInt16() + output3.AsInt16(); + a2 = output1.AsInt16() - output3.AsInt16(); + a3 = output0.AsInt16() - output2.AsInt16(); + b0 = a0 + a1; + b1 = a3 + a2; + b2 = a3 - a2; + b3 = a0 - a1; // Separate the transforms of inA and inB. - Vector128 ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64()); - Vector128 ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64()); - Vector128 bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64()); - Vector128 bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64()); + Vector128 ab0 = Vector128_.UnpackLow(b0.AsInt64(), b1.AsInt64()); + Vector128 ab2 = Vector128_.UnpackLow(b2.AsInt64(), b3.AsInt64()); + Vector128 bb0 = Vector128_.UnpackHigh(b0.AsInt64(), b1.AsInt64()); + Vector128 bb2 = Vector128_.UnpackHigh(b2.AsInt64(), b3.AsInt64()); - Vector128 ab0Abs = Ssse3.Abs(ab0.AsInt16()); - Vector128 ab2Abs = Ssse3.Abs(ab2.AsInt16()); - Vector128 b0Abs = Ssse3.Abs(bb0.AsInt16()); - Vector128 bb2Abs = Ssse3.Abs(bb2.AsInt16()); + Vector128 ab0Abs = Vector128.Abs(ab0.AsInt16()); + Vector128 ab2Abs = Vector128.Abs(ab2.AsInt16()); + Vector128 b0Abs = Vector128.Abs(bb0.AsInt16()); + Vector128 bb2Abs = Vector128.Abs(bb2.AsInt16()); // weighted sums. - Vector128 ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16()); - Vector128 ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16()); - Vector128 b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16()); - Vector128 bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16()); - Vector128 ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8); - Vector128 b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8); + Vector128 ab0mulw0 = Vector128_.MultiplyAddAdjacent(ab0Abs, w0.AsInt16()); + Vector128 ab2mulw8 = Vector128_.MultiplyAddAdjacent(ab2Abs, w8.AsInt16()); + Vector128 b0mulw0 = Vector128_.MultiplyAddAdjacent(b0Abs, w0.AsInt16()); + Vector128 bb2mulw8 = Vector128_.MultiplyAddAdjacent(bb2Abs, w8.AsInt16()); + Vector128 ab0ab2Sum = ab0mulw0 + ab2mulw8; + Vector128 b0w0bb2w8Sum = b0mulw0 + bb2mulw8; // difference of weighted sums. - Vector128 result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32()); + Vector128 result = ab0ab2Sum - b0w0bb2w8Sum; - return Numerics.ReduceSum(result); + return ReduceSum(result); } // Transpose two 4x4 16b matrices horizontally stored in registers. @@ -1002,28 +1002,28 @@ internal static class LossyUtils // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 - Vector128 transpose00 = Sse2.UnpackLow(b0, b1); - Vector128 transpose01 = Sse2.UnpackLow(b2, b3); - Vector128 transpose02 = Sse2.UnpackHigh(b0, b1); - Vector128 transpose03 = Sse2.UnpackHigh(b2, b3); + Vector128 transpose00 = Vector128_.UnpackLow(b0, b1); + Vector128 transpose01 = Vector128_.UnpackLow(b2, b3); + Vector128 transpose02 = Vector128_.UnpackHigh(b0, b1); + Vector128 transpose03 = Vector128_.UnpackHigh(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 - Vector128 transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); - Vector128 transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); - Vector128 transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose10 = Vector128_.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose11 = Vector128_.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32()); + Vector128 transpose12 = Vector128_.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32()); + Vector128 transpose13 = Vector128_.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32()); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 - output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); - output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); - output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); - output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); + output0 = Vector128_.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64()); + output1 = Vector128_.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64()); + output2 = Vector128_.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64()); + output3 = Vector128_.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64()); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -1910,6 +1910,23 @@ internal static class LossyUtils // Cost of coding one event with probability 'proba'. public static int Vp8BitCost(int bit, byte proba) => bit == 0 ? WebpLookupTables.Vp8EntropyCost[proba] : WebpLookupTables.Vp8EntropyCost[255 - proba]; + /// + /// Reduces elements of the vector into one sum. + /// + /// The accumulator to reduce. + /// The sum of all elements. + [MethodImpl(InliningOptions.ShortMethod)] + private static int ReduceSum(Vector128 accumulator) + { + // Add odd to even. + Vector128 vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01); + + // Add high to low. + vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10); + + return vsum.ToScalar(); + } + [MethodImpl(InliningOptions.ShortMethod)] private static void Put16(int v, Span dst) {