From 7223e90bb441b4d14871a37f7ed2237218bc7b30 Mon Sep 17 00:00:00 2001
From: James Jackson-South
Date: Mon, 2 Jun 2025 14:47:36 +1000
Subject: [PATCH] Port Vp8_Sse16x16

Route the WebP lossy Vp8_Sse16x16/Vp8_Sse16x8 helpers and the neighbouring
Vector256/Vector128 code paths through the portable Vector128_/Vector256_
utilities instead of calling Sse2/Avx2 intrinsics directly, and add the
utility overloads they need (UnpackHigh, UnpackLow, SubtractSaturate,
MultiplyAddAdjacent).
---
Review notes (text between "---" and the diffstat is dropped by git-am):

 * Line structure, generic type arguments and XML doc tags were restored from
   a flattened copy that had lost every "<...>" span. Hunk sizes were checked
   against the hunk headers, but verify context lines against the tree.
 * Vector128_.UnpackHigh: the software fallback shuffled twice, using the
   already interleaved bytes as shuffle indices; it now shuffles once. The
   pre-existing UnpackLow fallback (context line 744) has the same shape and
   needs the same fix in a follow-up. The Arm path is guarded by
   AdvSimd.Arm64.IsSupported because it calls AdvSimd.Arm64.ZipHigh.
 * Vector128_.SubtractSaturate(byte): clamping the wrapped ushort difference
   to [0, 255] turns an underflowed lane into 255 instead of 0, so the
   left >= right mask is kept.
 * LossyUtils: the path guarded by Vector128.IsHardwareAccelerated still
   called Sse2.SubtractSaturate/MultiplyAddAdjacent/Add, which throw
   PlatformNotSupportedException off x86. Avx2.Or in
   SubtractAndAccumulateVector256 became "|". Vp8_16xN_Vector128 was renamed
   to Vp8_Sse16xN_Vector128 to match Vp8_Sse16xN_Vector256.
 * SubtractSaturate(short) <returns> text said "unsigned".
 * The commit id and the "index" blob ids are those of the original mail and
   were not regenerated.

 .../Common/Helpers/Vector128Utilities.cs      |  96 +++++++++-
 .../Common/Helpers/Vector256Utilities.cs      | 163 ++++++++++++++++++
 .../Formats/Webp/Lossy/LossyUtils.cs          | 136 +++++++++------
 3 files changed, 329 insertions(+), 66 deletions(-)

diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index c160b9560..c5e16faf9 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -711,6 +711,39 @@ internal static class Vector128_
         return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
     }
 
+    /// <summary>
+    /// Unpack and interleave 8-bit integers from the high half of <paramref name="left"/> and
+    /// <paramref name="right"/> and store the results in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the high half.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the high half.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the high
+    /// halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<byte> UnpackHigh(Vector128<byte> left, Vector128<byte> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.UnpackHigh(left, right);
+        }
+
+        if (AdvSimd.Arm64.IsSupported)
+        {
+            return AdvSimd.Arm64.ZipHigh(left, right);
+        }
+
+        Vector128<byte> unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+        return Vector128.Shuffle(
+            unpacked,
+            Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
+    }
+
     /// <summary>
     /// Unpack and interleave 8-bit integers from the low half of <paramref name="left"/> and
     /// <paramref name="right"/> and store the results in the result.
@@ -744,6 +777,56 @@ internal static class Vector128_
             Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)));
     }
 
+    /// <summary>
+    /// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of subtracting packed signed 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<short> SubtractSaturate(Vector128<short> left, Vector128<short> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.SubtractSaturate(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.SubtractSaturate(left, right);
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.SubtractSaturate(left, right);
+        }
+
+        // Widen inputs to 32-bit signed
+        (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
+        (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
+
+        // Subtract
+        Vector128<int> diffLo = leftLo - rightLo;
+        Vector128<int> diffHi = leftHi - rightHi;
+
+        // Clamp to signed 16-bit range
+        Vector128<int> shortMin = Vector128.Create((int)short.MinValue);
+        Vector128<int> shortMax = Vector128.Create((int)short.MaxValue);
+
+        diffLo = Clamp(diffLo, shortMin, shortMax);
+        diffHi = Clamp(diffHi, shortMin, shortMax);
+
+        // Narrow back to 16 bit signed.
+        return Vector128.Narrow(diffLo, diffHi);
+    }
+
     /// <summary>
     /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
     /// in <paramref name="left"/> using saturation, and store the results.
@@ -775,7 +858,7 @@ internal static class Vector128_
             return PackedSimd.SubtractSaturate(left, right);
         }
 
-        // Widen inputs to 16-bit to safely compute unsigned differences without underflow
+        // Widen inputs to 16-bit
         (Vector128<ushort> leftLo, Vector128<ushort> leftHi) = Vector128.Widen(left);
         (Vector128<ushort> rightLo, Vector128<ushort> rightHi) = Vector128.Widen(right);
 
@@ -783,13 +866,10 @@ internal static class Vector128_
         Vector128<ushort> diffLo = leftLo - rightLo;
         Vector128<ushort> diffHi = leftHi - rightHi;
 
-        // Mask lanes where left >= right to preserve the result
-        // All other lanes are zeroed (saturate to 0)
-        Vector128<ushort> maskLo = Vector128.GreaterThanOrEqual(leftLo, rightLo).AsUInt16();
-        Vector128<ushort> maskHi = Vector128.GreaterThanOrEqual(leftHi, rightHi).AsUInt16();
-
-        diffLo &= maskLo;
-        diffHi &= maskHi;
+        // Unsigned lanes wrap on underflow, so a clamp cannot saturate them.
+        // Keep lanes where left >= right and zero the rest (saturate to 0).
+        diffLo &= Vector128.GreaterThanOrEqual(leftLo, rightLo);
+        diffHi &= Vector128.GreaterThanOrEqual(leftHi, rightHi);
 
         // Narrow back to bytes
         return Vector128.Narrow(diffLo, diffHi);
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index dfefd2d34..71dfadc39 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -162,6 +162,33 @@ internal static class Vector256_
         return (vm0 * vm1) - vs;
     }
 
+    /// <summary>
+    /// Multiply packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and
+    /// pack the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to multiply and add.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to multiply and add.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> MultiplyAddAdjacent(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.MultiplyAddAdjacent(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()),
+            Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper()));
+    }
+
     /// <summary>
     /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
     /// </summary>
@@ -303,6 +330,142 @@ internal static class Vector256_
         return Vector256.Narrow(prodLo, prodHi);
     }
 
+    /// <summary>
+    /// Unpack and interleave 32-bit integers from the low half of <paramref name="left"/> and
+    /// <paramref name="right"/> and store the results in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 32-bit integers to unpack from the low half.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 32-bit integers to unpack from the low half.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 32-bit integers from the low
+    /// halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> UnpackLow(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackLow(left, right);
+        }
+
+        Vector128<int> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+        Vector128<int> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Unpack and interleave 8-bit integers from the high half of <paramref name="left"/> and
+    /// <paramref name="right"/> and store the results in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the high half.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the high half.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the high
+    /// halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackHigh(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackHigh(left, right);
+        }
+
+        Vector128<byte> lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower());
+        Vector128<byte> hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Unpack and interleave 8-bit integers from the low half of <paramref name="left"/> and
+    /// <paramref name="right"/> and store the results in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the low half.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the low half.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the low
+    /// halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackLow(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackLow(left, right);
+        }
+
+        Vector128<byte> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+        Vector128<byte> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of subtracting packed signed 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> SubtractSaturate(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.SubtractSaturate(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+    }
+
+    /// <summary>
+    /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 8-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 8-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of subtracting packed unsigned 8-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> SubtractSaturate(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.SubtractSaturate(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 7d186cd65..4e61242c0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -20,12 +20,12 @@ internal static class LossyUtils
     {
         if (Avx2.IsSupported)
         {
-            return Vp8_Sse16xN_Avx2(a, b, 4);
+            return Vp8_Sse16xN_Vector256(a, b, 4);
         }
 
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
-            return Vp8_Sse16xN_Sse2(a, b, 8);
+            return Vp8_Sse16xN_Vector128(a, b, 8);
         }
 
         if (AdvSimd.IsSupported)
@@ -40,14 +40,14 @@ internal static class LossyUtils
     [MethodImpl(InliningOptions.ShortMethod)]
     public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
-            return Vp8_Sse16xN_Avx2(a, b, 2);
+            return Vp8_Sse16xN_Vector256(a, b, 2);
         }
 
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
-            return Vp8_Sse16xN_Sse2(a, b, 4);
+            return Vp8_Sse16xN_Vector128(a, b, 4);
         }
 
         if (AdvSimd.IsSupported)
@@ -81,21 +81,21 @@ internal static class LossyUtils
                 Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)));
 
             // Combine pair of lines.
-            Vector256<int> a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32());
-            Vector256<int> b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32());
+            Vector256<int> a01 = Vector256_.UnpackLow(a0.AsInt32(), a1.AsInt32());
+            Vector256<int> b01 = Vector256_.UnpackLow(b0.AsInt32(), b1.AsInt32());
 
             // Convert to 16b.
-            Vector256<byte> a01s = Avx2.UnpackLow(a01.AsByte(), Vector256<byte>.Zero);
-            Vector256<byte> b01s = Avx2.UnpackLow(b01.AsByte(), Vector256<byte>.Zero);
+            Vector256<byte> a01s = Vector256_.UnpackLow(a01.AsByte(), Vector256<byte>.Zero);
+            Vector256<byte> b01s = Vector256_.UnpackLow(b01.AsByte(), Vector256<byte>.Zero);
 
             // subtract, square and accumulate.
-            Vector256<short> d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
-            Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0, d0);
+            Vector256<short> d0 = Vector256_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+            Vector256<int> e0 = Vector256_.MultiplyAddAdjacent(d0, d0);
 
-            return Numerics.ReduceSum(e0);
+            return ReduceSumVector256(e0);
         }
 
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
             // Load values.
             ref byte aRef = ref MemoryMarshal.GetReference(a);
@@ -110,25 +110,25 @@ internal static class LossyUtils
             Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));
 
             // Combine pair of lines.
-            Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
-            Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
-            Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
-            Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());
+            Vector128<int> a01 = Vector128_.UnpackLow(a0.AsInt32(), a1.AsInt32());
+            Vector128<int> a23 = Vector128_.UnpackLow(a2.AsInt32(), a3.AsInt32());
+            Vector128<int> b01 = Vector128_.UnpackLow(b0.AsInt32(), b1.AsInt32());
+            Vector128<int> b23 = Vector128_.UnpackLow(b2.AsInt32(), b3.AsInt32());
 
             // Convert to 16b.
-            Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> a01s = Vector128_.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> a23s = Vector128_.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b01s = Vector128_.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b23s = Vector128_.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
 
             // subtract, square and accumulate.
-            Vector128<short> d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
-            Vector128<short> d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
-            Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
-            Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
-            Vector128<int> sum = Sse2.Add(e0, e1);
+            Vector128<short> d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+            Vector128<short> d1 = Vector128_.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
+            Vector128<int> e0 = Vector128_.MultiplyAddAdjacent(d0, d0);
+            Vector128<int> e1 = Vector128_.MultiplyAddAdjacent(d1, d1);
+            Vector128<int> sum = e0 + e1;
 
-            return ReduceSum(sum);
+            return ReduceSumVector128(sum);
         }
 
         if (AdvSimd.IsSupported)
@@ -159,7 +159,7 @@ internal static class LossyUtils
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static int Vp8_Sse16xN_Sse2(Span<byte> a, Span<byte> b, int numPairs)
+    private static int Vp8_Sse16xN_Vector128(Span<byte> a, Span<byte> b, int numPairs)
     {
         Vector128<int> sum = Vector128<int>.Zero;
         nuint offset = 0;
@@ -173,18 +173,18 @@ internal static class LossyUtils
             Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps));
             Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps));
 
-            Vector128<int> sum1 = SubtractAndAccumulate(a0, b0);
-            Vector128<int> sum2 = SubtractAndAccumulate(a1, b1);
+            Vector128<int> sum1 = SubtractAndAccumulateVector128(a0, b0);
+            Vector128<int> sum2 = SubtractAndAccumulateVector128(a1, b1);
             sum += sum1 + sum2;
 
             offset += 2 * WebpConstants.Bps;
         }
 
-        return ReduceSum(sum);
+        return ReduceSumVector128(sum);
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
+    private static int Vp8_Sse16xN_Vector256(Span<byte> a, Span<byte> b, int numPairs)
     {
         Vector256<int> sum = Vector256<int>.Zero;
         nuint offset = 0;
@@ -206,14 +206,14 @@ internal static class LossyUtils
                 Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))),
                 Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps))));
 
-            Vector256<int> sum1 = SubtractAndAccumulate(a0, b0);
-            Vector256<int> sum2 = SubtractAndAccumulate(a1, b1);
-            sum = Avx2.Add(sum, Avx2.Add(sum1, sum2));
+            Vector256<int> sum1 = SubtractAndAccumulateVector256(a0, b0);
+            Vector256<int> sum2 = SubtractAndAccumulateVector256(a1, b1);
+            sum += sum1 + sum2;
 
             offset += 4 * WebpConstants.Bps;
         }
 
-        return Numerics.ReduceSum(sum);
+        return ReduceSumVector256(sum);
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
@@ -306,41 +306,41 @@ internal static class LossyUtils
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
+    private static Vector128<int> SubtractAndAccumulateVector128(Vector128<byte> a, Vector128<byte> b)
     {
         // Take abs(a-b) in 8b.
-        Vector128<byte> ab = Sse2.SubtractSaturate(a, b);
-        Vector128<byte> ba = Sse2.SubtractSaturate(b, a);
-        Vector128<byte> absAb = Sse2.Or(ab, ba);
+        Vector128<byte> ab = Vector128_.SubtractSaturate(a, b);
+        Vector128<byte> ba = Vector128_.SubtractSaturate(b, a);
+        Vector128<byte> absAb = ab | ba;
 
         // Zero-extend to 16b.
-        Vector128<byte> c0 = Sse2.UnpackLow(absAb, Vector128<byte>.Zero);
-        Vector128<byte> c1 = Sse2.UnpackHigh(absAb, Vector128<byte>.Zero);
+        Vector128<byte> c0 = Vector128_.UnpackLow(absAb, Vector128<byte>.Zero);
+        Vector128<byte> c1 = Vector128_.UnpackHigh(absAb, Vector128<byte>.Zero);
 
         // Multiply with self.
-        Vector128<int> sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
-        Vector128<int> sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
+        Vector128<int> sum1 = Vector128_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
+        Vector128<int> sum2 = Vector128_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
 
-        return Sse2.Add(sum1, sum2);
+        return sum1 + sum2;
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static Vector256<int> SubtractAndAccumulate(Vector256<byte> a, Vector256<byte> b)
+    private static Vector256<int> SubtractAndAccumulateVector256(Vector256<byte> a, Vector256<byte> b)
     {
         // Take abs(a-b) in 8b.
-        Vector256<byte> ab = Avx2.SubtractSaturate(a, b);
-        Vector256<byte> ba = Avx2.SubtractSaturate(b, a);
-        Vector256<byte> absAb = Avx2.Or(ab, ba);
+        Vector256<byte> ab = Vector256_.SubtractSaturate(a, b);
+        Vector256<byte> ba = Vector256_.SubtractSaturate(b, a);
+        Vector256<byte> absAb = ab | ba;
 
         // Zero-extend to 16b.
-        Vector256<byte> c0 = Avx2.UnpackLow(absAb, Vector256<byte>.Zero);
-        Vector256<byte> c1 = Avx2.UnpackHigh(absAb, Vector256<byte>.Zero);
+        Vector256<byte> c0 = Vector256_.UnpackLow(absAb, Vector256<byte>.Zero);
+        Vector256<byte> c1 = Vector256_.UnpackHigh(absAb, Vector256<byte>.Zero);
 
         // Multiply with self.
-        Vector256<int> sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
-        Vector256<int> sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
+        Vector256<int> sum1 = Vector256_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
+        Vector256<int> sum2 = Vector256_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
 
-        return Avx2.Add(sum1, sum2);
+        return sum1 + sum2;
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
@@ -990,7 +990,7 @@ internal static class LossyUtils
         // difference of weighted sums.
         Vector128<int> result = ab0ab2Sum - b0w0bb2w8Sum;
 
-        return ReduceSum(result);
+        return ReduceSumVector128(result);
     }
 
     // Transpose two 4x4 16b matrices horizontally stored in registers.
@@ -1916,7 +1916,27 @@ internal static class LossyUtils
     /// <param name="accumulator">The accumulator to reduce.</param>
     /// <returns>The sum of all elements.</returns>
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static int ReduceSum(Vector128<int> accumulator)
+    public static int ReduceSumVector256(Vector256<int> accumulator)
+    {
+        // Add upper lane to lower lane.
+        Vector128<int> vsum = accumulator.GetLower() + accumulator.GetUpper();
+
+        // Add odd to even.
+        vsum += Vector128_.ShuffleNative(vsum, 0b_11_11_01_01);
+
+        // Add high to low.
+        vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10);
+
+        return vsum.ToScalar();
+    }
+
+    /// <summary>
+    /// Reduces elements of the vector into one sum.
+    /// </summary>
+    /// <param name="accumulator">The accumulator to reduce.</param>
+    /// <returns>The sum of all elements.</returns>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static int ReduceSumVector128(Vector128<int> accumulator)
     {
         // Add odd to even.
         Vector128<int> vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01);