From cfad39b8fbda99896ec16069e2c3bcd663961c2c Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 May 2025 01:04:06 +1000 Subject: [PATCH] Add XPlat V128 SubtractSaturate --- .../Common/Helpers/Vector128Utilities.cs | 143 +++++++++++++----- 1 file changed, 101 insertions(+), 42 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index a96f5fa73..c160b9560 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -402,40 +402,6 @@ internal static class Vector128_ public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max) => Vector128.Min(Vector128.Max(value, min), max); - /// - /// Multiply the packed 16-bit integers in and , producing - /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. - /// - /// - /// The first vector containing packed 16-bit integers to multiply. - /// - /// - /// The second vector containing packed 16-bit integers to multiply. - /// - /// - /// A vector containing the low 16 bits of the products of the packed 16-bit integers - /// from and . - /// - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public static Vector128 MultiplyLow(Vector128 left, Vector128 right) - { - if (Sse2.IsSupported) - { - return Sse2.MultiplyLow(left, right); - } - - // Widen each half of the short vectors into two int vectors - (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); - - // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; - - // Narrow the two int vectors back into one short vector - return Vector128.Narrow(prodLo, prodHi); - } - /// /// Multiply packed signed 16-bit integers in and , producing /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and @@ -450,6 +416,7 @@ internal static class Vector128_ /// /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -470,12 +437,12 @@ internal static class Vector128_ { // Widen each half of the short vectors into two int vectors - (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; // Extract the low and high parts of the products shuffling them to form a result we can add together. // Use out-of-bounds to zero out the unused lanes. @@ -488,6 +455,40 @@ internal static class Vector128_ } } + /// + /// Multiply the packed 16-bit integers in and , producing + /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result. + /// + /// + /// The first vector containing packed 16-bit integers to multiply. + /// + /// + /// The second vector containing packed 16-bit integers to multiply. + /// + /// + /// A vector containing the low 16 bits of the products of the packed 16-bit integers + /// from and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 MultiplyLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.MultiplyLow(left, right); + } + + // Widen each half of the short vectors into two int vectors + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Elementwise multiply: each int lane now holds the full 32-bit product + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; + + // Narrow the two int vectors back into one short vector + return Vector128.Narrow(prodLo, prodHi); + } + /// /// Multiply the packed 16-bit integers in and , producing /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result. @@ -511,12 +512,12 @@ internal static class Vector128_ } // Widen each half of the short vectors into two int vectors - (Vector128 leftLower, Vector128 leftUpper) = Vector128.Widen(left); - (Vector128 rightLower, Vector128 rightUpper) = Vector128.Widen(right); + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); // Elementwise multiply: each int lane now holds the full 32-bit product - Vector128 prodLo = leftLower * rightLower; - Vector128 prodHi = leftUpper * rightUpper; + Vector128 prodLo = leftLo * rightLo; + Vector128 prodHi = leftHi * rightHi; // Arithmetic shift right by 16 bits to extract the high word prodLo >>= 16; @@ -540,6 +541,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 64-bit integers from the high /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -569,6 +571,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 64-bit integers from the low /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -598,6 +601,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 32-bit integers from the high /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -628,6 +632,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 32-bit integers from the low /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -658,6 +663,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 16-bit integers from the high /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackHigh(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -688,6 +694,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 16-bit integers from the low /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -718,6 +725,7 @@ internal static class Vector128_ /// A vector containing the unpacked and interleaved 8-bit integers from the low /// halves of and . /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector128 UnpackLow(Vector128 left, Vector128 right) { if (Sse2.IsSupported) @@ -736,6 +744,57 @@ internal static class Vector128_ Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); } + /// + /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to subtract. + /// + /// + /// A vector containing the results of subtracting packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 16-bit to safely compute unsigned differences without underflow + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Mask lanes where left >= right to preserve the result + // All other lanes are zeroed (saturate to 0) + Vector128 maskLo = Vector128.GreaterThanOrEqual(leftLo, rightLo).AsUInt16(); + Vector128 maskHi = Vector128.GreaterThanOrEqual(leftHi, rightHi).AsUInt16(); + + diffLo &= maskLo; + diffHi &= maskHi; + + // Narrow back to bytes + return Vector128.Narrow(diffLo, diffHi); + } + [DoesNotReturn] private static void ThrowUnreachableException() => throw new UnreachableException(); }