From e6168448a38ed5b35e3853719b0b08cfcc73a860 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 2 Jun 2025 22:53:42 +1000 Subject: [PATCH] Remove all v128 util restrictions --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 24 +- .../Common/Helpers/Vector128Utilities.cs | 289 ++++++++++++++--- .../Formats/Webp/Lossy/LossyUtils.cs | 302 +++++++++--------- 3 files changed, 410 insertions(+), 205 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 96ddb7976c..0f399d2de0 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -114,7 +114,7 @@ internal static partial class SimdUtils { if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) || (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) || - (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)) + Vector128.IsHardwareAccelerated) { int remainder = 0; if (Vector512.IsHardwareAccelerated) @@ -158,7 +158,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length % (Vector128.Count * 3); @@ -190,7 +190,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if (Vector128.IsHardwareAccelerated) { int remainder = source.Length % (Vector128.Count * 3); @@ -223,7 +223,7 @@ internal static partial class SimdUtils ref Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte) + if 
(Vector128.IsHardwareAccelerated) { int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo @@ -405,7 +405,7 @@ internal static partial class SimdUtils } } } - else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte) + else if (Vector128.IsHardwareAccelerated) { Span temp = stackalloc byte[Vector128.Count]; Shuffle.MMShuffleSpan(ref temp, control); @@ -445,9 +445,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); @@ -507,10 +505,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsShiftByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16(); Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte(); @@ -553,10 +548,7 @@ internal static partial class SimdUtils Span destination, [ConstantExpected] byte control) { - if (Vector128.IsHardwareAccelerated && - Vector128_.SupportsShuffleNativeByte && - Vector128_.SupportsShiftByte && - Vector128_.SupportsAlignRight) + if (Vector128.IsHardwareAccelerated) { Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index c5e16faf99..a6359e6e91 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -1,7 +1,6 @@ // Copyright (c) Six Labors. // Licensed under the Six Labors Split License. 
-using System.Diagnostics; using System.Diagnostics.CodeAnalysis; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -46,24 +45,6 @@ internal static class Vector128_ } } - /// - /// Gets a value indicating whether right align operations are supported. - /// - public static bool SupportsAlignRight - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Ssse3.IsSupported || AdvSimd.IsSupported; - } - - /// - /// Gets a value indicating whether right or left byte shift operations are supported. - /// - public static bool SupportsShiftByte - { - [MethodImpl(MethodImplOptions.AggressiveInlining)] - get => Sse2.IsSupported || AdvSimd.IsSupported; - } - /// /// Creates a new vector by selecting values from an input vector using the control. /// @@ -157,8 +138,7 @@ internal static class Vector128_ return AdvSimd.ExtractVector128(value, Vector128.Zero, numBytes); } - ThrowUnreachableException(); - return default; + return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + Vector128.Create(numBytes)); } /// @@ -182,8 +162,7 @@ internal static class Vector128_ #pragma warning restore CA1857 // A constant is expected for the parameter } - ThrowUnreachableException(); - return default; + return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes)); } /// @@ -206,8 +185,9 @@ internal static class Vector128_ return AdvSimd.ExtractVector128(right, left, mask); } - ThrowUnreachableException(); - return default; +#pragma warning disable CA1857 // A constant is expected for the parameter + return ShiftLeftBytesInVector(left, (byte)(Vector128.Count - mask)) | ShiftRightBytesInVector(right, mask); +#pragma warning restore CA1857 // A constant is expected for the parameter } /// @@ -390,6 +370,37 @@ internal static class Vector128_ return Vector128.Narrow(lefClamped, rightClamped); } + /// + /// Packs signed 16-bit 
integers to signed 8-bit integers and saturates. + /// + /// The left hand source vector. + /// The right hand source vector. + /// The . + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 PackSignedSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.PackSignedSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ConvertNarrowingSaturateSigned(left, right); + } + + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + Vector128 lefClamped = Clamp(left, min, max); + Vector128 rightClamped = Clamp(right, min, max); + return Vector128.Narrow(lefClamped, rightClamped); + } + /// /// Restricts a vector between a minimum and a maximum value. /// @@ -739,9 +750,7 @@ internal static class Vector128_ } Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); - return Vector128.Shuffle( - unpacked, - Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); + return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); } /// @@ -772,9 +781,69 @@ internal static class Vector128_ } Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); - return Vector128.Shuffle( - unpacked, - Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15))); + return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit signed integers from the high half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit signed integers to unpack from the high half. 
+ /// + /// + /// The second vector containing packed 8-bit signed integers to unpack from the high half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit signed integers from the high + /// halves of and . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackHigh(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackHigh(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipHigh(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); + } + + /// + /// Unpack and interleave 8-bit signed integers from the low half of and + /// and store the results in the result. + /// + /// + /// The first vector containing packed 8-bit signed integers to unpack from the low half. + /// + /// + /// The second vector containing packed 8-bit signed integers to unpack from the low half. + /// + /// + /// A vector containing the unpacked and interleaved 8-bit signed integers from the low + /// halves of and . 
+ /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 UnpackLow(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.UnpackLow(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.Arm64.ZipLow(left, right); + } + + Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower()); + return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)); } /// @@ -817,16 +886,65 @@ internal static class Vector128_ Vector128 diffHi = leftHi - rightHi; // Clamp to signed 16-bit range - Vector128 shortMin = Vector128.Create((int)short.MinValue); - Vector128 shortMax = Vector128.Create((int)short.MaxValue); + Vector128 min = Vector128.Create((int)short.MinValue); + Vector128 max = Vector128.Create((int)short.MaxValue); - diffLo = Clamp(diffLo, shortMin, shortMax); - diffHi = Clamp(diffHi, shortMin, shortMax); + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); // Narrow back to 16 bit signed. return Vector128.Narrow(diffLo, diffHi); } + /// + /// Add packed unsigned 8-bit integers in from packed unsigned 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed unsigned 8-bit integers to add to. + /// + /// + /// The second vector containing packed unsigned 8-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed unsigned 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to unsigned 8-bit range + Vector128 max = Vector128.Create((ushort)byte.MaxValue); + + sumLo = Clamp(sumLo, Vector128.Zero, max); + sumHi = Clamp(sumHi, Vector128.Zero, max); + + // Narrow back to bytes + return Vector128.Narrow(sumLo, sumHi); + } + /// /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers /// in using saturation, and store the results. @@ -876,6 +994,103 @@ internal static class Vector128_ return Vector128.Narrow(diffLo, diffHi); } - [DoesNotReturn] - private static void ThrowUnreachableException() => throw new UnreachableException(); + /// + /// Add packed signed 8-bit integers in to packed signed 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 8-bit integers to add to. + /// + /// + /// The second vector containing packed signed 8-bit integers to add. 
+ /// + /// + /// A vector containing the results of adding packed signed 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 AddSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.AddSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.AddSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.AddSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Add + Vector128 sumLo = leftLo + rightLo; + Vector128 sumHi = leftHi + rightHi; + + // Clamp to signed 8-bit range + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + + sumLo = Clamp(sumLo, min, max); + sumHi = Clamp(sumHi, min, max); + + // Narrow back to signed bytes + return Vector128.Narrow(sumLo, sumHi); + } + + /// + /// Subtract packed signed 8-bit integers in from packed signed 8-bit integers + /// in using saturation, and store the results. + /// + /// + /// The first vector containing packed signed 8-bit integers to subtract from. + /// + /// + /// The second vector containing packed signed 8-bit integers to subtract. 
+ /// + /// + /// A vector containing the results of subtracting packed signed 8-bit integers + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 SubtractSaturate(Vector128 left, Vector128 right) + { + if (Sse2.IsSupported) + { + return Sse2.SubtractSaturate(left, right); + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.SubtractSaturate(left, right); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.SubtractSaturate(left, right); + } + + // Widen inputs to 16-bit + (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left); + (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right); + + // Subtract + Vector128 diffLo = leftLo - rightLo; + Vector128 diffHi = leftHi - rightHi; + + // Clamp to signed 8-bit range + Vector128 min = Vector128.Create((short)sbyte.MinValue); + Vector128 max = Vector128.Create((short)sbyte.MaxValue); + + diffLo = Clamp(diffLo, min, max); + diffHi = Clamp(diffHi, min, max); + + // Narrow back to signed bytes + return Vector128.Narrow(diffLo, diffHi); + } } diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index 4e61242c06..b21e3c02ba 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -1521,20 +1521,20 @@ internal static class LossyUtils Vector128 p1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - (2 * stride)))); Vector128 p0 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset - stride))); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t1, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Vector128.Max(mask, AbsVector128(t1, p2)); + mask = Vector128.Max(mask, AbsVector128(p2, p1)); Vector128 q0 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); Vector128 q1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 q2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * 
stride)))); t1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t1, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Vector128.Max(mask, AbsVector128(q1, q0)); + mask = Vector128.Max(mask, AbsVector128(t1, q2)); + mask = Vector128.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. @@ -1561,17 +1561,17 @@ internal static class LossyUtils ref byte bRef = ref Unsafe.Add(ref pRef, (uint)offset - 4); Load16x4(ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(q3, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(q3, q2)); + mask = Sse2.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); Store16x4(p3, p2, p1, p0, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); @@ -1599,22 +1599,22 @@ internal static class LossyUtils Span b = p[(offset + (2 * stride))..]; offset += 4 * stride; - Vector128 mask = Abs(p0, p1); - mask = Sse2.Max(mask, Abs(p3, p2)); 
- mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p0, p1); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); p3 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)offset)); p2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + stride))); Vector128 tmp1 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (2 * stride)))); Vector128 tmp2 = Unsafe.As>(ref Unsafe.Add(ref pRef, (uint)(offset + (3 * stride)))); - mask = Sse2.Max(mask, Abs(tmp1, tmp2)); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, tmp1)); + mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); // p3 and p2 are not just temporary variables here: they will be // re-used for next span. And q2/q3 will become p1/p0 accordingly. - ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); // Store. @@ -1656,17 +1656,17 @@ internal static class LossyUtils offset += 4; // Compute partial mask. 
- mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); Load16x4(ref Unsafe.Add(ref pRef, (uint)offset), ref Unsafe.Add(ref pRef, (uint)(offset + (8 * stride))), stride, out p3, out p2, out Vector128 tmp1, out Vector128 tmp2); - mask = Sse2.Max(mask, Abs(tmp1, tmp2)); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, tmp1)); + mask = Sse2.Max(mask, AbsVector128(tmp1, tmp2)); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, tmp1)); - ComplexMask(p1, p0, p3, p2, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, p3, p2, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref p3, ref p2, mask, hevThresh); Store16x4(p1, p0, p3, p2, ref bRef, ref Unsafe.Add(ref bRef, 8 * (uint)stride), stride); @@ -1695,34 +1695,34 @@ internal static class LossyUtils // Load uv h-edges. 
ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset - (4 * stride)); - Vector128 p2 = LoadUvEdge(ref uRef, ref vRef, offset - (3 * stride)); - Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset - (2 * stride)); - Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset - stride); + Vector128 t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (4 * stride)); + Vector128 p2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (3 * stride)); + Vector128 p1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - (2 * stride)); + Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset - stride); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t1, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(t1, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); - Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - Vector128 q2 = LoadUvEdge(ref uRef, ref vRef, offset + (2 * stride)); - t1 = LoadUvEdge(ref uRef, ref vRef, offset + (3 * stride)); + Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + Vector128 q2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (2 * stride)); + t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (3 * stride)); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t1, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(t1, q2)); + mask = Sse2.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); // Store. 
- StoreUv(p2, ref uRef, ref vRef, offset - (3 * stride)); - StoreUv(p1, ref uRef, ref vRef, offset - (2 * stride)); - StoreUv(p0, ref uRef, ref vRef, offset - stride); - StoreUv(q0, ref uRef, ref vRef, offset); - StoreUv(q1, ref uRef, ref vRef, offset + (1 * stride)); - StoreUv(q2, ref uRef, ref vRef, offset + (2 * stride)); + StoreUvVector128(p2, ref uRef, ref vRef, offset - (3 * stride)); + StoreUvVector128(p1, ref uRef, ref vRef, offset - (2 * stride)); + StoreUvVector128(p0, ref uRef, ref vRef, offset - stride); + StoreUvVector128(q0, ref uRef, ref vRef, offset); + StoreUvVector128(q1, ref uRef, ref vRef, offset + (1 * stride)); + StoreUvVector128(q2, ref uRef, ref vRef, offset + (2 * stride)); } else { @@ -1740,17 +1740,17 @@ internal static class LossyUtils ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4(ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride, out Vector128 p3, out Vector128 p2, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(p3, p2)); - mask = Sse2.Max(mask, Abs(p2, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(p3, p2)); + mask = Sse2.Max(mask, AbsVector128(p2, p1)); Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out Vector128 q2, out Vector128 q3); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(q3, q2)); - mask = Sse2.Max(mask, Abs(q2, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(q3, q2)); + mask = Sse2.Max(mask, AbsVector128(q2, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter6Sse2(ref p2, ref p1, ref p0, ref q0, ref q1, ref q2, mask, hevThresh); Store16x4(p3, p2, p1, p0, ref Unsafe.Add(ref uRef, (uint)offset - 4), ref Unsafe.Add(ref vRef, (uint)offset - 4), stride); 
@@ -1771,34 +1771,34 @@ internal static class LossyUtils // Load uv h-edges. ref byte uRef = ref MemoryMarshal.GetReference(u); ref byte vRef = ref MemoryMarshal.GetReference(v); - Vector128 t2 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 t1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - Vector128 p1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); - Vector128 p0 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + Vector128 t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + Vector128 p1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); + Vector128 p0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, p1)); offset += 4 * stride; - Vector128 q0 = LoadUvEdge(ref uRef, ref vRef, offset); - Vector128 q1 = LoadUvEdge(ref uRef, ref vRef, offset + stride); - t1 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 2)); - t2 = LoadUvEdge(ref uRef, ref vRef, offset + (stride * 3)); + Vector128 q0 = LoadUvEdgeVector128(ref uRef, ref vRef, offset); + Vector128 q1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + stride); + t1 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 2)); + t2 = LoadUvEdgeVector128(ref uRef, ref vRef, offset + (stride * 3)); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Store. 
- StoreUv(p1, ref uRef, ref vRef, offset + (-2 * stride)); - StoreUv(p0, ref uRef, ref vRef, offset + (-1 * stride)); - StoreUv(q0, ref uRef, ref vRef, offset); - StoreUv(q1, ref uRef, ref vRef, offset + stride); + StoreUvVector128(p1, ref uRef, ref vRef, offset + (-2 * stride)); + StoreUvVector128(p0, ref uRef, ref vRef, offset + (-1 * stride)); + StoreUvVector128(q0, ref uRef, ref vRef, offset); + StoreUvVector128(q1, ref uRef, ref vRef, offset + stride); } else { @@ -1817,20 +1817,20 @@ internal static class LossyUtils ref byte vRef = ref MemoryMarshal.GetReference(v); Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 t2, out Vector128 t1, out Vector128 p1, out Vector128 p0); - Vector128 mask = Abs(p1, p0); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, p1)); + Vector128 mask = AbsVector128(p1, p0); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, p1)); // Beginning of q0. offset += 4; Load16x4(ref Unsafe.Add(ref uRef, (uint)offset), ref Unsafe.Add(ref vRef, (uint)offset), stride, out Vector128 q0, out Vector128 q1, out t1, out t2); - mask = Sse2.Max(mask, Abs(q1, q0)); - mask = Sse2.Max(mask, Abs(t2, t1)); - mask = Sse2.Max(mask, Abs(t1, q1)); + mask = Sse2.Max(mask, AbsVector128(q1, q0)); + mask = Sse2.Max(mask, AbsVector128(t2, t1)); + mask = Sse2.Max(mask, AbsVector128(t1, q1)); - ComplexMask(p1, p0, q0, q1, thresh, ithresh, ref mask); + ComplexMaskVector128(p1, p0, q0, q1, thresh, ithresh, ref mask); DoFilter4Sse2(ref p1, ref p0, ref q0, ref q1, mask, hevThresh); // Beginning of p1. @@ -2057,24 +2057,24 @@ internal static class LossyUtils Vector128 signBit = Vector128.Create((byte)0x80); // Convert p1/q1 to byte (for GetBaseDelta). 
- Vector128 p1s = Sse2.Xor(p1, signBit); - Vector128 q1s = Sse2.Xor(q1, signBit); - Vector128 mask = NeedsFilter(p1, p0, q0, q1, thresh); + Vector128 p1s = p1 ^ signBit; + Vector128 q1s = q1 ^ signBit; + Vector128 mask = NeedsFilterVector128(p1, p0, q0, q1, thresh); // Flip sign. - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 ^= signBit; + q0 ^= signBit; - Vector128 a = GetBaseDelta(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); + Vector128 a = GetBaseDeltaVector128(p1s.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1s.AsSByte()).AsByte(); // Mask filter values we don't care about. - a = Sse2.And(a, mask); + a &= mask; DoSimpleFilterSse2(ref p0, ref q0, a); // Flip sign. - p0 = Sse2.Xor(p0, signBit); - q0 = Sse2.Xor(q0, signBit); + p0 ^= signBit; + q0 ^= signBit; } // Applies filter on 4 pixels (p1, p0, q0 and q1) @@ -2101,8 +2101,8 @@ internal static class LossyUtils t2 = Sse2.AddSaturate(t1, Vector128.Create((byte)3).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 3 Vector128 t3 = Sse2.AddSaturate(t1, Vector128.Create((byte)4).AsSByte()); // 3 * (q0 - p0) + hev(p1 - q1) + 4 - t2 = SignedShift8b(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 - t3 = SignedShift8b(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + t2 = SignedShift8bVector128(t2.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + t3 = SignedShift8bVector128(t3.AsByte()); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 p0 = Sse2.AddSaturate(p0.AsSByte(), t2).AsByte(); // p0 += t2 q0 = Sse2.SubtractSaturate(q0.AsSByte(), t3).AsByte(); // q0 -= t3 p0 = Sse2.Xor(p0, signBit); @@ -2135,7 +2135,7 @@ internal static class LossyUtils p2 = Sse2.Xor(p2, signBit); q2 = Sse2.Xor(q2, signBit); - Vector128 a = GetBaseDelta(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); + Vector128 a = GetBaseDeltaVector128(p1.AsSByte(), p0.AsSByte(), q0.AsSByte(), q1.AsSByte()); // Do simple filter on pixels with hev. 
Vector128 m = Sse2.AndNot(notHev, mask); @@ -2162,9 +2162,9 @@ internal static class LossyUtils Vector128 a0Low = Sse2.Add(a1Low, f9Low); // Filter * 27 + 63 Vector128 a0High = Sse2.Add(a1High, f9High); // Filter * 27 + 63 - Update2Pixels(ref p2, ref q2, a2Low, a2High); - Update2Pixels(ref p1, ref q1, a1Low, a1High); - Update2Pixels(ref p0, ref q0, a0Low, a0High); + Update2PixelsVector128(ref p2, ref q2, a2Low, a2High); + Update2PixelsVector128(ref p1, ref q1, a1Low, a1High); + Update2PixelsVector128(ref p0, ref q0, a0Low, a0High); } private static void DoSimpleFilterSse2(ref Vector128 p0, ref Vector128 q0, Vector128 fl) @@ -2172,16 +2172,16 @@ internal static class LossyUtils Vector128 v3 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)3).AsSByte()); Vector128 v4 = Sse2.AddSaturate(fl.AsSByte(), Vector128.Create((byte)4).AsSByte()); - v4 = SignedShift8b(v4.AsByte()).AsSByte(); // v4 >> 3 - v3 = SignedShift8b(v3.AsByte()).AsSByte(); // v3 >> 3 + v4 = SignedShift8bVector128(v4.AsByte()).AsSByte(); // v4 >> 3 + v3 = SignedShift8bVector128(v3.AsByte()).AsSByte(); // v3 >> 3 q0 = Sse2.SubtractSaturate(q0.AsSByte(), v4).AsByte(); // q0 -= v4 p0 = Sse2.AddSaturate(p0.AsSByte(), v3).AsByte(); // p0 += v3 } private static Vector128 GetNotHev(ref Vector128 p1, ref Vector128 p0, ref Vector128 q0, ref Vector128 q1, int hevThresh) { - Vector128 t1 = Abs(p1, p0); - Vector128 t2 = Abs(q1, q0); + Vector128 t1 = AbsVector128(p1, p0); + Vector128 t2 = AbsVector128(q1, q0); Vector128 h = Vector128.Create((byte)hevThresh); Vector128 tMax = Sse2.Max(t1, t2); @@ -2270,21 +2270,21 @@ internal static class LossyUtils WebpLookupTables.Abs0(q2 - q1) <= it && WebpLookupTables.Abs0(q1 - q0) <= it; } - private static Vector128 NeedsFilter(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) + private static Vector128 NeedsFilterVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh) { Vector128 mthresh = Vector128.Create((byte)thresh); - Vector128 
t1 = Abs(p1, q1); // abs(p1 - q1) + Vector128 t1 = AbsVector128(p1, q1); // abs(p1 - q1) Vector128 fe = Vector128.Create((byte)0xFE); - Vector128 t2 = Sse2.And(t1, fe); // set lsb of each byte to zero. - Vector128 t3 = Sse2.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 + Vector128 t2 = t1 & fe; // set lsb of each byte to zero. + Vector128 t3 = Vector128.ShiftRightLogical(t2.AsInt16(), 1); // abs(p1 - q1) / 2 - Vector128 t4 = Abs(p0, q0); // abs(p0 - q0) - Vector128 t5 = Sse2.AddSaturate(t4, t4); // abs(p0 - q0) * 2 - Vector128 t6 = Sse2.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 + Vector128 t4 = AbsVector128(p0, q0); // abs(p0 - q0) + Vector128 t5 = Vector128_.AddSaturate(t4, t4); // abs(p0 - q0) * 2 + Vector128 t6 = Vector128_.AddSaturate(t5.AsByte(), t3.AsByte()); // abs(p0-q0)*2 + abs(p1-q1)/2 - Vector128 t7 = Sse2.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh + Vector128 t7 = Vector128_.SubtractSaturate(t6, mthresh.AsByte()); // mask <= m_thresh - return Sse2.CompareEqual(t7, Vector128.Zero); + return Vector128.Equals(t7, Vector128.Zero); } private static void Load16x4(ref byte r0, ref byte r8, int stride, out Vector128 p1, out Vector128 p0, out Vector128 q0, out Vector128 q1) @@ -2304,8 +2304,8 @@ internal static class LossyUtils // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - Load8x4(ref r0, (uint)stride, out Vector128 t1, out Vector128 t2); - Load8x4(ref r8, (uint)stride, out p0, out q1); + Load8x4Vector128(ref r0, (uint)stride, out Vector128 t1, out Vector128 t2); + Load8x4Vector128(ref r8, (uint)stride, out p0, out q1); // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 @@ -2318,7 +2318,7 @@ internal static class LossyUtils } // Reads 8 rows across a vertical edge. 
- private static void Load8x4(ref byte bRef, nuint stride, out Vector128 p, out Vector128 q) + private static void Load8x4Vector128(ref byte bRef, nuint stride, out Vector128 p, out Vector128 q) { // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00 // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10 @@ -2335,18 +2335,18 @@ internal static class LossyUtils // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - Vector128 b0 = Sse2.UnpackLow(a0.AsSByte(), a1.AsSByte()); - Vector128 b1 = Sse2.UnpackHigh(a0.AsSByte(), a1.AsSByte()); + Vector128 b0 = Vector128_.UnpackLow(a0.AsSByte(), a1.AsSByte()); + Vector128 b1 = Vector128_.UnpackHigh(a0.AsSByte(), a1.AsSByte()); // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - Vector128 c0 = Sse2.UnpackLow(b0.AsInt16(), b1.AsInt16()); - Vector128 c1 = Sse2.UnpackHigh(b0.AsInt16(), b1.AsInt16()); + Vector128 c0 = Vector128_.UnpackLow(b0.AsInt16(), b1.AsInt16()); + Vector128 c1 = Vector128_.UnpackHigh(b0.AsInt16(), b1.AsInt16()); // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - p = Sse2.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); - q = Sse2.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); + p = Vector128_.UnpackLow(c0.AsInt32(), c1.AsInt32()).AsByte(); + q = Vector128_.UnpackHigh(c0.AsInt32(), c1.AsInt32()).AsByte(); } // Transpose back and store @@ -2393,67 +2393,65 @@ internal static class LossyUtils } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 GetBaseDelta(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) + private static Vector128 GetBaseDeltaVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1) { // Beware of addition order, for saturation! 
- Vector128 p1q1 = Sse2.SubtractSaturate(p1, q1); // p1 - q1 - Vector128 q0p0 = Sse2.SubtractSaturate(q0, p0); // q0 - p0 - Vector128 s1 = Sse2.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) - Vector128 s2 = Sse2.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) - Vector128 s3 = Sse2.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) - - return s3; + Vector128 p1q1 = Vector128_.SubtractSaturate(p1, q1); // p1 - q1 + Vector128 q0p0 = Vector128_.SubtractSaturate(q0, p0); // q0 - p0 + Vector128 s1 = Vector128_.AddSaturate(p1q1, q0p0); // p1 - q1 + 1 * (q0 - p0) + Vector128 s2 = Vector128_.AddSaturate(q0p0, s1); // p1 - q1 + 2 * (q0 - p0) + return Vector128_.AddSaturate(q0p0, s2); // p1 - q1 + 3 * (q0 - p0) } // Shift each byte of "x" by 3 bits while preserving by the sign bit. [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 SignedShift8b(Vector128 x) + private static Vector128 SignedShift8bVector128(Vector128 x) { - Vector128 low0 = Sse2.UnpackLow(Vector128.Zero, x); - Vector128 high0 = Sse2.UnpackHigh(Vector128.Zero, x); - Vector128 low1 = Sse2.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); - Vector128 high1 = Sse2.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); + Vector128 low0 = Vector128_.UnpackLow(Vector128.Zero, x); + Vector128 high0 = Vector128_.UnpackHigh(Vector128.Zero, x); + Vector128 low1 = Vector128.ShiftRightArithmetic(low0.AsInt16(), 3 + 8); + Vector128 high1 = Vector128.ShiftRightArithmetic(high0.AsInt16(), 3 + 8); - return Sse2.PackSignedSaturate(low1, high1); + return Vector128_.PackSignedSaturate(low1, high1); } [MethodImpl(InliningOptions.ShortMethod)] - private static void ComplexMask(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) + private static void ComplexMaskVector128(Vector128 p1, Vector128 p0, Vector128 q0, Vector128 q1, int thresh, int ithresh, ref Vector128 mask) { Vector128 it = Vector128.Create((byte)ithresh); - Vector128 diff = Sse2.SubtractSaturate(mask, it); - 
Vector128 threshMask = Sse2.CompareEqual(diff, Vector128.Zero); - Vector128 filterMask = NeedsFilter(p1, p0, q0, q1, thresh); + Vector128 diff = Vector128_.SubtractSaturate(mask, it); + Vector128 threshMask = Vector128.Equals(diff, Vector128.Zero); + Vector128 filterMask = NeedsFilterVector128(p1, p0, q0, q1, thresh); - mask = Sse2.And(threshMask, filterMask); + mask = threshMask & filterMask; } // Updates values of 2 pixels at MB edge during complex filtering. // Update operations: // q = q - delta and p = p + delta; where delta = [(a_hi >> 7), (a_lo >> 7)] // Pixels 'pi' and 'qi' are int8_t on input, uint8_t on output (sign flip). - private static void Update2Pixels(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) + private static void Update2PixelsVector128(ref Vector128 pi, ref Vector128 qi, Vector128 a0Low, Vector128 a0High) { Vector128 signBit = Vector128.Create((byte)0x80); - Vector128 a1Low = Sse2.ShiftRightArithmetic(a0Low, 7); - Vector128 a1High = Sse2.ShiftRightArithmetic(a0High, 7); - Vector128 delta = Sse2.PackSignedSaturate(a1Low, a1High); - pi = Sse2.AddSaturate(pi.AsSByte(), delta).AsByte(); - qi = Sse2.SubtractSaturate(qi.AsSByte(), delta).AsByte(); - pi = Sse2.Xor(pi, signBit.AsByte()); - qi = Sse2.Xor(qi, signBit.AsByte()); + Vector128 a1Low = Vector128.ShiftRightArithmetic(a0Low, 7); + Vector128 a1High = Vector128.ShiftRightArithmetic(a0High, 7); + Vector128 delta = Vector128_.PackSignedSaturate(a1Low, a1High); + pi = Vector128_.AddSaturate(pi.AsSByte(), delta).AsByte(); + qi = Vector128_.SubtractSaturate(qi.AsSByte(), delta).AsByte(); + pi ^= signBit.AsByte(); + qi ^= signBit.AsByte(); } [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 LoadUvEdge(ref byte uRef, ref byte vRef, int offset) + private static Vector128 LoadUvEdgeVector128(ref byte uRef, ref byte vRef, int offset) { Vector128 uVec = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref uRef, (uint)offset)), 0); Vector128 vVec = 
Vector128.Create(Unsafe.As(ref Unsafe.Add(ref vRef, (uint)offset)), 0); - return Sse2.UnpackLow(uVec, vVec).AsByte(); + return Vector128_.UnpackLow(uVec, vVec).AsByte(); } [MethodImpl(InliningOptions.ShortMethod)] - private static void StoreUv(Vector128 x, ref byte uRef, ref byte vRef, int offset) + private static void StoreUvVector128(Vector128 x, ref byte uRef, ref byte vRef, int offset) { Unsafe.As>(ref Unsafe.Add(ref uRef, (uint)offset)) = x.GetLower(); Unsafe.As>(ref Unsafe.Add(ref vRef, (uint)offset)) = x.GetUpper(); @@ -2461,8 +2459,8 @@ internal static class LossyUtils // Compute abs(p - q) = subs(p - q) OR subs(q - p) [MethodImpl(InliningOptions.ShortMethod)] - private static Vector128 Abs(Vector128 p, Vector128 q) - => Sse2.Or(Sse2.SubtractSaturate(q, p), Sse2.SubtractSaturate(p, q)); + private static Vector128 AbsVector128(Vector128 p, Vector128 q) + => Vector128_.SubtractSaturate(q, p) | Vector128_.SubtractSaturate(p, q); [MethodImpl(InliningOptions.ShortMethod)] private static bool Hev(Span p, int offset, int step, int thresh) @@ -2511,5 +2509,5 @@ internal static class LossyUtils private static void Memset(Span dst, byte value, int startIdx, int count) => dst.Slice(startIdx, count).Fill(value); [MethodImpl(InliningOptions.ShortMethod)] - private static int Clamp255(int x) => x < 0 ? 0 : x > 255 ? 255 : x; + private static int Clamp255(int x) => Numerics.Clamp(x, 0, 255); }