From e553807429ff282e919d478704165055c9a19465 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 4 Jun 2025 12:10:45 +1000 Subject: [PATCH] Port LosslessUtils V128 --- .../Common/Helpers/Vector128Utilities.cs | 63 ++++++ .../Common/Helpers/Vector256Utilities.cs | 4 +- .../Common/Helpers/Vector512Utilities.cs | 65 +----- .../Formats/Webp/Lossless/LosslessUtils.cs | 189 +++++++----------- 4 files changed, 144 insertions(+), 177 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs index 2228dae49a..a3b8e0156e 100644 --- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs @@ -126,6 +126,33 @@ internal static class Vector128_ return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices)); } + /// + /// Shuffle 16-bit integers in the low 64 bits of using the control in . + /// Store the results in the low 64 bits of the destination, with the high 64 bits being copied from . + /// + /// The input vector containing packed 16-bit integers to shuffle. + /// The shuffle control byte. + /// + /// A vector containing the shuffled 16-bit integers in the low 64 bits, with the high 64 bits copied from . + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShuffleLow(Vector128 value, [ConstantExpected] byte control) + { + if (Sse2.IsSupported) + { + return Sse2.ShuffleLow(value, control); + } + + // Don't use InverseMMShuffle here as we want to avoid the cast. + Vector64 indices = Vector64.Create( + (short)(control & 0x3), + (short)((control >> 2) & 0x3), + (short)((control >> 4) & 0x3), + (short)((control >> 6) & 0x3)); + + return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper()); + } + /// /// Creates a new vector by selecting values from an input vector using a set of indices. /// @@ -198,6 +225,42 @@ internal static class Vector128_ return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes)); } + /// + /// Shift packed 16-bit integers in left by while + /// shifting in zeros, and store the results + /// + /// The vector containing packed 16-bit integers to shift. + /// The number of bits to shift left. + /// + /// A vector containing the packed 16-bit integers shifted left by , with zeros shifted in. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public static Vector128 ShiftLeftLogical(Vector128 value, [ConstantExpected] byte count) + { + if (Sse2.IsSupported) + { + return Sse2.ShiftLeftLogical(value, count); + } + + // Zero lanes where count >= 16 to match SSE2 + if (count >= 16) + { + return Vector128.Zero; + } + + if (AdvSimd.IsSupported) + { + return AdvSimd.ShiftLogical(value, Vector128.Create((short)count)); + } + + if (PackedSimd.IsSupported) + { + return PackedSimd.ShiftLeft(value, count); + } + + return Vector128.ShiftLeft(value, count); + } + /// /// Right aligns elements of two source 128-bit values depending on bits in a mask. /// diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs index e1c40107fe..4769df2b0b 100644 --- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs @@ -46,9 +46,7 @@ internal static class Vector256_ return Avx2.Shuffle(vector, indices); } - return Vector256.Create( - Vector128_.ShuffleNative(vector.GetLower(), indices.GetLower()), - Vector128_.ShuffleNative(vector.GetUpper(), indices.GetUpper())); + return Vector256.Shuffle(vector, indices); } /// diff --git a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs index ded47f48ee..03ee4626cd 100644 --- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs +++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs @@ -46,9 +46,7 @@ internal static class Vector512_ return Avx512BW.Shuffle(vector, indices); } - return Vector512.Create( - Vector256_.ShuffleNative(vector.GetLower(), indices.GetLower()), - Vector256_.ShuffleNative(vector.GetUpper(), indices.GetUpper())); + return Vector512.Shuffle(vector, indices); } /// @@ -59,25 +57,7 @@ internal static class Vector512_ /// The . [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 ConvertToInt32RoundToEven(Vector512 vector) - { - if (Avx512F.IsSupported) - { - return Avx512F.ConvertToVector512Int32(vector); - } - - if (Avx.IsSupported) - { - Vector256 lower = Avx.ConvertToVector256Int32(vector.GetLower()); - Vector256 upper = Avx.ConvertToVector256Int32(vector.GetUpper()); - return Vector512.Create(lower, upper); - } - - Vector512 sign = vector & Vector512.Create(-0.0f); - Vector512 val_2p23_f32 = sign | Vector512.Create(8388608.0f); - - val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; - return Vector512.ConvertToInt32(val_2p23_f32 | sign); - } + => Avx512F.ConvertToVector512Int32(vector); /// /// Rounds all values in to the nearest integer @@ -86,28 +66,11 @@ internal static class Vector512_ /// The vector [MethodImpl(MethodImplOptions.AggressiveInlining)] public static Vector512 RoundToNearestInteger(Vector512 vector) - { - if (Avx512F.IsSupported) - { - // imm8 = 0b1000: - // imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers) - // imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions) - return Avx512F.RoundScale(vector, 0b0000_1000); - } - if (Avx.IsSupported) - { - Vector256 lower = Avx.RoundToNearestInteger(vector.GetLower()); - Vector256 upper = Avx.RoundToNearestInteger(vector.GetUpper()); - return Vector512.Create(lower, upper); - } - - Vector512 sign = vector & Vector512.Create(-0F); - Vector512 val_2p23_f32 = sign | Vector512.Create(8388608F); - - val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32; - return val_2p23_f32 | sign; - } + // imm8 = 0b1000: + // imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers) + // imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions) + => Avx512F.RoundScale(vector, 0b0000_1000); /// /// Performs a multiplication and an addition of the . @@ -122,21 +85,7 @@ internal static class Vector512_ Vector512 va, Vector512 vm0, Vector512 vm1) - { - if (Avx512F.IsSupported) - { - return Avx512F.FusedMultiplyAdd(vm0, vm1, va); - } - - if (Fma.IsSupported) - { - Vector256 lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower()); - Vector256 upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper()); - return Vector512.Create(lower, upper); - } - - return va + (vm0 * vm1); - } + => Avx512F.FusedMultiplyAdd(vm0, vm1, va); /// /// Restricts a vector between a minimum and a maximum value. diff --git a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs index 5287f0b753..b96525b426 100644 --- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs @@ -6,6 +6,7 @@ using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; +using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Memory; namespace SixLabors.ImageSharp.Formats.Webp.Lossless; @@ -94,7 +95,7 @@ internal static unsafe class LosslessUtils /// The pixel data to apply the transformation. public static void AddGreenToBlueAndRed(Span pixelData) { - if (Avx2.IsSupported && pixelData.Length >= 8) + if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { Vector256 addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; @@ -103,8 +104,8 @@ internal static unsafe class LosslessUtils { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2); - Vector256 output = Avx2.Add(input, in0g0g); + Vector256 in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMaskAvx2); + Vector256 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; } @@ -115,39 +116,17 @@ internal static unsafe class LosslessUtils AddGreenToBlueAndRedScalar(pixelData[(int)i..]); } } - else if (Ssse3.IsSupported && pixelData.Length >= 4) - { - Vector128 addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); - nuint numPixels = (uint)pixelData.Length; - nuint i = 0; - do - { - ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); - Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3); - Vector128 output = Sse2.Add(input, in0g0g); - Unsafe.As>(ref pos) = output.AsUInt32(); - i += 4; - } - while (i <= numPixels - 4); - - if (i != numPixels) - { - AddGreenToBlueAndRedScalar(pixelData[(int)i..]); - } - } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { + Vector128 addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g - Vector128 output = Sse2.Add(input.AsByte(), c.AsByte()); + Vector128 in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask); + Vector128 output = input + in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 4; } @@ -180,7 +159,7 @@ internal static unsafe class LosslessUtils public static void SubtractGreenFromBlueAndRed(Span pixelData) { - if (Avx2.IsSupported && pixelData.Length >= 8) + if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8) { Vector256 subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255); nuint numPixels = (uint)pixelData.Length; @@ -189,8 +168,8 @@ internal static unsafe class LosslessUtils { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector256 input = Unsafe.As>(ref pos).AsByte(); - Vector256 in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2); - Vector256 output = Avx2.Subtract(input, in0g0g); + Vector256 in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMaskAvx2); + Vector256 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 8; } @@ -201,39 +180,17 @@ internal static unsafe class LosslessUtils SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]); } } - else if (Ssse3.IsSupported && pixelData.Length >= 4) - { - Vector128 subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); - nuint numPixels = (uint)pixelData.Length; - nuint i = 0; - do - { - ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); - Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3); - Vector128 output = Sse2.Subtract(input, in0g0g); - Unsafe.As>(ref pos) = output.AsUInt32(); - i += 4; - } - while (i <= numPixels - 4); - - if (i != numPixels) - { - SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]); - } - } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { + Vector128 subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255); nuint numPixels = (uint)pixelData.Length; nuint i = 0; do { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i); Vector128 input = Unsafe.As>(ref pos).AsByte(); - Vector128 a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g - Vector128 b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g - Vector128 output = Sse2.Subtract(input.AsByte(), c.AsByte()); + Vector128 in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask); + Vector128 output = input - in0g0g; Unsafe.As>(ref pos) = output.AsUInt32(); i += 4; } @@ -412,7 +369,7 @@ internal static unsafe class LosslessUtils TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx); } } - else if (Sse2.IsSupported && numPixels >= 4) + else if (Vector128.IsHardwareAccelerated && numPixels >= 4) { Vector128 transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); Vector128 transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0); @@ -423,16 +380,16 @@ internal static unsafe class LosslessUtils { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); - Vector128 a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.ShiftLeftLogical(input.AsInt16(), 8); - Vector128 f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); - Vector128 g = Sse2.ShiftRightLogical(f.AsInt32(), 16); - Vector128 h = Sse2.Add(g.AsByte(), d.AsByte()); - Vector128 i = Sse2.And(h, transformColorRedBlueMask); - Vector128 output = Sse2.Subtract(input.AsByte(), i); + Vector128 a = input.AsByte() & transformColorAlphaGreenMask; + Vector128 b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8); + Vector128 f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16()); + Vector128 g = Vector128.ShiftRightLogical(f.AsInt32(), 16); + Vector128 h = g.AsByte() + d.AsByte(); + Vector128 i = h & transformColorRedBlueMask; + Vector128 output = input.AsByte() - i; Unsafe.As>(ref pos) = output.AsUInt32(); idx += 4; } @@ -503,7 +460,7 @@ internal static unsafe class LosslessUtils TransformColorInverseScalar(m, pixelData[(int)idx..]); } } - else if (Sse2.IsSupported && pixelData.Length >= 4) + else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4) { Vector128 transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255); Vector128 multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue)); @@ -514,17 +471,17 @@ internal static unsafe class LosslessUtils { ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx); Vector128 input = Unsafe.As>(ref pos); - Vector128 a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask); - Vector128 b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); - Vector128 d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); - Vector128 e = Sse2.Add(input.AsByte(), d.AsByte()); - Vector128 f = Sse2.ShiftLeftLogical(e.AsInt16(), 8); - Vector128 g = Sse2.MultiplyHigh(f, multsb2.AsInt16()); - Vector128 h = Sse2.ShiftRightLogical(g.AsInt32(), 8); - Vector128 i = Sse2.Add(h.AsByte(), f.AsByte()); - Vector128 j = Sse2.ShiftRightLogical(i.AsInt16(), 8); - Vector128 output = Sse2.Or(j.AsByte(), a); + Vector128 a = input.AsByte() & transformColorInverseAlphaGreenMask; + Vector128 b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200); + Vector128 d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16()); + Vector128 e = input.AsByte() + d.AsByte(); + Vector128 f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8); + Vector128 g = Vector128_.MultiplyHigh(f, multsb2.AsInt16()); + Vector128 h = Vector128.ShiftRightLogical(g.AsInt32(), 8); + Vector128 i = h.AsByte() + f.AsByte(); + Vector128 j = Vector128.ShiftRightLogical(i.AsInt16(), 8); + Vector128 output = j.AsByte() | a; Unsafe.As>(ref pos) = output.AsUInt32(); } @@ -1401,15 +1358,15 @@ internal static unsafe class LosslessUtils private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - Vector128 c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128.Zero); - Vector128 c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128.Zero); - Vector128 c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128.Zero); - Vector128 v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16()); - Vector128 v2 = Sse2.Subtract(v1, c2Vec.AsInt16()); - Vector128 b = Sse2.PackUnsignedSaturate(v2, v2); - return Sse2.ConvertToUInt32(b.AsUInt32()); + Vector128 c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128.Zero); + Vector128 c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128.Zero); + Vector128 c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128.Zero); + Vector128 v1 = c0Vec.AsInt16() + c1Vec.AsInt16(); + Vector128 v2 = v1 - c2Vec.AsInt16(); + Vector128 b = Vector128_.PackUnsignedSaturate(v2, v2); + return b.AsUInt32().ToScalar(); } { @@ -1432,20 +1389,20 @@ internal static unsafe class LosslessUtils private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { - Vector128 c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128.Zero); - Vector128 c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128.Zero); - Vector128 b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128.Zero); - Vector128 avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16()); - Vector128 a0 = Sse2.ShiftRightLogical(avg, 1); - Vector128 a1 = Sse2.Subtract(a0, b0.AsInt16()); - Vector128 bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16()); - Vector128 a2 = Sse2.Subtract(a1, bgta); - Vector128 a3 = Sse2.ShiftRightArithmetic(a2, 1); - Vector128 a4 = Sse2.Add(a0, a3).AsInt16(); - Vector128 a5 = Sse2.PackUnsignedSaturate(a4, a4); - return Sse2.ConvertToUInt32(a5.AsUInt32()); + Vector128 c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128.Zero); + Vector128 c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128.Zero); + Vector128 b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128.Zero); + Vector128 avg = c1Vec.AsInt16() + c0Vec.AsInt16(); + Vector128 a0 = Vector128.ShiftRightLogical(avg, 1); + Vector128 a1 = a0 - b0.AsInt16(); + Vector128 bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16()); + Vector128 a2 = a1 - bgta; + Vector128 a3 = Vector128.ShiftRightArithmetic(a2, 1); + Vector128 a4 = (a0 + a3).AsInt16(); + Vector128 a5 = Vector128_.PackUnsignedSaturate(a4, a4); + return a5.AsUInt32().ToScalar(); } { @@ -1475,23 +1432,23 @@ internal static unsafe class LosslessUtils private static uint Select(uint a, uint b, uint c, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { fixed (short* ptr = &MemoryMarshal.GetReference(scratch)) { - Vector128 a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte(); - Vector128 b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte(); - Vector128 c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte(); - Vector128 ac0 = Sse2.SubtractSaturate(a0, c0); - Vector128 ca0 = Sse2.SubtractSaturate(c0, a0); - Vector128 bc0 = Sse2.SubtractSaturate(b0, c0); - Vector128 cb0 = Sse2.SubtractSaturate(c0, b0); - Vector128 ac = Sse2.Or(ac0, ca0); - Vector128 bc = Sse2.Or(bc0, cb0); - Vector128 pa = Sse2.UnpackLow(ac, Vector128.Zero); // |a - c| - Vector128 pb = Sse2.UnpackLow(bc, Vector128.Zero); // |b - c| - Vector128 diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16()); - Sse2.Store((ushort*)ptr, diff); + Vector128 a0 = Vector128.CreateScalar(a).AsByte(); + Vector128 b0 = Vector128.CreateScalar(b).AsByte(); + Vector128 c0 = Vector128.CreateScalar(c).AsByte(); + Vector128 ac0 = Vector128_.SubtractSaturate(a0, c0); + Vector128 ca0 = Vector128_.SubtractSaturate(c0, a0); + Vector128 bc0 = Vector128_.SubtractSaturate(b0, c0); + Vector128 cb0 = Vector128_.SubtractSaturate(c0, b0); + Vector128 ac = ac0 | ca0; + Vector128 bc = bc0 | cb0; + Vector128 pa = Vector128_.UnpackLow(ac, Vector128.Zero); // |a - c| + Vector128 pb = Vector128_.UnpackLow(bc, Vector128.Zero); // |b - c| + Vector128 diff = pb.AsUInt16() - pa.AsUInt16(); + diff.Store((ushort*)ptr); int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0]; return (paMinusPb <= 0) ? a : b; }