Port LosslessUtils V128

11 months ago · e553807429
4 changed files with 144 additions and 177 deletions
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@ -126,6 +126,33 @@ internal static class Vector128_
        return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices));
    }

+    /// <summary>
+    /// Shuffles the four 16-bit integers in the low 64 bits of <paramref name="value"/> using the 2-bit selectors packed in <paramref name="control"/>.
+    /// The results are stored in the low 64 bits of the destination, with the high 64 bits being copied unchanged from <paramref name="value"/>.
+    /// </summary>
+    /// <param name="value">The input vector containing packed 16-bit integers to shuffle.</param>
+    /// <param name="control">The shuffle control byte: four 2-bit fields, least significant first, each holding the index (0-3) of the low-half element to place in result elements 0 through 3.</param>
+    /// <returns>
+    /// A vector containing the shuffled 16-bit integers in the low 64 bits, with the high 64 bits copied from <paramref name="value"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<short> ShuffleLow(Vector128<short> value, [ConstantExpected] byte control)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.ShuffleLow(value, control);
+        }
+
+        // Fallback: decode the control byte into per-element indices. Don't use InverseMMShuffle here as we want to avoid the cast.
+        Vector64<short> indices = Vector64.Create(
+            (short)(control & 0x3), // control bits 1:0 -> result element 0
+            (short)((control >> 2) & 0x3), // control bits 3:2 -> result element 1
+            (short)((control >> 4) & 0x3), // control bits 5:4 -> result element 2
+            (short)((control >> 6) & 0x3)); // control bits 7:6 -> result element 3
+
+        return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper());
+    }
+
    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.
    /// </summary>
@ -198,6 +225,42 @@ internal static class Vector128_
        return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes));
    }

+    /// <summary>
+    /// Shifts the packed 16-bit integers in <paramref name="value"/> left by <paramref name="count"/> bits while
+    /// shifting in zeros, and returns the results.
+    /// </summary>
+    /// <param name="value">The vector containing packed 16-bit integers to shift.</param>
+    /// <param name="count">The number of bits to shift left.</param>
+    /// <returns>
+    /// A vector containing the packed 16-bit integers shifted left by <paramref name="count"/>, with zeros shifted in.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<short> ShiftLeftLogical(Vector128<short> value, [ConstantExpected] byte count)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.ShiftLeftLogical(value, count);
+        }
+
+        // Zero all lanes when count >= 16 to match SSE2; the fallbacks below therefore only ever see counts below 16.
+        if (count >= 16)
+        {
+            return Vector128<short>.Zero;
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.ShiftLogical(value, Vector128.Create((short)count));
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.ShiftLeft(value, count);
+        }
+
+        return Vector128.ShiftLeft(value, count);
+    }
+
    /// <summary>
    /// Right aligns elements of two source 128-bit values depending on bits in a mask.
    /// </summary>
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@ -46,9 +46,7 @@ internal static class Vector256_
            return Avx2.Shuffle(vector, indices);
        }

-        return Vector256.Create(
-            Vector128_.ShuffleNative(vector.GetLower(), indices.GetLower()),
-            Vector128_.ShuffleNative(vector.GetUpper(), indices.GetUpper()));
+        return Vector256.Shuffle(vector, indices);
    }

    /// <summary>
--- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@ -46,9 +46,7 @@ internal static class Vector512_
            return Avx512BW.Shuffle(vector, indices);
        }

-        return Vector512.Create(
-            Vector256_.ShuffleNative(vector.GetLower(), indices.GetLower()),
-            Vector256_.ShuffleNative(vector.GetUpper(), indices.GetUpper()));
+        return Vector512.Shuffle(vector, indices);
    }

    /// <summary>
@ -59,25 +57,7 @@ internal static class Vector512_
    /// <returns>The <see cref="Vector512{Int32}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.ConvertToVector512Int32(vector);
-        }
-
-        if (Avx.IsSupported)
-        {
-            Vector256<int> lower = Avx.ConvertToVector256Int32(vector.GetLower());
-            Vector256<int> upper = Avx.ConvertToVector256Int32(vector.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        Vector512<float> sign = vector & Vector512.Create(-0.0f);
-        Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608.0f);
-
-        val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
-        return Vector512.ConvertToInt32(val_2p23_f32 | sign);
-    }
+        => Avx512F.ConvertToVector512Int32(vector);

    /// <summary>
    /// Rounds all values in <paramref name="vector"/> to the nearest integer
@ -86,28 +66,11 @@ internal static class Vector512_
    /// <param name="vector">The vector</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<float> RoundToNearestInteger(Vector512<float> vector)
-    {
-        if (Avx512F.IsSupported)
-        {
-            // imm8 = 0b1000:
-            //   imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
-            //   imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
-            return Avx512F.RoundScale(vector, 0b0000_1000);
-        }

-        if (Avx.IsSupported)
-        {
-            Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
-            Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        Vector512<float> sign = vector & Vector512.Create(-0F);
-        Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);
-
-        val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
-        return val_2p23_f32 | sign;
-    }
+          // imm8 = 0b1000:
+          //   imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
+          //   imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
+          => Avx512F.RoundScale(vector, 0b0000_1000);

    /// <summary>
    /// Performs a multiplication and an addition of the <see cref="Vector512{Single}"/>.
@ -122,21 +85,7 @@ internal static class Vector512_
        Vector512<float> va,
        Vector512<float> vm0,
        Vector512<float> vm1)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
-        }
-
-        if (Fma.IsSupported)
-        {
-            Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
-            Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        return va + (vm0 * vm1);
-    }
+        => Avx512F.FusedMultiplyAdd(vm0, vm1, va);

    /// <summary>
    /// Restricts a vector between a minimum and a maximum value.
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@ -6,6 +6,7 @@ using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -94,7 +95,7 @@ internal static unsafe class LosslessUtils
    /// <param name="pixelData">The pixel data to apply the transformation.</param>
    public static void AddGreenToBlueAndRed(Span<uint> pixelData)
    {
-        if (Avx2.IsSupported && pixelData.Length >= 8)
+        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
            Vector256<byte> addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
@ -103,8 +104,8 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2);
-                Vector256<byte> output = Avx2.Add(input, in0g0g);
+                Vector256<byte> in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMaskAvx2);
+                Vector256<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
            }
@ -115,39 +116,17 @@ internal static unsafe class LosslessUtils
                AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
            }
        }
-        else if (Ssse3.IsSupported && pixelData.Length >= 4)
-        {
-            Vector128<byte> addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
-            nuint numPixels = (uint)pixelData.Length;
-            nuint i = 0;
-            do
-            {
-                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
-                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<byte> in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3);
-                Vector128<byte> output = Sse2.Add(input, in0g0g);
-                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
-                i += 4;
-            }
-            while (i <= numPixels - 4);
-
-            if (i != numPixels)
-            {
-                AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
-            }
-        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
+            Vector128<byte> addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
-                Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
+                Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask);
+                Vector128<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                i += 4;
            }
@ -180,7 +159,7 @@ internal static unsafe class LosslessUtils

    public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
    {
-        if (Avx2.IsSupported && pixelData.Length >= 8)
+        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
            Vector256<byte> subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
@ -189,8 +168,8 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2);
-                Vector256<byte> output = Avx2.Subtract(input, in0g0g);
+                Vector256<byte> in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMaskAvx2);
+                Vector256<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
            }
@ -201,39 +180,17 @@ internal static unsafe class LosslessUtils
                SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
            }
        }
-        else if (Ssse3.IsSupported && pixelData.Length >= 4)
-        {
-            Vector128<byte> subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
-            nuint numPixels = (uint)pixelData.Length;
-            nuint i = 0;
-            do
-            {
-                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
-                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<byte> in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3);
-                Vector128<byte> output = Sse2.Subtract(input, in0g0g);
-                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
-                i += 4;
-            }
-            while (i <= numPixels - 4);
-
-            if (i != numPixels)
-            {
-                SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
-            }
-        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
+            Vector128<byte> subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
-                Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
+                Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask);
+                Vector128<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                i += 4;
            }
@ -412,7 +369,7 @@ internal static unsafe class LosslessUtils
                TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx);
            }
        }
-        else if (Sse2.IsSupported && numPixels >= 4)
+        else if (Vector128.IsHardwareAccelerated && numPixels >= 4)
        {
            Vector128<byte> transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
            Vector128<byte> transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
@ -423,16 +380,16 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
                Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
-                Vector128<byte> a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask);
-                Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
-                Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
-                Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
-                Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
-                Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
-                Vector128<byte> i = Sse2.And(h, transformColorRedBlueMask);
-                Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
+                Vector128<byte> a = input.AsByte() & transformColorAlphaGreenMask;
+                Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+                Vector128<short> e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8);
+                Vector128<short> f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
+                Vector128<int> g = Vector128.ShiftRightLogical(f.AsInt32(), 16);
+                Vector128<byte> h = g.AsByte() + d.AsByte();
+                Vector128<byte> i = h & transformColorRedBlueMask;
+                Vector128<byte> output = input.AsByte() - i;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                idx += 4;
            }
@ -503,7 +460,7 @@ internal static unsafe class LosslessUtils
                TransformColorInverseScalar(m, pixelData[(int)idx..]);
            }
        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
            Vector128<byte> transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
            Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
@ -514,17 +471,17 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
                Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
-                Vector128<byte> a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask);
-                Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
-                Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
-                Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
-                Vector128<short> g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
-                Vector128<int> h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
-                Vector128<byte> i = Sse2.Add(h.AsByte(), f.AsByte());
-                Vector128<short> j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
-                Vector128<byte> output = Sse2.Or(j.AsByte(), a);
+                Vector128<byte> a = input.AsByte() & transformColorInverseAlphaGreenMask;
+                Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+                Vector128<byte> e = input.AsByte() + d.AsByte();
+                Vector128<short> f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8);
+                Vector128<short> g = Vector128_.MultiplyHigh(f, multsb2.AsInt16());
+                Vector128<int> h = Vector128.ShiftRightLogical(g.AsInt32(), 8);
+                Vector128<byte> i = h.AsByte() + f.AsByte();
+                Vector128<short> j = Vector128.ShiftRightLogical(i.AsInt16(), 8);
+                Vector128<byte> output = j.AsByte() | a;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
            }

@ -1401,15 +1358,15 @@ internal static unsafe class LosslessUtils

    private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-            Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
-            Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
-            Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
-            return Sse2.ConvertToUInt32(b.AsUInt32());
+            Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
+            Vector128<short> v1 = c0Vec.AsInt16() + c1Vec.AsInt16();
+            Vector128<short> v2 = v1 - c2Vec.AsInt16();
+            Vector128<byte> b = Vector128_.PackUnsignedSaturate(v2, v2);
+            return b.AsUInt32().ToScalar();
        }

        {
@ -1432,20 +1389,20 @@ internal static unsafe class LosslessUtils

    private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-            Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
-            Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
-            Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
-            Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
-            Vector128<short> a2 = Sse2.Subtract(a1, bgta);
-            Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
-            Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
-            Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
-            return Sse2.ConvertToUInt32(a5.AsUInt32());
+            Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
+            Vector128<short> avg = c1Vec.AsInt16() + c0Vec.AsInt16();
+            Vector128<short> a0 = Vector128.ShiftRightLogical(avg, 1);
+            Vector128<short> a1 = a0 - b0.AsInt16();
+            Vector128<short> bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16());
+            Vector128<short> a2 = a1 - bgta;
+            Vector128<short> a3 = Vector128.ShiftRightArithmetic(a2, 1);
+            Vector128<short> a4 = (a0 + a3).AsInt16();
+            Vector128<byte> a5 = Vector128_.PackUnsignedSaturate(a4, a4);
+            return a5.AsUInt32().ToScalar();
        }

        {
@ -1475,23 +1432,23 @@ internal static unsafe class LosslessUtils

    private static uint Select(uint a, uint b, uint c, Span<short> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            fixed (short* ptr = &MemoryMarshal.GetReference(scratch))
            {
-                Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
-                Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
-                Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
-                Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
-                Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
-                Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
-                Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
-                Vector128<byte> ac = Sse2.Or(ac0, ca0);
-                Vector128<byte> bc = Sse2.Or(bc0, cb0);
-                Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
-                Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
-                Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
-                Sse2.Store((ushort*)ptr, diff);
+                Vector128<byte> a0 = Vector128.CreateScalar(a).AsByte();
+                Vector128<byte> b0 = Vector128.CreateScalar(b).AsByte();
+                Vector128<byte> c0 = Vector128.CreateScalar(c).AsByte();
+                Vector128<byte> ac0 = Vector128_.SubtractSaturate(a0, c0);
+                Vector128<byte> ca0 = Vector128_.SubtractSaturate(c0, a0);
+                Vector128<byte> bc0 = Vector128_.SubtractSaturate(b0, c0);
+                Vector128<byte> cb0 = Vector128_.SubtractSaturate(c0, b0);
+                Vector128<byte> ac = ac0 | ca0;
+                Vector128<byte> bc = bc0 | cb0;
+                Vector128<byte> pa = Vector128_.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
+                Vector128<byte> pb = Vector128_.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
+                Vector128<ushort> diff = pb.AsUInt16() - pa.AsUInt16();
+                diff.Store((ushort*)ptr);
                int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0];
                return (paMinusPb <= 0) ? a : b;
            }