Port YuvConversion

11 months ago · b5fe86c08c
2 changed files with 244 additions and 106 deletions
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@ -608,6 +608,44 @@ internal static class Vector128_
        return Vector128.Narrow(prodLo, prodHi);
    }

+    /// <summary>
+    /// Multiply the packed 16-bit unsigned integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate unsigned 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 16-bit unsigned integers to multiply.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 16-bit unsigned integers to multiply.
+    /// </param>
+    /// <returns>
+    /// A vector containing the high 16 bits of the products of the packed 16-bit unsigned integers
+    /// from <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<ushort> MultiplyHigh(Vector128<ushort> left, Vector128<ushort> right)
+    {
+        // SSE2 exposes this operation directly.
+        if (Sse2.IsSupported)
+        {
+            return Sse2.MultiplyHigh(left, right);
+        }
+
+        // Portable path: zero-extend each half of both operands to unsigned 32-bit lanes
+        // so that the full 32-bit product of two 16-bit values fits in every lane.
+        (Vector128<uint> lhsLower, Vector128<uint> lhsUpper) = Vector128.Widen(left);
+        (Vector128<uint> rhsLower, Vector128<uint> rhsUpper) = Vector128.Widen(right);
+
+        // Multiply lane-wise, then shift right by 16. The lanes are unsigned, so this is a
+        // logical shift that moves the high word of each product into the low 16 bits.
+        Vector128<uint> highLower = Vector128.ShiftRightLogical(lhsLower * rhsLower, 16);
+        Vector128<uint> highUpper = Vector128.ShiftRightLogical(lhsUpper * rhsUpper, 16);
+
+        // Every lane now fits in 16 bits; narrow the two uint vectors back into one ushort vector.
+        return Vector128.Narrow(highLower, highUpper);
+    }
+
+
    /// <summary>
    /// Unpack and interleave 64-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
    /// and store the results in the result.
@ -927,7 +965,7 @@ internal static class Vector128_
    /// The second vector containing packed signed 16-bit integers to subtract.
    /// </param>
    /// <returns>
-    /// A vector containing the results of subtracting packed unsigned 16-bit integers
+    /// A vector containing the results of subtracting packed signed 16-bit integers
    /// </returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<short> SubtractSaturate(Vector128<short> left, Vector128<short> right)
@ -967,7 +1005,57 @@ internal static class Vector128_
    }

    /// <summary>
-    /// Add packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
+    /// Subtract packed unsigned 16-bit integers in <paramref name="right"/> from packed unsigned 16-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of subtracting packed unsigned 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<ushort> SubtractSaturate(Vector128<ushort> left, Vector128<ushort> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.SubtractSaturate(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.SubtractSaturate(left, right);
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.SubtractSaturate(left, right);
+        }
+
+        // Portable fallback. Unsigned saturating subtraction must yield 0 whenever
+        // left < right. Subtracting zero-extended uint lanes does not achieve that:
+        // the difference wraps around to a value far above ushort.MaxValue, and a
+        // clamp to [0, 65535] then produces 65535 instead of 0.
+        //
+        // Raising the minuend to at least the subtrahend avoids the underflow:
+        //   max(left, right) - right == left - right   when left >= right
+        //   max(left, right) - right == 0              when left <  right
+        // The result always fits in 16 bits, so no widening or clamping is needed.
+        return Vector128.Max(left, right) - right;
+    }
+
+    /// <summary>
+    /// Add packed unsigned 8-bit integers in <paramref name="right"/> to packed unsigned 8-bit integers
    /// in <paramref name="left"/> using saturation, and store the results.
    /// </summary>
    /// <param name="left">
@ -1015,6 +1103,55 @@ internal static class Vector128_
        return Vector128.Narrow(sumLo, sumHi);
    }

+    /// <summary>
+    /// Add packed unsigned 16-bit integers in <paramref name="right"/> to packed unsigned 16-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 16-bit integers to add to.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 16-bit integers to add.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of adding packed unsigned 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<ushort> AddSaturate(Vector128<ushort> left, Vector128<ushort> right)
+    {
+        // Prefer the native saturating add of whichever instruction set is available.
+        if (Sse2.IsSupported)
+        {
+            return Sse2.AddSaturate(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.AddSaturate(left, right);
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.AddSaturate(left, right);
+        }
+
+        // Portable path: zero-extend both operands to unsigned 32-bit lanes so that the
+        // sums (at most 2 * ushort.MaxValue) cannot wrap around.
+        (Vector128<uint> lhsLower, Vector128<uint> lhsUpper) = Vector128.Widen(left);
+        (Vector128<uint> rhsLower, Vector128<uint> rhsUpper) = Vector128.Widen(right);
+
+        // Add, then clamp each sum to the unsigned 16-bit range [0, ushort.MaxValue].
+        Vector128<uint> floor = Vector128<uint>.Zero;
+        Vector128<uint> ceiling = Vector128.Create((uint)ushort.MaxValue);
+        Vector128<uint> cappedLower = Clamp(lhsLower + rhsLower, floor, ceiling);
+        Vector128<uint> cappedUpper = Clamp(lhsUpper + rhsUpper, floor, ceiling);
+
+        // Narrow the two uint vectors back into a single unsigned 16-bit vector.
+        return Vector128.Narrow(cappedLower, cappedUpper);
+    }
+
+
    /// <summary>
    /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
    /// in <paramref name="left"/> using saturation, and store the results.
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@ -5,7 +5,7 @@ using System.Buffers;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -29,9 +29,9 @@ internal static class YuvConversion
    //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
    public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
    {
-        if (Sse41.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+            UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
        }
        else
        {
@ -107,7 +107,7 @@ internal static class YuvConversion
    //
    // Then m can be written as
    // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
-    private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
+    private static void UpSampleVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
    {
        const int xStep = 3;
        Array.Clear(uvBuffer);
@ -138,18 +138,18 @@ internal static class YuvConversion
        {
            for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
            {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
            }
        }
        else
        {
            for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
            {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
            }
        }

@ -161,18 +161,18 @@ internal static class YuvConversion
            Span<byte> tmpBottomDst = tmpTopDst[(4 * 32)..];
            Span<byte> tmpTop = tmpBottomDst[(4 * 32)..];
            Span<byte> tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
-            UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
-            UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
+            UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
+            UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);

            topY[pos..len].CopyTo(tmpTop);
            if (!bottomY.IsEmpty)
            {
                bottomY[pos..len].CopyTo(tmpBottom);
-                ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
            }
            else
            {
-                ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
            }

            tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@ -184,7 +184,7 @@ internal static class YuvConversion
    }

    // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
-    private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
+    private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span<byte> output)
    {
        // Load inputs.
        Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
@ -192,28 +192,28 @@ internal static class YuvConversion
        Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
        Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));

-        Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
-        Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
-        Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
+        Vector128<byte> s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
+        Vector128<byte> t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
+        Vector128<byte> st = s ^ t; // st = s^t

-        Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
-        Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
+        Vector128<byte> ad = a ^ d; // ad = a^d
+        Vector128<byte> bc = b ^ c; // bc = b^c

-        Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
-        Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
-        Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
-        Vector128<byte> t4 = Sse2.Average(s, t);
-        Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+        Vector128<byte> t1 = ad | bc; // (a^d) | (b^c)
+        Vector128<byte> t2 = t1 | st; // (a^d) | (b^c) | (s^t)
+        Vector128<byte> t3 = t2 & Vector128.Create((byte)1); // (a^d) | (b^c) | (s^t) & 1
+        Vector128<byte> t4 = Vector128_.Average(s, t);
+        Vector128<byte> k = t4 - t3; // k = (a + b + c + d) / 4

-        Vector128<byte> diag1 = GetM(k, st, bc, t);
-        Vector128<byte> diag2 = GetM(k, st, ad, s);
+        Vector128<byte> diag1 = GetMVector128(k, st, bc, t);
+        Vector128<byte> diag2 = GetMVector128(k, st, ad, s);

        // Pack the alternate pixels.
-        PackAndStore(a, b, diag1, diag2, output); // store top.
-        PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
+        PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
+        PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
    }

-    private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
+    private static void UpSampleLastBlockVector128(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
    {
        Span<byte> r1 = stackalloc byte[17];
        Span<byte> r2 = stackalloc byte[17];
@ -230,27 +230,27 @@ internal static class YuvConversion

        ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
        ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
-        UpSample32Pixels(ref r1Ref, ref r2Ref, output);
+        UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
    }

    // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
-    private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
+    private static Vector128<byte> GetMVector128(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
    {
-        Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
-        Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
-        Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
-        Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
-        Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
+        Vector128<byte> tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
+        Vector128<byte> tmp1 = ij & st; // (ij) & (s^t)
+        Vector128<byte> tmp2 = k ^ input; // (k^in)
+        Vector128<byte> tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
+        Vector128<byte> tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction

-        return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
+        return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
    }

-    private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
+    private static void PackAndStoreVector128(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
    {
-        Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
-        Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
-        Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
-        Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
+        Vector128<byte> ta = Vector128_.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
+        Vector128<byte> tb = Vector128_.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
+        Vector128<byte> t1 = Vector128_.UnpackLow(ta, tb);
+        Vector128<byte> t2 = Vector128_.UnpackHigh(ta, tb);

        ref byte output0Ref = ref MemoryMarshal.GetReference(output);
        ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@ -562,41 +562,42 @@ internal static class YuvConversion
    }

    [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
+    private static void ConvertYuvToBgrVector128(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);

    [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+    private static void ConvertYuvToBgrWithBottomYVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
    {
-        YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
-        YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
+        YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
+        YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
    }

-    private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
+    private static void YuvToBgrVector128(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
    {
        ref byte yRef = ref MemoryMarshal.GetReference(y);
        ref byte uRef = ref MemoryMarshal.GetReference(u);
        ref byte vRef = ref MemoryMarshal.GetReference(v);
-        ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
+        ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);

        // Cast to 8b and store as BBBBGGGGRRRR.
-        Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
-        Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
-        Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
-        Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
-        Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
-        Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
+        Vector128<byte> bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
+        Vector128<byte> bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
+        Vector128<byte> bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
+        Vector128<byte> bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
+        Vector128<byte> bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
+        Vector128<byte> bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);

        // Pack as BGRBGRBGRBGR.
-        PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
+        PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
    }

    // Pack the planar buffers
    // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-    private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
+    private static void PlanarTo24bVector128(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
    {
        // The input is 6 registers of sixteen 8b but for the sake of explanation,
        // let's take 6 registers of four 8b values.
@ -612,7 +613,7 @@ internal static class YuvConversion
        //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7

        // Process R.
-        ChannelMixing(
+        ChannelMixingVector128(
            input0,
            input1,
            Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5),        // PlanarTo24Shuffle0
@ -627,7 +628,7 @@ internal static class YuvConversion

        // Process G.
        // Same as before, just shifted to the left by one and including the right padding.
-        ChannelMixing(
+        ChannelMixingVector128(
            input2,
            input3,
            Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255),      // PlanarTo24Shuffle3
@ -641,7 +642,7 @@ internal static class YuvConversion
            out Vector128<byte> g5);

        // Process B.
-        ChannelMixing(
+        ChannelMixingVector128(
            input4,
            input5,
            Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255),     // PlanarTo24Shuffle6
@ -655,24 +656,24 @@ internal static class YuvConversion
            out Vector128<byte> b5);

        // OR the different channels.
-        Vector128<byte> rg0 = Sse2.Or(r0, g0);
-        Vector128<byte> rg1 = Sse2.Or(r1, g1);
-        Vector128<byte> rg2 = Sse2.Or(r2, g2);
-        Vector128<byte> rg3 = Sse2.Or(r3, g3);
-        Vector128<byte> rg4 = Sse2.Or(r4, g4);
-        Vector128<byte> rg5 = Sse2.Or(r5, g5);
+        Vector128<byte> rg0 = r0 | g0;
+        Vector128<byte> rg1 = r1 | g1;
+        Vector128<byte> rg2 = r2 | g2;
+        Vector128<byte> rg3 = r3 | g3;
+        Vector128<byte> rg4 = r4 | g4;
+        Vector128<byte> rg5 = r5 | g5;

        ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
-        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = rg0 | b0;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
    }

    // Shuffles the input buffer as A0 0 0 A1 0 0 A2
-    private static void ChannelMixing(
+    private static void ChannelMixingVector128(
        Vector128<byte> input0,
        Vector128<byte> input1,
        Vector128<byte> shuffle0,
@ -685,53 +686,53 @@ internal static class YuvConversion
        out Vector128<byte> output4,
        out Vector128<byte> output5)
    {
-        output0 = Ssse3.Shuffle(input0, shuffle0);
-        output1 = Ssse3.Shuffle(input0, shuffle1);
-        output2 = Ssse3.Shuffle(input0, shuffle2);
-        output3 = Ssse3.Shuffle(input1, shuffle0);
-        output4 = Ssse3.Shuffle(input1, shuffle1);
-        output5 = Ssse3.Shuffle(input1, shuffle2);
+        output0 = Vector128_.ShuffleNative(input0, shuffle0);
+        output1 = Vector128_.ShuffleNative(input0, shuffle1);
+        output2 = Vector128_.ShuffleNative(input0, shuffle2);
+        output3 = Vector128_.ShuffleNative(input1, shuffle0);
+        output4 = Vector128_.ShuffleNative(input1, shuffle1);
+        output5 = Vector128_.ShuffleNative(input1, shuffle2);
    }

    // Convert 32 samples of YUV444 to B/G/R
-    private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
+    private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
    {
        // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
        Vector128<byte> y0 = Unsafe.As<byte, Vector128<byte>>(ref y);
        Vector128<byte> u0 = Unsafe.As<byte, Vector128<byte>>(ref u);
        Vector128<byte> v0 = Unsafe.As<byte, Vector128<byte>>(ref v);
-        y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
-        u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
-        v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
+        y0 = Vector128_.UnpackLow(Vector128<byte>.Zero, y0);
+        u0 = Vector128_.UnpackLow(Vector128<byte>.Zero, u0);
+        v0 = Vector128_.UnpackLow(Vector128<byte>.Zero, v0);

        // These constants are 14b fixed-point version of ITU-R BT.601 constants.
        // R = (19077 * y             + 26149 * v - 14234) >> 6
        // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
        // B = (19077 * y + 33050 * u             - 17685) >> 6
-        var k19077 = Vector128.Create((ushort)19077);
-        var k26149 = Vector128.Create((ushort)26149);
-        var k14234 = Vector128.Create((ushort)14234);
+        Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
+        Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
+        Vector128<ushort> k14234 = Vector128.Create((ushort)14234);

-        Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
-        Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
-        Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
-        Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
+        Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
+        Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
+        Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
+        Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));

-        Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
-        Vector128<ushort> r2 = Sse2.Add(r1, r0);
+        Vector128<ushort> r1 = y1.AsUInt16() - k14234;
+        Vector128<ushort> r2 = r1 + r0;

-        Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
-        Vector128<ushort> g3 = Sse2.Add(g0, g1);
-        Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+        Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
+        Vector128<ushort> g3 = g0 + g1;
+        Vector128<ushort> g4 = g2 - g3;

-        Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
-        Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
-        Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
+        Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
+        Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
+        Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));

        // Use logical shift for B2, which can be larger than 32767.
-        r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
-        g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
-        b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
+        r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+        g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+        b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
    }

    [MethodImpl(InliningOptions.ShortMethod)]