Merge pull request #2933 from SixLabors/js/webp-arm

Add ARM support to WEBP Utilities
1 year ago · 166a846b6d
13 changed files with 2451 additions and 1407 deletions
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@ -884,23 +884,6 @@ internal static class Numerics
        accumulator += intHigh;
    }

-    /// <summary>
-    /// Reduces elements of the vector into one sum.
-    /// </summary>
-    /// <param name="accumulator">The accumulator to reduce.</param>
-    /// <returns>The sum of all elements.</returns>
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static int ReduceSum(Vector128<int> accumulator)
-    {
-        // Add odd to even.
-        Vector128<int> vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01));
-
-        // Add high to low.
-        vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
-
-        return Sse2.ConvertToInt32(vsum);
-    }
-
    /// <summary>
    /// Reduces elements of the vector into one sum.
    /// </summary>
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@ -66,9 +66,9 @@ internal static partial class SimdUtils
            ref Span<float> destination,
            [ConstantExpected] byte control)
        {
-            if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
-                (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
-                 Vector128.IsHardwareAccelerated)
+            if (Vector512.IsHardwareAccelerated ||
+                Vector256.IsHardwareAccelerated ||
+                Vector128.IsHardwareAccelerated)
            {
                int remainder = 0;
                if (Vector512.IsHardwareAccelerated)
@ -112,9 +112,9 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
-                (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
-                (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
+            if (Vector512.IsHardwareAccelerated ||
+                Vector256.IsHardwareAccelerated ||
+                Vector128.IsHardwareAccelerated)
            {
                int remainder = 0;
                if (Vector512.IsHardwareAccelerated)
@ -158,7 +158,7 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                int remainder = source.Length % (Vector128<byte>.Count * 3);

@ -190,7 +190,7 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+            if (Vector128.IsHardwareAccelerated)
            {
                int remainder = source.Length % (Vector128<byte>.Count * 3);

@ -223,7 +223,7 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+            if (Vector128.IsHardwareAccelerated)
            {
                int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1);    // bit-hack for modulo

@ -249,7 +249,7 @@ internal static partial class SimdUtils
            Span<float> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
+            if (Vector512.IsHardwareAccelerated)
            {
                ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
                ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@ -277,7 +277,7 @@ internal static partial class SimdUtils
                    }
                }
            }
-            else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
+            else if (Vector256.IsHardwareAccelerated)
            {
                ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
                ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@ -341,7 +341,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
+            if (Vector512.IsHardwareAccelerated)
            {
                Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
@ -373,8 +373,13 @@ internal static partial class SimdUtils
                    }
                }
            }
-            else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
+            else if (Vector256.IsHardwareAccelerated)
            {
+                // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
+                // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
+                // so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
+                // for indexing within each lane, and ignores the upper bits unless bit 7 is set,
+                // this usage is guaranteed to remain within-lane and non-zeroing.
                Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
                Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@ -391,21 +396,21 @@ internal static partial class SimdUtils
                    ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                    ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);

-                    vd0 = Vector256_.ShuffleNative(vs0, mask);
-                    Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
-                    Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
-                    Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
+                    vd0 = Vector256_.ShufflePerLane(vs0, mask);
+                    Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
+                    Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
+                    Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
                }

                if (m > 0)
                {
                    for (nuint i = u; i < n; i++)
                    {
-                        Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
+                        Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
                    }
                }
            }
-            else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
+            else if (Vector128.IsHardwareAccelerated)
            {
                Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
@ -445,9 +450,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated &&
-                Vector128_.SupportsShuffleNativeByte &&
-                Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
                Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@ -507,10 +510,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated &&
-                Vector128_.SupportsShuffleNativeByte &&
-                Vector128_.SupportsShiftByte &&
-                Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
                Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@ -553,10 +553,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated &&
-                Vector128_.SupportsShuffleNativeByte &&
-                Vector128_.SupportsShiftByte &&
-                Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
                Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

-using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 internal static class Vector256_
 #pragma warning restore SA1649 // File name should match first type name
 {
-    /// <summary>
-    /// Gets a value indicating whether shuffle byte operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeFloat
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx.IsSupported;
-    }
-
-    /// <summary>
-    /// Gets a value indicating whether shuffle byte operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeByte
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx2.IsSupported;
-    }
-
    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.
    /// </summary>
@ -47,15 +28,7 @@ internal static class Vector256_
    /// <returns>The <see cref="Vector256{Single}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
-    {
-        if (Avx.IsSupported)
-        {
-            return Avx.Shuffle(vector, vector, control);
-        }
-
-        ThrowUnreachableException();
-        return default;
-    }
+        => Avx.Shuffle(vector, vector, control);

    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.</summary>
@ -66,15 +39,17 @@ internal static class Vector256_
    /// </param>
    /// <returns>The <see cref="Vector256{Byte}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
+    public static Vector256<byte> ShufflePerLane(Vector256<byte> vector, Vector256<byte> indices)
    {
        if (Avx2.IsSupported)
        {
            return Avx2.Shuffle(vector, indices);
        }

-        ThrowUnreachableException();
-        return default;
+        Vector128<byte> indicesLo = indices.GetLower();
+        Vector128<byte> lower = Vector128_.ShuffleNative(vector.GetLower(), indicesLo);
+        Vector128<byte> upper = Vector128_.ShuffleNative(vector.GetUpper(), indicesLo);
+        return Vector256.Create(lower, upper);
    }

    /// <summary>
@ -162,6 +137,54 @@ internal static class Vector256_
        return (vm0 * vm1) - vs;
    }

+    /// <summary>
+    /// Multiplies the packed signed 16-bit integers of <paramref name="left"/> and <paramref name="right"/>
+    /// into intermediate signed 32-bit products, then horizontally adds each adjacent pair of products
+    /// and packs the sums into the result.
+    /// </summary>
+    /// <param name="left">The first vector of packed signed 16-bit integers.</param>
+    /// <param name="right">The second vector of packed signed 16-bit integers.</param>
+    /// <returns>
+    /// A vector of packed signed 32-bit integers holding the pairwise sums of the products.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> MultiplyAddAdjacent(Vector256<short> left, Vector256<short> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // No AVX2: run the 128-bit helper on each half and stitch the halves back together.
+            Vector128<int> lowerSums = Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower());
+            Vector128<int> upperSums = Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper());
+            return Vector256.Create(lowerSums, upperSums);
+        }
+
+        return Avx2.MultiplyAddAdjacent(left, right);
+    }
+
+    /// <summary>
+    /// Packs signed 32-bit integers to unsigned 16-bit integers using unsigned saturation.
+    /// </summary>
+    /// <remarks>
+    /// Packing is performed independently within each 128-bit lane: the lower lane of the result holds
+    /// the packed lower lanes of <paramref name="left"/> then <paramref name="right"/>, and the upper
+    /// lane holds their packed upper lanes. Both code paths produce this same element order.
+    /// </remarks>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{UInt16}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<ushort> PackUnsignedSaturate(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackUnsignedSaturate(left, right);
+        }
+
+        // Saturate: clamp every element into the [0, 65535] range before truncating.
+        Vector256<int> min = Vector256.Create((int)ushort.MinValue);
+        Vector256<int> max = Vector256.Create((int)ushort.MaxValue);
+        Vector256<uint> leftClamped = Clamp(left, min, max).AsUInt32();
+        Vector256<uint> rightClamped = Clamp(right, min, max).AsUInt32();
+
+        // Narrow per 128-bit lane. A full-width Vector256.Narrow would emit all of left followed by
+        // all of right, which does not match the lane-interleaved order of Avx2.PackUnsignedSaturate.
+        return Vector256.Create(
+            Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
+            Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
+    }
+
    /// <summary>
    /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
    /// </summary>
@ -183,6 +206,27 @@ internal static class Vector256_
        return Vector256.Narrow(lefClamped, rightClamped);
    }

+    /// <summary>
+    /// Packs signed 16-bit integers to signed 8-bit integers using signed saturation.
+    /// </summary>
+    /// <remarks>
+    /// Packing is performed independently within each 128-bit lane: the lower lane of the result holds
+    /// the packed lower lanes of <paramref name="left"/> then <paramref name="right"/>, and the upper
+    /// lane holds their packed upper lanes. Both code paths produce this same element order.
+    /// </remarks>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{SByte}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<sbyte> PackSignedSaturate(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackSignedSaturate(left, right);
+        }
+
+        // Saturate: clamp every element into the [-128, 127] range before truncating.
+        Vector256<short> min = Vector256.Create((short)sbyte.MinValue);
+        Vector256<short> max = Vector256.Create((short)sbyte.MaxValue);
+        Vector256<short> leftClamped = Clamp(left, min, max);
+        Vector256<short> rightClamped = Clamp(right, min, max);
+
+        // Narrow per 128-bit lane. A full-width Vector256.Narrow would emit all of left followed by
+        // all of right, which does not match the lane-interleaved order of Avx2.PackSignedSaturate.
+        return Vector256.Create(
+            Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
+            Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
+    }
+
    /// <summary>
    /// Restricts a vector between a minimum and a maximum value.
    /// </summary>
@ -210,6 +254,211 @@ internal static class Vector256_
        return Vector256.WidenLower(value.ToVector256());
    }

-    [DoesNotReturn]
-    private static void ThrowUnreachableException() => throw new UnreachableException();
+    /// <summary>
+    /// Multiplies the packed 16-bit integers of <paramref name="left"/> and <paramref name="right"/>
+    /// and keeps the low 16 bits of each intermediate 32-bit product.
+    /// </summary>
+    /// <param name="left">The first vector of packed 16-bit integers.</param>
+    /// <param name="right">The second vector of packed 16-bit integers.</param>
+    /// <returns>
+    /// A vector whose elements are the low 16 bits of the element-wise products of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> MultiplyLow(Vector256<short> left, Vector256<short> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // Promote both operands to 32-bit so the full product is available per element.
+            (Vector256<int> Lower, Vector256<int> Upper) wideLeft = Vector256.Widen(left);
+            (Vector256<int> Lower, Vector256<int> Upper) wideRight = Vector256.Widen(right);
+
+            Vector256<int> lowerProducts = wideLeft.Lower * wideRight.Lower;
+            Vector256<int> upperProducts = wideLeft.Upper * wideRight.Upper;
+
+            // Narrowing truncates each product to its low 16 bits and restores element order.
+            return Vector256.Narrow(lowerProducts, upperProducts);
+        }
+
+        return Avx2.MultiplyLow(left, right);
+    }
+
+    /// <summary>
+    /// Multiplies the packed 16-bit integers of <paramref name="left"/> and <paramref name="right"/>
+    /// and keeps the high 16 bits of each intermediate 32-bit product.
+    /// </summary>
+    /// <param name="left">The first vector of packed 16-bit integers.</param>
+    /// <param name="right">The second vector of packed 16-bit integers.</param>
+    /// <returns>
+    /// A vector whose elements are the high 16 bits of the element-wise products of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> MultiplyHigh(Vector256<short> left, Vector256<short> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // Promote both operands to 32-bit so the full product is available per element.
+            (Vector256<int> Lower, Vector256<int> Upper) wideLeft = Vector256.Widen(left);
+            (Vector256<int> Lower, Vector256<int> Upper) wideRight = Vector256.Widen(right);
+
+            // The arithmetic right shift by 16 moves the high word of each product into the low word.
+            Vector256<int> lowerHighWords = (wideLeft.Lower * wideRight.Lower) >> 16;
+            Vector256<int> upperHighWords = (wideLeft.Upper * wideRight.Upper) >> 16;
+
+            // Narrowing truncates back to 16 bits and restores element order.
+            return Vector256.Narrow(lowerHighWords, upperHighWords);
+        }
+
+        return Avx2.MultiplyHigh(left, right);
+    }
+
+    /// <summary>
+    /// Unpacks and interleaves 32-bit integers from the low half of each 128-bit half of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </summary>
+    /// <param name="left">The first vector of packed 32-bit integers.</param>
+    /// <param name="right">The second vector of packed 32-bit integers.</param>
+    /// <returns>
+    /// A vector containing the interleaved 32-bit integers taken from the low halves of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> UnpackLow(Vector256<int> left, Vector256<int> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // No AVX2: interleave each 128-bit half separately, then recombine.
+            Vector128<int> lowerHalf = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+            Vector128<int> upperHalf = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+            return Vector256.Create(lowerHalf, upperHalf);
+        }
+
+        return Avx2.UnpackLow(left, right);
+    }
+
+    /// <summary>
+    /// Unpacks and interleaves 8-bit integers from the high half of each 128-bit half of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </summary>
+    /// <param name="left">The first vector of packed 8-bit integers.</param>
+    /// <param name="right">The second vector of packed 8-bit integers.</param>
+    /// <returns>
+    /// A vector containing the interleaved 8-bit integers taken from the high halves of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackHigh(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // No AVX2: interleave each 128-bit half separately, then recombine.
+            Vector128<byte> lowerHalf = Vector128_.UnpackHigh(left.GetLower(), right.GetLower());
+            Vector128<byte> upperHalf = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper());
+            return Vector256.Create(lowerHalf, upperHalf);
+        }
+
+        return Avx2.UnpackHigh(left, right);
+    }
+
+    /// <summary>
+    /// Unpacks and interleaves 8-bit integers from the low half of each 128-bit half of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </summary>
+    /// <param name="left">The first vector of packed 8-bit integers.</param>
+    /// <param name="right">The second vector of packed 8-bit integers.</param>
+    /// <returns>
+    /// A vector containing the interleaved 8-bit integers taken from the low halves of
+    /// <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackLow(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // No AVX2: interleave each 128-bit half separately, then recombine.
+            Vector128<byte> lowerHalf = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+            Vector128<byte> upperHalf = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+            return Vector256.Create(lowerHalf, upperHalf);
+        }
+
+        return Avx2.UnpackLow(left, right);
+    }
+
+    /// <summary>
+    /// Subtracts the packed signed 16-bit integers in <paramref name="right"/> from the packed signed
+    /// 16-bit integers in <paramref name="left"/> using saturation.
+    /// </summary>
+    /// <param name="left">The vector of packed signed 16-bit integers to subtract from.</param>
+    /// <param name="right">The vector of packed signed 16-bit integers to subtract.</param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed signed 16-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> SubtractSaturate(Vector256<short> left, Vector256<short> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // No AVX2: subtract each 128-bit half separately, then recombine.
+            Vector128<short> lowerHalf = Vector128_.SubtractSaturate(left.GetLower(), right.GetLower());
+            Vector128<short> upperHalf = Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper());
+            return Vector256.Create(lowerHalf, upperHalf);
+        }
+
+        return Avx2.SubtractSaturate(left, right);
+    }
+
+    /// <summary>
+    /// Subtracts the packed unsigned 8-bit integers in <paramref name="right"/> from the packed unsigned
+    /// 8-bit integers in <paramref name="left"/> using saturation.
+    /// </summary>
+    /// <param name="left">The vector of packed unsigned 8-bit integers to subtract from.</param>
+    /// <param name="right">The vector of packed unsigned 8-bit integers to subtract.</param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed unsigned 8-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> SubtractSaturate(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (!Avx2.IsSupported)
+        {
+            // No AVX2: subtract each 128-bit half separately, then recombine.
+            Vector128<byte> lowerHalf = Vector128_.SubtractSaturate(left.GetLower(), right.GetLower());
+            Vector128<byte> upperHalf = Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper());
+            return Vector256.Create(lowerHalf, upperHalf);
+        }
+
+        return Avx2.SubtractSaturate(left, right);
+    }
 }
--- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

-using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 internal static class Vector512_
 #pragma warning restore SA1649 // File name should match first type name
 {
-    /// <summary>
-    /// Gets a value indicating whether shuffle float operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeFloat
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx512F.IsSupported;
-    }
-
-    /// <summary>
-    /// Gets a value indicating whether shuffle byte operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeByte
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx512BW.IsSupported;
-    }
-
    /// <summary>
    /// Creates a new vector by selecting values from an input vector using the control.
    /// </summary>
@ -47,15 +28,7 @@ internal static class Vector512_
    /// <returns>The <see cref="Vector512{Single}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.Shuffle(vector, vector, control);
-        }
-
-        ThrowUnreachableException();
-        return default;
-    }
+        => Avx512F.Shuffle(vector, vector, control);

    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.
@ -73,8 +46,7 @@ internal static class Vector512_
            return Avx512BW.Shuffle(vector, indices);
        }

-        ThrowUnreachableException();
-        return default;
+        return Vector512.Shuffle(vector, indices);
    }

    /// <summary>
@ -85,25 +57,7 @@ internal static class Vector512_
    /// <returns>The <see cref="Vector512{Int32}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.ConvertToVector512Int32(vector);
-        }
-
-        if (Avx.IsSupported)
-        {
-            Vector256<int> lower = Avx.ConvertToVector256Int32(vector.GetLower());
-            Vector256<int> upper = Avx.ConvertToVector256Int32(vector.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        Vector512<float> sign = vector & Vector512.Create(-0.0f);
-        Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608.0f);
-
-        val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
-        return Vector512.ConvertToInt32(val_2p23_f32 | sign);
-    }
+        => Avx512F.ConvertToVector512Int32(vector);

    /// <summary>
    /// Rounds all values in <paramref name="vector"/> to the nearest integer
@ -112,28 +66,11 @@ internal static class Vector512_
    /// <param name="vector">The vector</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<float> RoundToNearestInteger(Vector512<float> vector)
-    {
-        if (Avx512F.IsSupported)
-        {
-            // imm8 = 0b1000:
-            //   imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
-            //   imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
-            return Avx512F.RoundScale(vector, 0b0000_1000);
-        }
-
-        if (Avx.IsSupported)
-        {
-            Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
-            Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        Vector512<float> sign = vector & Vector512.Create(-0F);
-        Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);

-        val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
-        return val_2p23_f32 | sign;
-    }
+          // imm8 = 0b1000:
+          //   imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
+          //   imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
+          => Avx512F.RoundScale(vector, 0b0000_1000);

    /// <summary>
    /// Performs a multiplication and an addition of the <see cref="Vector512{Single}"/>.
@ -148,21 +85,7 @@ internal static class Vector512_
        Vector512<float> va,
        Vector512<float> vm0,
        Vector512<float> vm1)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
-        }
-
-        if (Fma.IsSupported)
-        {
-            Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
-            Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        return va + (vm0 * vm1);
-    }
+        => Avx512F.FusedMultiplyAdd(vm0, vm1, va);

    /// <summary>
    /// Restricts a vector between a minimum and a maximum value.
@ -175,7 +98,4 @@ internal static class Vector512_
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<T> Clamp<T>(Vector512<T> value, Vector512<T> min, Vector512<T> max)
        => Vector512.Min(Vector512.Max(value, min), max);
-
-    [DoesNotReturn]
-    private static void ThrowUnreachableException() => throw new UnreachableException();
 }
--- a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs
@ -6,7 +6,6 @@ using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Formats.Webp.BitReader;
@ -314,7 +313,7 @@ internal class AlphaDecoder : IDisposable

    private static void HorizontalUnfilter(Span<byte> prev, Span<byte> input, Span<byte> dst, int width)
    {
-        if ((Sse2.IsSupported || AdvSimd.IsSupported) && width >= 9)
+        if (Vector128.IsHardwareAccelerated && width >= 9)
        {
            dst[0] = (byte)(input[0] + (prev.IsEmpty ? 0 : prev[0]));
            nuint i;
@ -362,7 +361,7 @@ internal class AlphaDecoder : IDisposable
        {
            HorizontalUnfilter(null, input, dst, width);
        }
-        else if (Avx2.IsSupported)
+        else if (Vector256.IsHardwareAccelerated)
        {
            ref byte inputRef = ref MemoryMarshal.GetReference(input);
            ref byte prevRef = ref MemoryMarshal.GetReference(prev);
@ -374,7 +373,7 @@ internal class AlphaDecoder : IDisposable
            {
                Vector256<int> a0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref inputRef, i));
                Vector256<int> b0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref prevRef, i));
-                Vector256<byte> c0 = Avx2.Add(a0.AsByte(), b0.AsByte());
+                Vector256<byte> c0 = a0.AsByte() + b0.AsByte();
                ref byte outputRef = ref Unsafe.Add(ref dstRef, i);
                Unsafe.As<byte, Vector256<byte>>(ref outputRef) = c0;
            }
--- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
@ -4,7 +4,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless;

@ -12,17 +12,20 @@ internal static class ColorSpaceTransformUtils
 {
    public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
    {
-        if (Avx2.IsSupported && tileWidth >= 16)
+        if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
        {
            const int span = 16;
            Span<ushort> values = stackalloc ushort[span];
-            var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
-            var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
-            var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-            var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-            var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-            var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
-            var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
+
+            // These shuffle masks are safe for a per-lane byte shuffle such as Avx2.Shuffle: every index stays within its own 128-bit lane (0–15 in the lower lane, 16–31 in the upper lane),
+            // and every unused byte is set to 0xFF, whose set high bit zeroes that output byte per the vpshufb specification. This keeps the shuffle lane-local with no cross-lane access.
+            Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
+            Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
+            Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+            Vector256<byte> collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+            Vector256<byte> collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+            Vector256<short> multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
+            Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
            for (int y = 0; y < tileHeight; y++)
            {
                Span<uint> srcSpan = bgra[(y * stride)..];
@ -33,18 +36,18 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector256<byte> r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
-                    Vector256<byte> r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
-                    Vector256<byte> r = Avx2.Or(r0, r1);
-                    Vector256<byte> gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
-                    Vector256<byte> gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
-                    Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
-                    Vector256<byte> g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
-                    Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
-                    Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
-                    Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
-                    Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
-                    Vector256<byte> e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
+                    Vector256<byte> r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256);
+                    Vector256<byte> r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256);
+                    Vector256<byte> r = r0 | r1;
+                    Vector256<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
+                    Vector256<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
+                    Vector256<ushort> gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
+                    Vector256<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask256;
+                    Vector256<short> a = Vector256_.MultiplyHigh(r.AsInt16(), multsr);
+                    Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg);
+                    Vector256<byte> c = gb.AsByte() - b.AsByte();
+                    Vector256<byte> d = c - a.AsByte();
+                    Vector256<byte> e = d & collectColorBlueTransformsBlueMask256;

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
@ -59,20 +62,20 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
+                CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
            }
        }
-        else if (Sse41.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
        {
            const int span = 8;
            Span<ushort> values = stackalloc ushort[span];
-            var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
-            var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
-            var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-            var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-            var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-            var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
-            var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
+            Vector128<byte> collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
+            Vector128<byte> collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
+            Vector128<byte> collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+            Vector128<byte> collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+            Vector128<byte> collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+            Vector128<short> multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
+            Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
            for (int y = 0; y < tileHeight; y++)
            {
                Span<uint> srcSpan = bgra[(y * stride)..];
@ -83,18 +86,18 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector128<byte> r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
-                    Vector128<byte> r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
-                    Vector128<byte> r = Sse2.Or(r0, r1);
-                    Vector128<byte> gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
-                    Vector128<byte> gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
-                    Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
-                    Vector128<byte> g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
-                    Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
-                    Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
-                    Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
-                    Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
-                    Vector128<byte> e = Sse2.And(d, collectColorBlueTransformsBlueMask);
+                    Vector128<byte> r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask);
+                    Vector128<byte> r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask);
+                    Vector128<byte> r = r0 | r1;
+                    Vector128<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask;
+                    Vector128<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask;
+                    Vector128<ushort> gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
+                    Vector128<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask;
+                    Vector128<short> a = Vector128_.MultiplyHigh(r.AsInt16(), multsr);
+                    Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg);
+                    Vector128<byte> c = gb.AsByte() - b.AsByte();
+                    Vector128<byte> d = c - a.AsByte();
+                    Vector128<byte> e = d & collectColorBlueTransformsBlueMask;

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = e.AsUInt16();
@ -109,16 +112,16 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
+                CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
            }
        }
        else
        {
-            CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
+            CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
        }
    }

-    private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
+    private static void CollectColorBlueTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
    {
        int pos = 0;
        while (tileHeight-- > 0)
@ -135,11 +138,11 @@ internal static class ColorSpaceTransformUtils

    public static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
    {
-        if (Avx2.IsSupported && tileWidth >= 16)
+        if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
        {
            Vector256<byte> collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
            Vector256<byte> collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
-            var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
+            Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
            const int span = 16;
            Span<ushort> values = stackalloc ushort[span];
            for (int y = 0; y < tileHeight; y++)
@ -152,15 +155,15 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector256<byte> g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0  | g 0
-                    Vector256<byte> g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
-                    Vector256<ushort> g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
-                    Vector256<int> a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
-                    Vector256<int> a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
-                    Vector256<ushort> a = Avx2.PackUnsignedSaturate(a0, a1); // x r
-                    Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
-                    Vector256<byte> c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
-                    Vector256<byte> d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
+                    Vector256<byte> g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0  | g 0
+                    Vector256<byte> g1 = input1 & collectColorRedTransformsGreenMask256;
+                    Vector256<ushort> g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+                    Vector256<int> a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
+                    Vector256<int> a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16);
+                    Vector256<ushort> a = Vector256_.PackUnsignedSaturate(a0, a1); // x r
+                    Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr
+                    Vector256<byte> c = a.AsByte() - b.AsByte(); // x r'
+                    Vector256<byte> d = c & collectColorRedTransformsAndMask256; // 0 r'

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = d.AsUInt16();
@ -175,14 +178,14 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
+                CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
            }
        }
-        else if (Sse41.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
        {
            Vector128<byte> collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
            Vector128<byte> collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
-            var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
+            Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
            const int span = 8;
            Span<ushort> values = stackalloc ushort[span];
            for (int y = 0; y < tileHeight; y++)
@ -195,15 +198,15 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector128<byte> g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0  | g 0
-                    Vector128<byte> g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
-                    Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
-                    Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
-                    Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
-                    Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
-                    Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
-                    Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
-                    Vector128<byte> d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
+                    Vector128<byte> g0 = input0 & collectColorRedTransformsGreenMask; // 0 0  | g 0
+                    Vector128<byte> g1 = input1 & collectColorRedTransformsGreenMask;
+                    Vector128<ushort> g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+                    Vector128<int> a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
+                    Vector128<int> a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16);
+                    Vector128<ushort> a = Vector128_.PackUnsignedSaturate(a0, a1); // x r
+                    Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr
+                    Vector128<byte> c = a.AsByte() - b.AsByte(); // x r'
+                    Vector128<byte> d = c & collectColorRedTransformsAndMask; // 0 r'

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = d.AsUInt16();
@ -218,16 +221,16 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
+                CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
            }
        }
        else
        {
-            CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
+            CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
        }
    }

-    private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
+    private static void CollectColorRedTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
    {
        int pos = 0;
        while (tileHeight-- > 0)
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@ -6,6 +6,7 @@ using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -94,17 +95,20 @@ internal static unsafe class LosslessUtils
    /// <param name="pixelData">The pixel data to apply the transformation.</param>
    public static void AddGreenToBlueAndRed(Span<uint> pixelData)
    {
-        if (Avx2.IsSupported && pixelData.Length >= 8)
+        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
-            Vector256<byte> addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
+            // The `255` entries have the high bit (0x80) set, so the shuffle writes zero to bytes 1 and 3 of each pixel (green and alpha) and the add below leaves those channels unchanged.
+            // Each byte index is within its respective 128-bit lane (0–15 and 16–31), so this is safe for per-lane shuffle.
+            // The index bytes do not have the high bit set, and each index is < 16 relative to the start of its lane, satisfying AVX2 lane rules.
+            Vector256<byte> addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2);
-                Vector256<byte> output = Avx2.Add(input, in0g0g);
+                Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask);
+                Vector256<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
            }
@ -115,39 +119,17 @@ internal static unsafe class LosslessUtils
                AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
            }
        }
-        else if (Ssse3.IsSupported && pixelData.Length >= 4)
-        {
-            Vector128<byte> addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
-            nuint numPixels = (uint)pixelData.Length;
-            nuint i = 0;
-            do
-            {
-                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
-                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<byte> in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3);
-                Vector128<byte> output = Sse2.Add(input, in0g0g);
-                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
-                i += 4;
-            }
-            while (i <= numPixels - 4);
-
-            if (i != numPixels)
-            {
-                AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
-            }
-        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
+            Vector128<byte> addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
-                Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
+                Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask);
+                Vector128<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                i += 4;
            }
@ -180,17 +162,17 @@ internal static unsafe class LosslessUtils

    public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
    {
-        if (Avx2.IsSupported && pixelData.Length >= 8)
+        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
-            Vector256<byte> subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
+            Vector256<byte> subtractGreenFromBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2);
-                Vector256<byte> output = Avx2.Subtract(input, in0g0g);
+                Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask);
+                Vector256<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
            }
@ -201,39 +183,17 @@ internal static unsafe class LosslessUtils
                SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
            }
        }
-        else if (Ssse3.IsSupported && pixelData.Length >= 4)
-        {
-            Vector128<byte> subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
-            nuint numPixels = (uint)pixelData.Length;
-            nuint i = 0;
-            do
-            {
-                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
-                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<byte> in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3);
-                Vector128<byte> output = Sse2.Subtract(input, in0g0g);
-                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
-                i += 4;
-            }
-            while (i <= numPixels - 4);
-
-            if (i != numPixels)
-            {
-                SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
-            }
-        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
+            Vector128<byte> subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
-                Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
+                Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask);
+                Vector128<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                i += 4;
            }
@ -412,7 +372,7 @@ internal static unsafe class LosslessUtils
                TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx);
            }
        }
-        else if (Sse2.IsSupported && numPixels >= 4)
+        else if (Vector128.IsHardwareAccelerated && numPixels >= 4)
        {
            Vector128<byte> transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
            Vector128<byte> transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
@ -423,16 +383,16 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
                Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
-                Vector128<byte> a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask);
-                Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
-                Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
-                Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
-                Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
-                Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
-                Vector128<byte> i = Sse2.And(h, transformColorRedBlueMask);
-                Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
+                Vector128<byte> a = input.AsByte() & transformColorAlphaGreenMask;
+                Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+                Vector128<short> e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8);
+                Vector128<short> f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
+                Vector128<int> g = Vector128.ShiftRightLogical(f.AsInt32(), 16);
+                Vector128<byte> h = g.AsByte() + d.AsByte();
+                Vector128<byte> i = h & transformColorRedBlueMask;
+                Vector128<byte> output = input.AsByte() - i;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                idx += 4;
            }
@ -503,7 +463,7 @@ internal static unsafe class LosslessUtils
                TransformColorInverseScalar(m, pixelData[(int)idx..]);
            }
        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
            Vector128<byte> transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
            Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
@ -514,17 +474,17 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
                Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
-                Vector128<byte> a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask);
-                Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
-                Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
-                Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
-                Vector128<short> g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
-                Vector128<int> h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
-                Vector128<byte> i = Sse2.Add(h.AsByte(), f.AsByte());
-                Vector128<short> j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
-                Vector128<byte> output = Sse2.Or(j.AsByte(), a);
+                Vector128<byte> a = input.AsByte() & transformColorInverseAlphaGreenMask;
+                Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+                Vector128<byte> e = input.AsByte() + d.AsByte();
+                Vector128<short> f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8);
+                Vector128<short> g = Vector128_.MultiplyHigh(f, multsb2.AsInt16());
+                Vector128<int> h = Vector128.ShiftRightLogical(g.AsInt32(), 8);
+                Vector128<byte> i = h.AsByte() + f.AsByte();
+                Vector128<short> j = Vector128.ShiftRightLogical(i.AsInt16(), 8);
+                Vector128<byte> output = j.AsByte() | a;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
            }

@ -1401,15 +1361,15 @@ internal static unsafe class LosslessUtils

    private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-            Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
-            Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
-            Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
-            return Sse2.ConvertToUInt32(b.AsUInt32());
+            Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
+            Vector128<short> v1 = c0Vec.AsInt16() + c1Vec.AsInt16();
+            Vector128<short> v2 = v1 - c2Vec.AsInt16();
+            Vector128<byte> b = Vector128_.PackUnsignedSaturate(v2, v2);
+            return b.AsUInt32().ToScalar();
        }

        {
@ -1432,20 +1392,20 @@ internal static unsafe class LosslessUtils

    private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-            Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
-            Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
-            Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
-            Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
-            Vector128<short> a2 = Sse2.Subtract(a1, bgta);
-            Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
-            Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
-            Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
-            return Sse2.ConvertToUInt32(a5.AsUInt32());
+            Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
+            Vector128<short> avg = c1Vec.AsInt16() + c0Vec.AsInt16();
+            Vector128<short> a0 = Vector128.ShiftRightLogical(avg, 1);
+            Vector128<short> a1 = a0 - b0.AsInt16();
+            Vector128<short> bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16());
+            Vector128<short> a2 = a1 - bgta;
+            Vector128<short> a3 = Vector128.ShiftRightArithmetic(a2, 1);
+            Vector128<short> a4 = (a0 + a3).AsInt16();
+            Vector128<byte> a5 = Vector128_.PackUnsignedSaturate(a4, a4);
+            return a5.AsUInt32().ToScalar();
        }

        {
@ -1475,23 +1435,23 @@ internal static unsafe class LosslessUtils

    private static uint Select(uint a, uint b, uint c, Span<short> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            fixed (short* ptr = &MemoryMarshal.GetReference(scratch))
            {
-                Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
-                Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
-                Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
-                Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
-                Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
-                Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
-                Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
-                Vector128<byte> ac = Sse2.Or(ac0, ca0);
-                Vector128<byte> bc = Sse2.Or(bc0, cb0);
-                Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
-                Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
-                Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
-                Sse2.Store((ushort*)ptr, diff);
+                Vector128<byte> a0 = Vector128.CreateScalar(a).AsByte();
+                Vector128<byte> b0 = Vector128.CreateScalar(b).AsByte();
+                Vector128<byte> c0 = Vector128.CreateScalar(c).AsByte();
+                Vector128<byte> ac0 = Vector128_.SubtractSaturate(a0, c0);
+                Vector128<byte> ca0 = Vector128_.SubtractSaturate(c0, a0);
+                Vector128<byte> bc0 = Vector128_.SubtractSaturate(b0, c0);
+                Vector128<byte> cb0 = Vector128_.SubtractSaturate(c0, b0);
+                Vector128<byte> ac = ac0 | ca0;
+                Vector128<byte> bc = bc0 | cb0;
+                Vector128<byte> pa = Vector128_.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
+                Vector128<byte> pb = Vector128_.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
+                Vector128<ushort> diff = pb.AsUInt16() - pa.AsUInt16();
+                diff.Store((ushort*)ptr);
                int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0];
                return (paMinusPb <= 0) ? a : b;
            }
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@ -2,10 +2,11 @@
 // Licensed under the Six Labors Split License.

 using System.Buffers.Binary;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossy;

@ -78,7 +79,7 @@ internal static unsafe class Vp8Encoding
    // Does two inverse transforms.
    public static void ITransformTwo(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            // This implementation makes use of 16-bit fixed point versions of two
            // multiply constants:
@ -116,10 +117,10 @@ internal static unsafe class Vp8Encoding
            Vector128<long> inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
            Vector128<long> inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);

-            in0 = Sse2.UnpackLow(in0, inb0);
-            in1 = Sse2.UnpackLow(in1, inb1);
-            in2 = Sse2.UnpackLow(in2, inb2);
-            in3 = Sse2.UnpackLow(in3, inb3);
+            in0 = Vector128_.UnpackLow(in0, inb0);
+            in1 = Vector128_.UnpackLow(in1, inb1);
+            in2 = Vector128_.UnpackLow(in2, inb2);
+            in3 = Vector128_.UnpackLow(in3, inb3);

            // a00 a10 a20 a30   b00 b10 b20 b30
            // a01 a11 a21 a31   b01 b11 b21 b31
@ -128,49 +129,45 @@ internal static unsafe class Vp8Encoding

            // Vertical pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
+            InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

            // Horizontal pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
+            InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);

            // Add inverse transform to 'ref' and store.
            // Load the reference(s).
-            Vector128<byte> ref0 = Vector128<byte>.Zero;
-            Vector128<byte> ref1 = Vector128<byte>.Zero;
-            Vector128<byte> ref2 = Vector128<byte>.Zero;
-            Vector128<byte> ref3 = Vector128<byte>.Zero;
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);

            // Load eight bytes/pixels per line.
-            ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
-            ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
-            ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
-            ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
+            Vector128<byte> ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
+            Vector128<byte> ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
+            Vector128<byte> ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
+            Vector128<byte> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();

            // Convert to 16b.
-            ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
-            ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
-            ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
-            ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+            ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
+            ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
+            ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
+            ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);

            // Add the inverse transform(s).
-            Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
-            Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
-            Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
-            Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+            Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
+            Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
+            Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
+            Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();

            // Unsigned saturate to 8b.
-            ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
-            ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
-            ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
-            ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+            ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+            ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+            ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+            ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);

            // Store eight bytes/pixels per line.
            ref byte outputRef = ref MemoryMarshal.GetReference(dst);
@ -188,7 +185,7 @@ internal static unsafe class Vp8Encoding

    public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            // Load and concatenate the transform coefficients (we'll do two inverse
            // transforms in parallel). In the case of only one inverse transform, the
@ -207,63 +204,59 @@ internal static unsafe class Vp8Encoding

            // Vertical pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
+            InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

            // Horizontal pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
+            InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);

            // Add inverse transform to 'ref' and store.
            // Load the reference(s).
-            Vector128<byte> ref0 = Vector128<byte>.Zero;
-            Vector128<byte> ref1 = Vector128<byte>.Zero;
-            Vector128<byte> ref2 = Vector128<byte>.Zero;
-            Vector128<byte> ref3 = Vector128<byte>.Zero;
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);

            // Load four bytes/pixels per line.
-            ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
-            ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
-            ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
-            ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
+            Vector128<byte> ref0 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref referenceRef)).AsByte();
+            Vector128<byte> ref1 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
+            Vector128<byte> ref2 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
+            Vector128<byte> ref3 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();

            // Convert to 16b.
-            ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
-            ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
-            ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
-            ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+            ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
+            ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
+            ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
+            ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);

            // Add the inverse transform(s).
-            Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
-            Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
-            Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
-            Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+            Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
+            Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
+            Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
+            Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();

            // Unsigned saturate to 8b.
-            ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
-            ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
-            ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
-            ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+            ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+            ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+            ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+            ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);

             // Get a reference to the start of the destination buffer.
            ref byte outputRef = ref MemoryMarshal.GetReference(dst);

            // Store four bytes/pixels per line.
-            int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
-            int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
-            int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
-            int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
-
-            Unsafe.As<byte, int>(ref outputRef) = output0;
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
+            int output0 = ref0.AsInt32().ToScalar();
+            int output1 = ref1.AsInt32().ToScalar();
+            int output2 = ref2.AsInt32().ToScalar();
+            int output3 = ref3.AsInt32().ToScalar();
+
+            Unsafe.WriteUnaligned(ref outputRef, output0);
+            Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps), output1);
+            Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2), output2);
+            Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3), output3);
        }
        else
        {
@ -302,72 +295,72 @@ internal static unsafe class Vp8Encoding
        }
    }

-    private static void InverseTransformVerticalPass(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
+    private static void InverseTransformVerticalPassVector128(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
    {
-        Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-        Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+        Vector128<short> a = in0.AsInt16() + in2.AsInt16();
+        Vector128<short> b = in0.AsInt16() - in2.AsInt16();

        Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
        Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();

        // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-        Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
-        Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
-        Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-        Vector128<short> c4 = Sse2.Subtract(c1, c2);
-        Vector128<short> c = Sse2.Add(c3, c4);
+        Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
+        Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
+        Vector128<short> c3 = in1.AsInt16() - in3.AsInt16();
+        Vector128<short> c4 = c1 - c2;
+        Vector128<short> c = c3 + c4;

        // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-        Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
-        Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
-        Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-        Vector128<short> d4 = Sse2.Add(d1, d2);
-        Vector128<short> d = Sse2.Add(d3, d4);
+        Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
+        Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
+        Vector128<short> d3 = in1.AsInt16() + in3.AsInt16();
+        Vector128<short> d4 = d1 + d2;
+        Vector128<short> d = d3 + d4;

        // Second pass.
-        tmp0 = Sse2.Add(a, d);
-        tmp1 = Sse2.Add(b, c);
-        tmp2 = Sse2.Subtract(b, c);
-        tmp3 = Sse2.Subtract(a, d);
+        tmp0 = a + d;
+        tmp1 = b + c;
+        tmp2 = b - c;
+        tmp3 = a - d;
    }

-    private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
+    private static void InverseTransformHorizontalPassVector128(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
    {
-        Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
-        Vector128<short> a = Sse2.Add(dc, t2.AsInt16());
-        Vector128<short> b = Sse2.Subtract(dc, t2.AsInt16());
+        Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4);
+        Vector128<short> a = dc + t2.AsInt16();
+        Vector128<short> b = dc - t2.AsInt16();

        Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
        Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();

        // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-        Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
-        Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
-        Vector128<short> c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-        Vector128<short> c4 = Sse2.Subtract(c1, c2);
-        Vector128<short> c = Sse2.Add(c3, c4);
+        Vector128<short> c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
+        Vector128<short> c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
+        Vector128<short> c3 = t1.AsInt16() - t3.AsInt16();
+        Vector128<short> c4 = c1 - c2;
+        Vector128<short> c = c3 + c4;

        // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-        Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
-        Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
-        Vector128<short> d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-        Vector128<short> d4 = Sse2.Add(d1, d2);
-        Vector128<short> d = Sse2.Add(d3, d4);
+        Vector128<short> d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
+        Vector128<short> d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
+        Vector128<short> d3 = t1.AsInt16() + t3.AsInt16();
+        Vector128<short> d4 = d1 + d2;
+        Vector128<short> d = d3 + d4;

        // Second pass.
-        Vector128<short> tmp0 = Sse2.Add(a, d);
-        Vector128<short> tmp1 = Sse2.Add(b, c);
-        Vector128<short> tmp2 = Sse2.Subtract(b, c);
-        Vector128<short> tmp3 = Sse2.Subtract(a, d);
-        shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-        shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-        shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-        shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+        Vector128<short> tmp0 = a + d;
+        Vector128<short> tmp1 = b + c;
+        Vector128<short> tmp2 = b - c;
+        Vector128<short> tmp3 = a - d;
+        shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
+        shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
+        shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
+        shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
    }

    public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            ref byte srcRef = ref MemoryMarshal.GetReference(src);
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -385,38 +378,38 @@ internal static unsafe class Vp8Encoding
            Vector128<long> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);

            // Convert both to 16 bit.
-            Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow0 = Vector128_.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow1 = Vector128_.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow2 = Vector128_.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow3 = Vector128_.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);

            // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
-            Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
-            Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
-            Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
-            Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
+            Vector128<short> diff0 = srcLow0.AsInt16() - refLow0.AsInt16();
+            Vector128<short> diff1 = srcLow1.AsInt16() - refLow1.AsInt16();
+            Vector128<short> diff2 = srcLow2.AsInt16() - refLow2.AsInt16();
+            Vector128<short> diff3 = srcLow3.AsInt16() - refLow3.AsInt16();

            // Unpack and shuffle.
            // 00 01 02 03   0 0 0 0
            // 10 11 12 13   0 0 0 0
            // 20 21 22 23   0 0 0 0
            // 30 31 32 33   0 0 0 0
-            Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
-            Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
-            Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
-            Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
+            Vector128<int> shuf01l = Vector128_.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
+            Vector128<int> shuf23l = Vector128_.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
+            Vector128<int> shuf01h = Vector128_.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
+            Vector128<int> shuf23h = Vector128_.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());

            // First pass.
-            FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
-            FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
+            FTransformPass1Vector128(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
+            FTransformPass1Vector128(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);

            // Second pass.
-            FTransformPass2SSE2(v01l, v32l, output);
-            FTransformPass2SSE2(v01h, v32h, output2);
+            FTransformPass2Vector128(v01l, v32l, output);
+            FTransformPass2Vector128(v01h, v32h, output2);
        }
        else
        {
@ -427,7 +420,7 @@ internal static unsafe class Vp8Encoding

    public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            ref byte srcRef = ref MemoryMarshal.GetReference(src);
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -449,29 +442,29 @@ internal static unsafe class Vp8Encoding
            // 20 21 22 23 *
            // 30 31 32 33 *
            // Shuffle.
-            Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
-            Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
-            Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
-            Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
+            Vector128<short> srcLow0 = Vector128_.UnpackLow(src0.AsInt16(), src1.AsInt16());
+            Vector128<short> srcLow1 = Vector128_.UnpackLow(src2.AsInt16(), src3.AsInt16());
+            Vector128<short> refLow0 = Vector128_.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
+            Vector128<short> refLow1 = Vector128_.UnpackLow(ref2.AsInt16(), ref3.AsInt16());

            // 00 01 10 11 02 03 12 13 * * ...
            // 20 21 30 31 22 23 32 33 * * ...

            // Convert both to 16 bit.
-            Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> src0_16b = Vector128_.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> src1_16b = Vector128_.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> ref0_16b = Vector128_.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> ref1_16b = Vector128_.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);

            // Compute the difference.
-            Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
-            Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
+            Vector128<short> row01 = src0_16b.AsInt16() - ref0_16b.AsInt16();
+            Vector128<short> row23 = src1_16b.AsInt16() - ref1_16b.AsInt16();

            // First pass.
-            FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
+            FTransformPass1Vector128(row01, row23, out Vector128<int> v01, out Vector128<int> v32);

            // Second pass.
-            FTransformPass2SSE2(v01, v32, output);
+            FTransformPass2Vector128(v01, v32, output);
        }
        else
        {
@ -517,88 +510,88 @@ internal static unsafe class Vp8Encoding
        }
    }

-    public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
+    public static void FTransformPass1Vector128(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
    {
        // *in01 = 00 01 10 11 02 03 12 13
        // *in23 = 20 21 30 31 22 23 32 33
-        Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
-        Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);
+        Vector128<short> shuf01_p = Vector128_.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
+        Vector128<short> shuf32_p = Vector128_.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);

        // 00 01 10 11 03 02 13 12
        // 20 21 30 31 23 22 33 32
-        Vector128<long> s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
-        Vector128<long> s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+        Vector128<long> s01 = Vector128_.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+        Vector128<long> s32 = Vector128_.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());

        // 00 01 10 11 20 21 30 31
        // 03 02 13 12 23 22 33 32
-        Vector128<short> a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16());
-        Vector128<short> a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16());
+        Vector128<short> a01 = s01.AsInt16() + s32.AsInt16();
+        Vector128<short> a32 = s01.AsInt16() - s32.AsInt16();

        // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
        // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

        // [ (a0 + a1) << 3, ... ]
-        Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
+        Vector128<int> tmp0 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p

        // [ (a0 - a1) << 3, ... ]
-        Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16());        // K88m
-        Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16());   // K5352_2217p
-        Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16());   // K5352_2217m
-        Vector128<int> tmp12 = Sse2.Add(tmp11, Vector128.Create(1812));
-        Vector128<int> tmp32 = Sse2.Add(tmp31, Vector128.Create(937));
-        Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
-        Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
-        Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
-        Vector128<short> s12 = Sse2.PackSignedSaturate(tmp1, tmp3);
-        Vector128<short> slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
-        Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
-        Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
-        out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
-        out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MMShuffle1032);
+        Vector128<int> tmp2 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16());        // K88m
+        Vector128<int> tmp11 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16());   // K5352_2217p
+        Vector128<int> tmp31 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16());   // K5352_2217m
+        Vector128<int> tmp12 = tmp11 + Vector128.Create(1812);
+        Vector128<int> tmp32 = tmp31 + Vector128.Create(937);
+        Vector128<int> tmp1 = Vector128.ShiftRightArithmetic(tmp12, 9);
+        Vector128<int> tmp3 = Vector128.ShiftRightArithmetic(tmp32, 9);
+        Vector128<short> s03 = Vector128_.PackSignedSaturate(tmp0, tmp2);
+        Vector128<short> s12 = Vector128_.PackSignedSaturate(tmp1, tmp3);
+        Vector128<short> slo = Vector128_.UnpackLow(s03, s12); // 0 1 0 1 0 1...
+        Vector128<short> shi = Vector128_.UnpackHigh(s03, s12); // 2 3 2 3 2 3
+        Vector128<int> v23 = Vector128_.UnpackHigh(slo.AsInt32(), shi.AsInt32());
+        out01 = Vector128_.UnpackLow(slo.AsInt32(), shi.AsInt32());
+        out32 = Vector128_.ShuffleNative(v23, SimdUtils.Shuffle.MMShuffle1032);
    }

-    public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
+    public static void FTransformPass2Vector128(Vector128<int> v01, Vector128<int> v32, Span<short> output)
    {
        // Same operations are done on the (0,3) and (1,2) pairs.
        // a3 = v0 - v3
        // a2 = v1 - v2
-        Vector128<short> a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16());
-        Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
+        Vector128<short> a32 = v01.AsInt16() - v32.AsInt16();
+        Vector128<long> a22 = Vector128_.UnpackHigh(a32.AsInt64(), a32.AsInt64());

-        Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
-        Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16());  // K5352_2217
-        Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16());  // K2217_5352
-        Vector128<int> d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16)));  // K12000PlusOne
-        Vector128<int> d3 = Sse2.Add(c3, Vector128.Create(51000));
-        Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
-        Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
+        Vector128<short> b23 = Vector128_.UnpackLow(a22.AsInt16(), a32.AsInt16());
+        Vector128<int> c1 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16());  // K5352_2217
+        Vector128<int> c3 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16());  // K2217_5352
+        Vector128<int> d1 = c1 + Vector128.Create(12000 + (1 << 16));  // K12000PlusOne
+        Vector128<int> d3 = c3 + Vector128.Create(51000);
+        Vector128<int> e1 = Vector128.ShiftRightArithmetic(d1, 16);
+        Vector128<int> e3 = Vector128.ShiftRightArithmetic(d3, 16);

        // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
        // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
-        Vector128<short> f1 = Sse2.PackSignedSaturate(e1, e1);
-        Vector128<short> f3 = Sse2.PackSignedSaturate(e3, e3);
+        Vector128<short> f1 = Vector128_.PackSignedSaturate(e1, e1);
+        Vector128<short> f3 = Vector128_.PackSignedSaturate(e3, e3);

        // g1 = f1 + (a3 != 0);
        // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
        // desired (0, 1), we add one earlier through k12000_plus_one.
        // -> g1 = f1 + 1 - (a3 == 0)
-        Vector128<short> g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128<short>.Zero));
+        Vector128<short> g1 = f1 + Vector128.Equals(a32, Vector128<short>.Zero);

        // a0 = v0 + v3
        // a1 = v1 + v2
-        Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
-        Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7));
-        Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
-        Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
-        Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);
+        Vector128<short> a01 = v01.AsInt16() + v32.AsInt16();
+        Vector128<short> a01Plus7 = a01.AsInt16() + Vector128.Create((short)7);
+        Vector128<short> a11 = Vector128_.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
+        Vector128<short> c0 = a01Plus7 + a11;
+        Vector128<short> c2 = a01Plus7 - a11;

        // d0 = (a0 + a1 + 7) >> 4;
        // d2 = (a0 - a1 + 7) >> 4;
-        Vector128<short> d0 = Sse2.ShiftRightArithmetic(c0, 4);
-        Vector128<short> d2 = Sse2.ShiftRightArithmetic(c2, 4);
+        Vector128<short> d0 = Vector128.ShiftRightArithmetic(c0, 4);
+        Vector128<short> d2 = Vector128.ShiftRightArithmetic(c2, 4);

-        Vector128<long> d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64());
-        Vector128<long> d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64());
+        Vector128<long> d0g1 = Vector128_.UnpackLow(d0.AsInt64(), g1.AsInt64());
+        Vector128<long> d2f3 = Vector128_.UnpackLow(d2.AsInt64(), f3.AsInt64());

        ref short outputRef = ref MemoryMarshal.GetReference(output);
        Unsafe.As<short, Vector128<short>>(ref outputRef) = d0g1.AsInt16();
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@ -5,7 +5,7 @@ using System.Buffers;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -29,9 +29,9 @@ internal static class YuvConversion
    //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
    public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
    {
-        if (Sse41.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+            UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
        }
        else
        {
@ -107,7 +107,7 @@ internal static class YuvConversion
    //
    // Then m can be written as
    // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
-    private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
+    private static void UpSampleVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
    {
        const int xStep = 3;
        Array.Clear(uvBuffer);
@ -138,18 +138,18 @@ internal static class YuvConversion
        {
            for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
            {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
            }
        }
        else
        {
            for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
            {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
            }
        }

@ -161,18 +161,18 @@ internal static class YuvConversion
            Span<byte> tmpBottomDst = tmpTopDst[(4 * 32)..];
            Span<byte> tmpTop = tmpBottomDst[(4 * 32)..];
            Span<byte> tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
-            UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
-            UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
+            UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
+            UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);

            topY[pos..len].CopyTo(tmpTop);
            if (!bottomY.IsEmpty)
            {
                bottomY[pos..len].CopyTo(tmpBottom);
-                ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
            }
            else
            {
-                ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
            }

            tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@ -184,7 +184,7 @@ internal static class YuvConversion
    }

    // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
-    private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
+    private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span<byte> output)
    {
        // Load inputs.
        Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
@ -192,28 +192,28 @@ internal static class YuvConversion
        Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
        Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));

-        Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
-        Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
-        Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
+        Vector128<byte> s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
+        Vector128<byte> t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
+        Vector128<byte> st = s ^ t; // st = s^t

-        Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
-        Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
+        Vector128<byte> ad = a ^ d; // ad = a^d
+        Vector128<byte> bc = b ^ c; // bc = b^c

-        Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
-        Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
-        Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
-        Vector128<byte> t4 = Sse2.Average(s, t);
-        Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+        Vector128<byte> t1 = ad | bc; // (a^d) | (b^c)
+        Vector128<byte> t2 = t1 | st; // (a^d) | (b^c) | (s^t)
+        Vector128<byte> t3 = t2 & Vector128.Create((byte)1); // ((a^d) | (b^c) | (s^t)) & 1
+        Vector128<byte> t4 = Vector128_.Average(s, t);
+        Vector128<byte> k = t4 - t3; // k = (a + b + c + d) / 4

-        Vector128<byte> diag1 = GetM(k, st, bc, t);
-        Vector128<byte> diag2 = GetM(k, st, ad, s);
+        Vector128<byte> diag1 = GetMVector128(k, st, bc, t);
+        Vector128<byte> diag2 = GetMVector128(k, st, ad, s);

        // Pack the alternate pixels.
-        PackAndStore(a, b, diag1, diag2, output); // store top.
-        PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
+        PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
+        PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
    }

-    private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
+    private static void UpSampleLastBlockVector128(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
    {
        Span<byte> r1 = stackalloc byte[17];
        Span<byte> r2 = stackalloc byte[17];
@ -230,27 +230,27 @@ internal static class YuvConversion

        ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
        ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
-        UpSample32Pixels(ref r1Ref, ref r2Ref, output);
+        UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
    }

    // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
-    private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
+    private static Vector128<byte> GetMVector128(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
    {
-        Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
-        Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
-        Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
-        Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
-        Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
+        Vector128<byte> tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
+        Vector128<byte> tmp1 = ij & st; // (ij) & (s^t)
+        Vector128<byte> tmp2 = k ^ input; // (k^in)
+        Vector128<byte> tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
+        Vector128<byte> tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction

-        return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
+        return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
    }

-    private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
+    private static void PackAndStoreVector128(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
    {
-        Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
-        Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
-        Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
-        Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
+        Vector128<byte> ta = Vector128_.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
+        Vector128<byte> tb = Vector128_.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
+        Vector128<byte> t1 = Vector128_.UnpackLow(ta, tb);
+        Vector128<byte> t2 = Vector128_.UnpackHigh(ta, tb);

        ref byte output0Ref = ref MemoryMarshal.GetReference(output);
        ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@ -562,41 +562,42 @@ internal static class YuvConversion
    }

    [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
+    private static void ConvertYuvToBgrVector128(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);

    [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+    private static void ConvertYuvToBgrWithBottomYVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
    {
-        YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
-        YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
+        YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
+        YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
    }

-    private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
+    private static void YuvToBgrVector128(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
    {
        ref byte yRef = ref MemoryMarshal.GetReference(y);
        ref byte uRef = ref MemoryMarshal.GetReference(u);
        ref byte vRef = ref MemoryMarshal.GetReference(v);
-        ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
+        ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);

        // Cast to 8b and store as BBBBGGGGRRRR.
-        Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
-        Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
-        Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
-        Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
-        Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
-        Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
+        Vector128<byte> bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
+        Vector128<byte> bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
+        Vector128<byte> bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
+        Vector128<byte> bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
+        Vector128<byte> bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
+        Vector128<byte> bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);

        // Pack as BGRBGRBGRBGR.
-        PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
+        PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
    }

    // Pack the planar buffers
    // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-    private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
+    private static void PlanarTo24bVector128(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
    {
        // The input is 6 registers of sixteen 8b but for the sake of explanation,
        // let's take 6 registers of four 8b values.
@ -612,7 +613,7 @@ internal static class YuvConversion
        //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7

        // Process R.
-        ChannelMixing(
+        ChannelMixingVector128(
            input0,
            input1,
            Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5),        // PlanarTo24Shuffle0
@ -627,7 +628,7 @@ internal static class YuvConversion

        // Process G.
        // Same as before, just shifted to the left by one and including the right padding.
-        ChannelMixing(
+        ChannelMixingVector128(
            input2,
            input3,
            Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255),      // PlanarTo24Shuffle3
@ -641,7 +642,7 @@ internal static class YuvConversion
            out Vector128<byte> g5);

        // Process B.
-        ChannelMixing(
+        ChannelMixingVector128(
            input4,
            input5,
            Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255),     // PlanarTo24Shuffle6
@ -655,24 +656,24 @@ internal static class YuvConversion
            out Vector128<byte> b5);

        // OR the different channels.
-        Vector128<byte> rg0 = Sse2.Or(r0, g0);
-        Vector128<byte> rg1 = Sse2.Or(r1, g1);
-        Vector128<byte> rg2 = Sse2.Or(r2, g2);
-        Vector128<byte> rg3 = Sse2.Or(r3, g3);
-        Vector128<byte> rg4 = Sse2.Or(r4, g4);
-        Vector128<byte> rg5 = Sse2.Or(r5, g5);
+        Vector128<byte> rg0 = r0 | g0;
+        Vector128<byte> rg1 = r1 | g1;
+        Vector128<byte> rg2 = r2 | g2;
+        Vector128<byte> rg3 = r3 | g3;
+        Vector128<byte> rg4 = r4 | g4;
+        Vector128<byte> rg5 = r5 | g5;

        ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
-        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = rg0 | b0;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
    }

    // Shuffles the input buffer as A0 0 0 A1 0 0 A2
-    private static void ChannelMixing(
+    private static void ChannelMixingVector128(
        Vector128<byte> input0,
        Vector128<byte> input1,
        Vector128<byte> shuffle0,
@ -685,53 +686,53 @@ internal static class YuvConversion
        out Vector128<byte> output4,
        out Vector128<byte> output5)
    {
-        output0 = Ssse3.Shuffle(input0, shuffle0);
-        output1 = Ssse3.Shuffle(input0, shuffle1);
-        output2 = Ssse3.Shuffle(input0, shuffle2);
-        output3 = Ssse3.Shuffle(input1, shuffle0);
-        output4 = Ssse3.Shuffle(input1, shuffle1);
-        output5 = Ssse3.Shuffle(input1, shuffle2);
+        output0 = Vector128_.ShuffleNative(input0, shuffle0);
+        output1 = Vector128_.ShuffleNative(input0, shuffle1);
+        output2 = Vector128_.ShuffleNative(input0, shuffle2);
+        output3 = Vector128_.ShuffleNative(input1, shuffle0);
+        output4 = Vector128_.ShuffleNative(input1, shuffle1);
+        output5 = Vector128_.ShuffleNative(input1, shuffle2);
    }

    // Converts YUV444 samples to R/G/B using the 14-bit fixed-point BT.601 formulas listed below.
    // Each input is read as one 16-byte vector starting at the given reference; each output is a Vector128<short>.
    // NOTE(review): this comment used to say "32 samples", but only the low half of each load is unpacked and
    // every output has eight 16-bit lanes, so this appears to convert 8 samples per call — confirm.
    // NOTE(review): the loads still read a full 16 bytes from y, u and v; presumably callers guarantee that — confirm.
-    private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
+    private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
    {
        // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
        // Interleaving with zero as the FIRST operand puts each sample in the high byte of its 16-bit lane.
        Vector128<byte> y0 = Unsafe.As<byte, Vector128<byte>>(ref y);
        Vector128<byte> u0 = Unsafe.As<byte, Vector128<byte>>(ref u);
        Vector128<byte> v0 = Unsafe.As<byte, Vector128<byte>>(ref v);
-        y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
-        u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
-        v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
+        y0 = Vector128_.UnpackLow(Vector128<byte>.Zero, y0);
+        u0 = Vector128_.UnpackLow(Vector128<byte>.Zero, u0);
+        v0 = Vector128_.UnpackLow(Vector128<byte>.Zero, v0);

        // These constants are 14b fixed-point version of ITU-R BT.601 constants.
        // R = (19077 * y             + 26149 * v - 14234) >> 6
        // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
        // B = (19077 * y + 33050 * u             - 17685) >> 6
-        var k19077 = Vector128.Create((ushort)19077);
-        var k26149 = Vector128.Create((ushort)26149);
-        var k14234 = Vector128.Create((ushort)14234);
+        Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
+        Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
+        Vector128<ushort> k14234 = Vector128.Create((ushort)14234);

        // MultiplyHigh (Sse2.MultiplyHigh on the removed lines) keeps the upper 16 bits of each 16x16 product.
        // With the samples pre-shifted by 8, every term below is therefore "coefficient * sample >> 8".
-        Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
-        Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
-        Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
-        Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
+        Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
+        Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
+        Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
+        Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));

        // Red: luma term minus the bias, plus the V term.
-        Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
-        Vector128<ushort> r2 = Sse2.Add(r1, r0);
+        Vector128<ushort> r1 = y1.AsUInt16() - k14234;
+        Vector128<ushort> r2 = r1 + r0;

        // Green: luma term plus the bias, minus the combined U and V terms.
-        Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
-        Vector128<ushort> g3 = Sse2.Add(g0, g1);
-        Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+        Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
+        Vector128<ushort> g3 = g0 + g1;
+        Vector128<ushort> g4 = g2 - g3;

        // Blue: the byte pair (26, 129) reinterpreted as a little-endian ushort is 26 + 129 * 256 = 33050,
        // the U coefficient from the formula above. Saturating add/subtract is used for this channel only.
-        Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
-        Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
-        Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
+        Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
+        Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
+        Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));

        // Use logical shift for B2, which can be larger than 32767.
        // R and G can be negative (see the ranges below), so they use an arithmetic shift instead.
-        r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
-        g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
-        b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
+        r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+        g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+        b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
    }

    [MethodImpl(InliningOptions.ShortMethod)]
--- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
+++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
@ -3,7 +3,7 @@

 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp.Formats.Webp;
@ -20,7 +20,7 @@ internal static class WebpCommonUtils
    /// <returns>Returns true if alpha has non-0xff values.</returns>
    public static unsafe bool CheckNonOpaque(ReadOnlySpan<Bgra32> row)
    {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
        {
            ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
            int i = 0;
@ -32,20 +32,20 @@ internal static class WebpCommonUtils

                for (; i + 128 <= length; i += 128)
                {
-                    Vector256<byte> a0 = Avx.LoadVector256(src + i).AsByte();
-                    Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
-                    Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
-                    Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
-                    Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
-                    Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
-                    Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
-                    Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
-                    Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
-                    Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
-                    Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
-                    Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
-                    int mask = Avx2.MoveMask(bits);
-                    if (mask != -1)
+                    Vector256<byte> a0 = Vector256.Load(src + i).AsByte();
+                    Vector256<byte> a1 = Vector256.Load(src + i + 32).AsByte();
+                    Vector256<byte> a2 = Vector256.Load(src + i + 64).AsByte();
+                    Vector256<byte> a3 = Vector256.Load(src + i + 96).AsByte();
+                    Vector256<int> b0 = (a0 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b1 = (a1 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b2 = (a2 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b3 = (a3 & alphaMaskVector256).AsInt32();
+                    Vector256<short> c0 = Vector256_.PackSignedSaturate(b0, b1).AsInt16();
+                    Vector256<short> c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16();
+                    Vector256<byte> d = Vector256_.PackSignedSaturate(c0, c1).AsByte();
+                    Vector256<byte> bits = Vector256.Equals(d, all0x80Vector256);
+                    uint mask = bits.ExtractMostSignificantBits();
+                    if (mask != 0xFFFF_FFFF)
                    {
                        return true;
                    }
@ -53,7 +53,7 @@ internal static class WebpCommonUtils

                for (; i + 64 <= length; i += 64)
                {
-                    if (IsNoneOpaque64Bytes(src, i))
+                    if (IsNoneOpaque64BytesVector128(src, i))
                    {
                        return true;
                    }
@ -61,7 +61,7 @@ internal static class WebpCommonUtils

                for (; i + 32 <= length; i += 32)
                {
-                    if (IsNoneOpaque32Bytes(src, i))
+                    if (IsNonOpaque32BytesVector128(src, i))
                    {
                        return true;
                    }
@ -76,7 +76,7 @@ internal static class WebpCommonUtils
                }
            }
        }
-        else if (Sse2.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
        {
            ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
            int i = 0;
@ -85,7 +85,7 @@ internal static class WebpCommonUtils
            {
                for (; i + 64 <= length; i += 64)
                {
-                    if (IsNoneOpaque64Bytes(src, i))
+                    if (IsNoneOpaque64BytesVector128(src, i))
                    {
                        return true;
                    }
@ -93,7 +93,7 @@ internal static class WebpCommonUtils

                for (; i + 32 <= length; i += 32)
                {
-                    if (IsNoneOpaque32Bytes(src, i))
+                    if (IsNonOpaque32BytesVector128(src, i))
                    {
                        return true;
                    }
@ -122,38 +122,38 @@ internal static class WebpCommonUtils
        return false;
    }

-    private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
+    /// <summary>
+    /// Checks the 64 bytes starting at <paramref name="src"/> + <paramref name="i"/>, read as 16 four-byte
+    /// pixels with alpha in the last byte of each pixel, for any alpha value other than 0xFF.
+    /// </summary>
+    /// <param name="src">Pointer to the start of the pixel row bytes.</param>
+    /// <param name="i">Byte offset of the first pixel to inspect; 64 bytes are read from this offset.</param>
+    /// <returns>True if at least one of the 16 alpha bytes is not 0xFF.</returns>
+    /// <remarks>
+    /// NOTE(review): the name says "None" while the 32-byte sibling says "Non"; kept as-is because the
+    /// call sites use this spelling.
+    /// </remarks>
+    private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i)
    {
        Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);

-        Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
-        Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-        Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
-        Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
-        Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-        Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-        Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
-        Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
-        Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
-        Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
-        Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
-        Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Sse2.MoveMask(bits);
-        return mask != 0xFFFF;
+        Vector128<byte> a0 = Vector128.Load(src + i);
+        Vector128<byte> a1 = Vector128.Load(src + i + 16);
+        Vector128<byte> a2 = Vector128.Load(src + i + 32);
+        Vector128<byte> a3 = Vector128.Load(src + i + 48);
+
+        // A byte of the AND of all four vectors is 0xFF only when it is 0xFF in every one of them, so after
+        // masking, each alpha byte is 0xFF exactly when the four pixels sharing that lane are fully opaque.
+        // The bytes are compared directly rather than packed with signed saturation: the masked alpha is the
+        // top byte of each 32-bit lane, so every alpha in 0x80..0xFF is negative and saturates to the same
+        // 0x80, which made values such as 0xFE indistinguishable from 0xFF.
+        Vector128<byte> alpha = a0 & a1 & a2 & a3 & alphaMask;
+
+        // operator != is true when any element differs, i.e. when some alpha byte is not 0xFF.
+        return alpha != alphaMask;
    }

-    private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
+    /// <summary>
+    /// Checks the 32 bytes starting at <paramref name="src"/> + <paramref name="i"/>, read as 8 four-byte
+    /// pixels with alpha in the last byte of each pixel, for any alpha value other than 0xFF.
+    /// </summary>
+    /// <param name="src">Pointer to the start of the pixel row bytes.</param>
+    /// <param name="i">Byte offset of the first pixel to inspect; 32 bytes are read from this offset.</param>
+    /// <returns>True if at least one of the 8 alpha bytes is not 0xFF.</returns>
+    private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i)
    {
        Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);

-        Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
-        Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-        Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-        Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-        Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
-        Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
-        Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Sse2.MoveMask(bits);
-        return mask != 0xFFFF;
+        Vector128<byte> a0 = Vector128.Load(src + i);
+        Vector128<byte> a1 = Vector128.Load(src + i + 16);
+
+        // An alpha byte of (a0 & a1) is 0xFF only when it is 0xFF in both vectors. The bytes are compared
+        // directly rather than packed with signed saturation: the masked alpha is the top byte of each
+        // 32-bit lane, so every alpha in 0x80..0xFF is negative and saturates to the same 0x80, which made
+        // values such as 0xFE indistinguishable from 0xFF.
+        Vector128<byte> alpha = a0 & a1 & alphaMask;
+
+        // operator != is true when any element differs, i.e. when some alpha byte is not 0xFF.
+        return alpha != alphaMask;
    }
 }
--- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
    public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);

    // Runs the blue-transform test body through the feature runner with HwIntrinsics.DisableSSE41.
    // NOTE(review): the new name says "WithoutVector128", but the switch passed is still DisableSSE41,
    // which presumably only affects x86 — confirm this still exercises the intended fallback on Arm.
    [Fact]
-    public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
+    public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);

    // Runs the blue-transform test body through the feature runner with HwIntrinsics.DisableAVX2.
    // NOTE(review): the new name says "WithoutVector256", but the switch passed is still DisableAVX2,
    // which presumably only affects x86 — confirm the name matches what is actually disabled.
    [Fact]
-    public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
+    public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);

    // Baseline run: passes the red-transform test body to the feature runner with HwIntrinsics.AllowAll.
    [Fact]
    public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);

    // Runs the red-transform test body through the feature runner with HwIntrinsics.DisableSSE41.
    // NOTE(review): the new name says "WithoutVector128", but the switch passed is still DisableSSE41,
    // which presumably only affects x86 — confirm this still exercises the intended fallback on Arm.
    [Fact]
-    public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
+    public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);

    // Runs the red-transform test body through the feature runner with HwIntrinsics.DisableAVX2.
    // NOTE(review): the new name says "WithoutVector256", but the switch passed is still DisableAVX2,
    // which presumably only affects x86 — confirm the name matches what is actually disabled.
    [Fact]
-    public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
+    public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
 }