Merge branch 'main' into main

8 months ago · 33c055d125
51 changed files with 3697 additions and 1452 deletions
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@ -884,23 +884,6 @@ internal static class Numerics
        accumulator += intHigh;
    }

-    /// <summary>
-    /// Reduces elements of the vector into one sum.
-    /// </summary>
-    /// <param name="accumulator">The accumulator to reduce.</param>
-    /// <returns>The sum of all elements.</returns>
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static int ReduceSum(Vector128<int> accumulator)
-    {
-        // Add odd to even.
-        Vector128<int> vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01));
-
-        // Add high to low.
-        vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
-
-        return Sse2.ConvertToInt32(vsum);
-    }
-
    /// <summary>
    /// Reduces elements of the vector into one sum.
    /// </summary>
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@ -66,9 +66,9 @@ internal static partial class SimdUtils
            ref Span<float> destination,
            [ConstantExpected] byte control)
        {
-            if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
-                (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
-                 Vector128.IsHardwareAccelerated)
+            if (Vector512.IsHardwareAccelerated ||
+                Vector256.IsHardwareAccelerated ||
+                Vector128.IsHardwareAccelerated)
            {
                int remainder = 0;
                if (Vector512.IsHardwareAccelerated)
@ -112,9 +112,9 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
-                (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
-                (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
+            if (Vector512.IsHardwareAccelerated ||
+                Vector256.IsHardwareAccelerated ||
+                Vector128.IsHardwareAccelerated)
            {
                int remainder = 0;
                if (Vector512.IsHardwareAccelerated)
@ -158,7 +158,7 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                int remainder = source.Length % (Vector128<byte>.Count * 3);

@ -190,7 +190,7 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+            if (Vector128.IsHardwareAccelerated)
            {
                int remainder = source.Length % (Vector128<byte>.Count * 3);

@ -223,7 +223,7 @@ internal static partial class SimdUtils
            ref Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+            if (Vector128.IsHardwareAccelerated)
            {
                int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1);    // bit-hack for modulo

@ -249,7 +249,7 @@ internal static partial class SimdUtils
            Span<float> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
+            if (Vector512.IsHardwareAccelerated)
            {
                ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source));
                ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
@ -277,7 +277,7 @@ internal static partial class SimdUtils
                    }
                }
            }
-            else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
+            else if (Vector256.IsHardwareAccelerated)
            {
                ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
                ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
@ -341,7 +341,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
+            if (Vector512.IsHardwareAccelerated)
            {
                Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
@ -373,8 +373,13 @@ internal static partial class SimdUtils
                    }
                }
            }
-            else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
+            else if (Vector256.IsHardwareAccelerated)
            {
+                // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
+                // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
+                // so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
+                // for indexing within each lane, and ignores the upper bits unless bit 7 is set,
+                // this usage is guaranteed to remain within-lane and non-zeroing.
                Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
                Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@ -391,21 +396,21 @@ internal static partial class SimdUtils
                    ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                    ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);

-                    vd0 = Vector256_.ShuffleNative(vs0, mask);
-                    Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
-                    Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
-                    Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
+                    vd0 = Vector256_.ShufflePerLane(vs0, mask);
+                    Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
+                    Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
+                    Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
                }

                if (m > 0)
                {
                    for (nuint i = u; i < n; i++)
                    {
-                        Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
+                        Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
                    }
                }
            }
-            else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
+            else if (Vector128.IsHardwareAccelerated)
            {
                Span<byte> temp = stackalloc byte[Vector128<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
@ -445,9 +450,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated &&
-                Vector128_.SupportsShuffleNativeByte &&
-                Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
                Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@ -507,10 +510,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated &&
-                Vector128_.SupportsShuffleNativeByte &&
-                Vector128_.SupportsShiftByte &&
-                Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16();
                Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@ -553,10 +553,7 @@ internal static partial class SimdUtils
            Span<byte> destination,
            [ConstantExpected] byte control)
        {
-            if (Vector128.IsHardwareAccelerated &&
-                Vector128_.SupportsShuffleNativeByte &&
-                Vector128_.SupportsShiftByte &&
-                Vector128_.SupportsAlignRight)
+            if (Vector128.IsHardwareAccelerated)
            {
                Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
                Vector128<byte> maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

-using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 internal static class Vector256_
 #pragma warning restore SA1649 // File name should match first type name
 {
-    /// <summary>
-    /// Gets a value indicating whether shuffle byte operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeFloat
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx.IsSupported;
-    }
-
-    /// <summary>
-    /// Gets a value indicating whether shuffle byte operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeByte
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx2.IsSupported;
-    }
-
    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.
    /// </summary>
@ -47,15 +28,7 @@ internal static class Vector256_
    /// <returns>The <see cref="Vector256{Single}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector256<float> ShuffleNative(Vector256<float> vector, [ConstantExpected] byte control)
-    {
-        if (Avx.IsSupported)
-        {
-            return Avx.Shuffle(vector, vector, control);
-        }
-
-        ThrowUnreachableException();
-        return default;
-    }
+        => Avx.Shuffle(vector, vector, control);

    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.</summary>
@ -66,15 +39,17 @@ internal static class Vector256_
    /// </param>
    /// <returns>The <see cref="Vector256{Single}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
+    public static Vector256<byte> ShufflePerLane(Vector256<byte> vector, Vector256<byte> indices)
    {
        if (Avx2.IsSupported)
        {
            return Avx2.Shuffle(vector, indices);
        }

-        ThrowUnreachableException();
-        return default;
+        // Mirror the AVX2 path lane by lane: each 128-bit half of the vector is shuffled by the matching
+        // half of the indices (previously the lower indices were reused for the upper half, which only
+        // agrees with AVX2 when both index halves select the same in-lane positions).
+        // Keep only bit 7 and the low nibble of every index so the selection stays inside its lane.
+        // NOTE(review): assumes Vector128_.ShuffleNative zeroes elements whose index has bit 7 set — confirm.
+        Vector128<byte> laneMask = Vector128.Create((byte)0x8F);
+        Vector128<byte> lower = Vector128_.ShuffleNative(vector.GetLower(), indices.GetLower() & laneMask);
+        Vector128<byte> upper = Vector128_.ShuffleNative(vector.GetUpper(), indices.GetUpper() & laneMask);
+        return Vector256.Create(lower, upper);
    }

    /// <summary>
@ -162,6 +137,54 @@ internal static class Vector256_
        return (vm0 * vm1) - vs;
    }

+    /// <summary>
+    /// Multiplies packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate signed 32-bit integers, then horizontally adds adjacent pairs of those intermediates and
+    /// packs the sums into the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to multiply and add.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to multiply and add.
+    /// </param>
+    /// <returns>
+    /// A vector of signed 32-bit integers, each the sum of the products of one adjacent pair of 16-bit elements.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> MultiplyAddAdjacent(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.MultiplyAddAdjacent(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()),
+            Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper()));
+    }
+
+    /// <summary>
+    /// Packs signed 32-bit integers to unsigned 16-bit integers using unsigned saturation.
+    /// Packing is performed independently within each 128-bit lane: every lane of the result holds the
+    /// four packed values from the matching lane of <paramref name="left"/> followed by the four packed
+    /// values from the matching lane of <paramref name="right"/>.
+    /// </summary>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{UInt16}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<ushort> PackUnsignedSaturate(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackUnsignedSaturate(left, right);
+        }
+
+        // Saturate by clamping to the ushort range before narrowing.
+        Vector256<int> min = Vector256.Create((int)ushort.MinValue);
+        Vector256<int> max = Vector256.Create((int)ushort.MaxValue);
+        Vector256<uint> leftClamped = Clamp(left, min, max).AsUInt32();
+        Vector256<uint> rightClamped = Clamp(right, min, max).AsUInt32();
+
+        // Narrow lane by lane so the element order matches the AVX2 instruction, which packs within each
+        // 128-bit lane. A single Vector256.Narrow would instead place all of left before all of right.
+        return Vector256.Create(
+            Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
+            Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
+    }
+
    /// <summary>
    /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
    /// </summary>
@ -183,6 +206,27 @@ internal static class Vector256_
        return Vector256.Narrow(lefClamped, rightClamped);
    }

+    /// <summary>
+    /// Packs signed 16-bit integers to signed 8-bit integers using signed saturation.
+    /// Packing is performed independently within each 128-bit lane: every lane of the result holds the
+    /// eight packed values from the matching lane of <paramref name="left"/> followed by the eight packed
+    /// values from the matching lane of <paramref name="right"/>.
+    /// </summary>
+    /// <param name="left">The left hand source vector.</param>
+    /// <param name="right">The right hand source vector.</param>
+    /// <returns>The <see cref="Vector256{SByte}"/>.</returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<sbyte> PackSignedSaturate(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.PackSignedSaturate(left, right);
+        }
+
+        // Saturate by clamping to the sbyte range before narrowing.
+        Vector256<short> min = Vector256.Create((short)sbyte.MinValue);
+        Vector256<short> max = Vector256.Create((short)sbyte.MaxValue);
+        Vector256<short> leftClamped = Clamp(left, min, max);
+        Vector256<short> rightClamped = Clamp(right, min, max);
+
+        // Narrow lane by lane so the element order matches the AVX2 instruction, which packs within each
+        // 128-bit lane. A single Vector256.Narrow would instead place all of left before all of right.
+        return Vector256.Create(
+            Vector128.Narrow(leftClamped.GetLower(), rightClamped.GetLower()),
+            Vector128.Narrow(leftClamped.GetUpper(), rightClamped.GetUpper()));
+    }
+
    /// <summary>
    /// Restricts a vector between a minimum and a maximum value.
    /// </summary>
@ -210,6 +254,211 @@ internal static class Vector256_
        return Vector256.WidenLower(value.ToVector256());
    }

-    [DoesNotReturn]
-    private static void ThrowUnreachableException() => throw new UnreachableException();
+    /// <summary>
+    /// Multiplies the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate 32-bit integers, and stores the low 16 bits of each intermediate integer in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 16-bit integers to multiply.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 16-bit integers to multiply.
+    /// </param>
+    /// <returns>
+    /// A vector containing the low 16 bits of the products of the packed 16-bit integers
+    /// from <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> MultiplyLow(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.MultiplyLow(left, right);
+        }
+
+        // Fallback: widen each short vector into a lower and an upper int vector so products cannot overflow.
+        (Vector256<int> leftLower, Vector256<int> leftUpper) = Vector256.Widen(left);
+        (Vector256<int> rightLower, Vector256<int> rightUpper) = Vector256.Widen(right);
+
+        // Elementwise multiply: each int element now holds the full 32-bit product.
+        Vector256<int> prodLo = leftLower * rightLower;
+        Vector256<int> prodHi = leftUpper * rightUpper;
+
+        // Narrow the two int vectors back into one short vector, keeping the low 16 bits of each product.
+        return Vector256.Narrow(prodLo, prodHi);
+    }
+
+    /// <summary>
+    /// Multiplies the packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate 32-bit integers, and stores the high 16 bits of each intermediate integer in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to multiply.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to multiply.
+    /// </param>
+    /// <returns>
+    /// A vector containing the high 16 bits of the products of the packed 16-bit integers
+    /// from <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> MultiplyHigh(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.MultiplyHigh(left, right);
+        }
+
+        // Fallback: widen each short vector into a lower and an upper int vector so products cannot overflow.
+        (Vector256<int> leftLower, Vector256<int> leftUpper) = Vector256.Widen(left);
+        (Vector256<int> rightLower, Vector256<int> rightUpper) = Vector256.Widen(right);
+
+        // Elementwise multiply: each int element now holds the full 32-bit product.
+        Vector256<int> prodLo = leftLower * rightLower;
+        Vector256<int> prodHi = leftUpper * rightUpper;
+
+        // Shift right by 16 bits to move the high word of each product into the low 16 bits.
+        prodLo >>= 16;
+        prodHi >>= 16;
+
+        // Narrow the two int vectors back into one short vector; only the shifted-down high words remain.
+        return Vector256.Narrow(prodLo, prodHi);
+    }
+
+    /// <summary>
+    /// Unpacks and interleaves 32-bit integers from the low half of each 128-bit lane of <paramref name="left"/>
+    /// and <paramref name="right"/> and stores them in the matching lane of the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 32-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 32-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 32-bit integers from the low
+    /// lane halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> UnpackLow(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackLow(left, right);
+        }
+
+        Vector128<int> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+        Vector128<int> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Unpacks and interleaves 8-bit integers from the high half of each 128-bit lane of <paramref name="left"/>
+    /// and <paramref name="right"/> and stores them in the matching lane of the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the high half of each lane.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the high half of each lane.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the high
+    /// lane halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackHigh(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackHigh(left, right);
+        }
+
+        Vector128<byte> lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower());
+        Vector128<byte> hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Unpacks and interleaves 8-bit integers from the low half of each 128-bit lane of <paramref name="left"/>
+    /// and <paramref name="right"/> and stores them in the matching lane of the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the low
+    /// lane halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackLow(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackLow(left, right);
+        }
+
+        Vector128<byte> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+        Vector128<byte> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Subtracts packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+    /// in <paramref name="left"/> using saturation, and stores the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed signed 16-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> SubtractSaturate(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.SubtractSaturate(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+    }
+
+    /// <summary>
+    /// Subtracts packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
+    /// in <paramref name="left"/> using saturation, and stores the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 8-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 8-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed unsigned 8-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> SubtractSaturate(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.SubtractSaturate(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+    }
 }
--- a/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector512Utilities.cs
@ -1,7 +1,6 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

-using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.Intrinsics;
@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
 internal static class Vector512_
 #pragma warning restore SA1649 // File name should match first type name
 {
-    /// <summary>
-    /// Gets a value indicating whether shuffle float operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeFloat
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx512F.IsSupported;
-    }
-
-    /// <summary>
-    /// Gets a value indicating whether shuffle byte operations are supported.
-    /// </summary>
-    public static bool SupportsShuffleNativeByte
-    {
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
-        get => Avx512BW.IsSupported;
-    }
-
    /// <summary>
    /// Creates a new vector by selecting values from an input vector using the control.
    /// </summary>
@ -47,15 +28,7 @@ internal static class Vector512_
    /// <returns>The <see cref="Vector512{Single}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<float> ShuffleNative(Vector512<float> vector, [ConstantExpected] byte control)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.Shuffle(vector, vector, control);
-        }
-
-        ThrowUnreachableException();
-        return default;
-    }
+        => Avx512F.Shuffle(vector, vector, control);

    /// <summary>
    /// Creates a new vector by selecting values from an input vector using a set of indices.
@ -73,8 +46,7 @@ internal static class Vector512_
            return Avx512BW.Shuffle(vector, indices);
        }

-        ThrowUnreachableException();
-        return default;
+        return Vector512.Shuffle(vector, indices);
    }

    /// <summary>
@ -85,25 +57,7 @@ internal static class Vector512_
    /// <returns>The <see cref="Vector128{Int32}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<int> ConvertToInt32RoundToEven(Vector512<float> vector)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.ConvertToVector512Int32(vector);
-        }
-
-        if (Avx.IsSupported)
-        {
-            Vector256<int> lower = Avx.ConvertToVector256Int32(vector.GetLower());
-            Vector256<int> upper = Avx.ConvertToVector256Int32(vector.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        Vector512<float> sign = vector & Vector512.Create(-0.0f);
-        Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608.0f);
-
-        val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
-        return Vector512.ConvertToInt32(val_2p23_f32 | sign);
-    }
+        => Avx512F.ConvertToVector512Int32(vector);

    /// <summary>
    /// Rounds all values in <paramref name="vector"/> to the nearest integer
@ -112,28 +66,11 @@ internal static class Vector512_
    /// <param name="vector">The vector</param>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<float> RoundToNearestInteger(Vector512<float> vector)
-    {
-        if (Avx512F.IsSupported)
-        {
-            // imm8 = 0b1000:
-            //   imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
-            //   imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
-            return Avx512F.RoundScale(vector, 0b0000_1000);
-        }
-
-        if (Avx.IsSupported)
-        {
-            Vector256<float> lower = Avx.RoundToNearestInteger(vector.GetLower());
-            Vector256<float> upper = Avx.RoundToNearestInteger(vector.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        Vector512<float> sign = vector & Vector512.Create(-0F);
-        Vector512<float> val_2p23_f32 = sign | Vector512.Create(8388608F);

-        val_2p23_f32 = (vector + val_2p23_f32) - val_2p23_f32;
-        return val_2p23_f32 | sign;
-    }
+          // imm8 = 0b1000:
+          //   imm8[7:4] = 0b0000 -> preserve 0 fractional bits (round to whole numbers)
+          //   imm8[3:0] = 0b1000 -> _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC (round to nearest even, suppress exceptions)
+          => Avx512F.RoundScale(vector, 0b0000_1000);

    /// <summary>
    /// Performs a multiplication and an addition of the <see cref="Vector512{Single}"/>.
@ -148,21 +85,7 @@ internal static class Vector512_
        Vector512<float> va,
        Vector512<float> vm0,
        Vector512<float> vm1)
-    {
-        if (Avx512F.IsSupported)
-        {
-            return Avx512F.FusedMultiplyAdd(vm0, vm1, va);
-        }
-
-        if (Fma.IsSupported)
-        {
-            Vector256<float> lower = Fma.MultiplyAdd(vm0.GetLower(), vm1.GetLower(), va.GetLower());
-            Vector256<float> upper = Fma.MultiplyAdd(vm0.GetUpper(), vm1.GetUpper(), va.GetUpper());
-            return Vector512.Create(lower, upper);
-        }
-
-        return va + (vm0 * vm1);
-    }
+        => Avx512F.FusedMultiplyAdd(vm0, vm1, va);

    /// <summary>
    /// Restricts a vector between a minimum and a maximum value.
@ -175,7 +98,4 @@ internal static class Vector512_
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector512<T> Clamp<T>(Vector512<T> value, Vector512<T> min, Vector512<T> max)
        => Vector512.Min(Vector512.Max(value, min), max);
-
-    [DoesNotReturn]
-    private static void ThrowUnreachableException() => throw new UnreachableException();
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.RgbScalar.cs
@ -75,6 +75,7 @@ internal abstract partial class JpegColorConverterBase

        internal static void ConvertFromRgb(ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
        {
+            // TODO: This doesn't seem correct. We should be scaling to the maximum value here.
            rLane.CopyTo(values.Component0);
            gLane.CopyTo(values.Component1);
            bLane.CopyTo(values.Component2);
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykScalar.cs
@ -0,0 +1,118 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Buffers;
+using System.Numerics;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.ColorProfiles;
+using SixLabors.ImageSharp.ColorProfiles.Icc;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    /// <summary>
+    /// Color converter for tiff images, which use the jpeg compression and CMYK colorspace.
+    /// </summary>
+    internal sealed class TiffCmykScalar : JpegColorConverterScalar
+    {
+        public TiffCmykScalar(int precision)
+            : base(JpegColorSpace.TiffCmyk, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlace(in ComponentValues values)
+        {
+            // Forward to the static routine, supplying this converter's sample maximum.
+            ConvertToRgbInPlace(in values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+        {
+            // Forward to the static ICC routine, supplying this converter's sample maximum.
+            ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // Forward to the static routine, supplying this converter's sample maximum.
+            ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
+        }
+
+        /// <summary>
+        /// Converts planar CMYK samples to RGB, overwriting the first three component planes.
+        /// </summary>
+        /// <param name="values">The component planes. Planes 0-2 are read and overwritten; plane 3 is only read.</param>
+        /// <param name="maxValue">The divisor used to normalize every incoming sample.</param>
+        public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue)
+        {
+            Span<float> cyanPlane = values.Component0;
+            Span<float> magentaPlane = values.Component1;
+            Span<float> yellowPlane = values.Component2;
+            Span<float> keyPlane = values.Component3;
+
+            // Multiply by the reciprocal rather than dividing each sample.
+            float normalize = 1 / maxValue;
+            int count = cyanPlane.Length;
+
+            for (int index = 0; index < count; index++)
+            {
+                float cyan = cyanPlane[index] * normalize;
+                float magenta = magentaPlane[index] * normalize;
+                float yellow = yellowPlane[index] * normalize;
+
+                // (1 - k) attenuates all three channels.
+                float keyFactor = 1 - (keyPlane[index] * normalize);
+
+                cyanPlane[index] = (1 - cyan) * keyFactor;
+                magentaPlane[index] = (1 - magenta) * keyFactor;
+                yellowPlane[index] = (1 - yellow) * keyFactor;
+            }
+        }
+
+        /// <summary>
+        /// Converts planar RGB samples to CMYK and writes them to the four component planes.
+        /// </summary>
+        /// <param name="values">The destination component planes (C, M, Y, K).</param>
+        /// <param name="maxValue">The factor the C, M and Y results are multiplied by.</param>
+        /// <param name="rLane">The red samples.</param>
+        /// <param name="gLane">The green samples.</param>
+        /// <param name="bLane">The blue samples.</param>
+        public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // NOTE(review): the source range is hard-coded to 255 and K is stored unscaled,
+            // whereas the vectorized variants use maxValue throughout - confirm behaviour for precision != 8.
+            const float sourceMax = 255F;
+
+            Span<float> cyanPlane = values.Component0;
+            Span<float> magentaPlane = values.Component1;
+            Span<float> yellowPlane = values.Component2;
+            Span<float> keyPlane = values.Component3;
+
+            int count = cyanPlane.Length;
+            for (int index = 0; index < count; index++)
+            {
+                float invertedR = sourceMax - rLane[index];
+                float invertedG = sourceMax - gLane[index];
+                float invertedB = sourceMax - bLane[index];
+                float key = MathF.Min(MathF.Min(invertedR, invertedG), invertedB);
+
+                // Full key means black: C, M and Y stay zero and the division below is skipped.
+                float cyan = 0F;
+                float magenta = 0F;
+                float yellow = 0F;
+
+                // Written as a negated >= so that a NaN key also takes this branch.
+                if (!(key >= sourceMax))
+                {
+                    float reciprocal = 1 / (sourceMax - key);
+                    cyan = (invertedR - key) * reciprocal;
+                    magenta = (invertedG - key) * reciprocal;
+                    yellow = (invertedB - key) * reciprocal;
+                }
+
+                cyanPlane[index] = cyan * maxValue;
+                magentaPlane[index] = magenta * maxValue;
+                yellowPlane[index] = yellow * maxValue;
+                keyPlane[index] = key;
+            }
+        }
+
+        /// <summary>
+        /// Converts planar CMYK samples to RGB through an ICC profile, overwriting the first three component planes.
+        /// </summary>
+        /// <param name="configuration">Supplies the memory allocator for the temporary interleaved buffer.</param>
+        /// <param name="profile">The source ICC profile.</param>
+        /// <param name="values">The component planes; planes 0-2 receive the RGB result.</param>
+        /// <param name="maxValue">The normalization value passed to the interleaving helper.</param>
+        public static void ConvertToRgbInPlaceWithIcc(Configuration configuration, IccProfile profile, in ComponentValues values, float maxValue)
+        {
+            // Four floats per pixel: one for each of the C, M, Y and K planes.
+            int pixelCount = values.Component0.Length;
+            using (IMemoryOwner<float> buffer = configuration.MemoryAllocator.Allocate<float>(pixelCount * 4))
+            {
+                Span<float> interleaved = buffer.Memory.Span;
+
+                Span<float> plane0 = values.Component0;
+                Span<float> plane1 = values.Component1;
+                Span<float> plane2 = values.Component2;
+                Span<float> plane3 = values.Component3;
+
+                PackedNormalizeInterleave4(plane0, plane1, plane2, plane3, interleaved, maxValue);
+
+                // The RGB output is written over the same buffer the CMYK input is read from.
+                Span<Cmyk> cmykPixels = MemoryMarshal.Cast<float, Cmyk>(interleaved);
+                Span<Rgb> rgbPixels = MemoryMarshal.Cast<float, Rgb>(interleaved).Slice(0, cmykPixels.Length);
+
+                ColorConversionOptions conversionOptions = new()
+                {
+                    SourceIccProfile = profile,
+                    TargetIccProfile = CompactSrgbV4Profile.Profile,
+                };
+                ColorProfileConverter profileConverter = new(conversionOptions);
+                profileConverter.Convert<Cmyk, Rgb>(cmykPixels, rgbPixels);
+
+                Span<Vector3> rgbVectors = MemoryMarshal.Cast<float, Vector3>(interleaved).Slice(0, cmykPixels.Length);
+                UnpackDeinterleave3(rgbVectors, plane0, plane1, plane2);
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector128.cs
@ -0,0 +1,99 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    internal sealed class TiffCmykVector128 : JpegColorConverterVector128
+    {
+        public TiffCmykVector128(int precision)
+            : base(JpegColorSpace.TiffCmyk, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlace(in ComponentValues values)
+        {
+            // View each component plane as a sequence of 128-bit float vectors.
+            ref Vector128<float> cyanBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector128<float> magentaBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector128<float> yellowBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector128<float> keyBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector128<float> one = Vector128<float>.One;
+            Vector128<float> normalize = Vector128.Create(1 / this.MaximumValue);
+
+            nuint vectorCount = values.Component0.Vector128Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                // (1 - k) attenuates all three channels.
+                Vector128<float> keyFactor = one - (Unsafe.Add(ref keyBase, index) * normalize);
+
+                ref Vector128<float> cyan = ref Unsafe.Add(ref cyanBase, index);
+                cyan = (one - (cyan * normalize)) * keyFactor;
+
+                ref Vector128<float> magenta = ref Unsafe.Add(ref magentaBase, index);
+                magenta = (one - (magenta * normalize)) * keyFactor;
+
+                ref Vector128<float> yellow = ref Unsafe.Add(ref yellowBase, index);
+                yellow = (one - (yellow * normalize)) * keyFactor;
+            }
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+        {
+            // The ICC path is delegated to the scalar TIFF CMYK implementation.
+            TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // Forward to the static routine, supplying this converter's sample maximum.
+            ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
+        }
+
+        /// <summary>
+        /// Converts planar RGB samples to CMYK using 128-bit vectors and writes the four component planes.
+        /// </summary>
+        /// <param name="values">The destination component planes (C, M, Y, K).</param>
+        /// <param name="maxValue">The value the RGB samples are subtracted from and the C, M, Y results are multiplied by.</param>
+        /// <param name="rLane">The red samples.</param>
+        /// <param name="gLane">The green samples.</param>
+        /// <param name="bLane">The blue samples.</param>
+        public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            ref Vector128<float> redBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(rLane));
+            ref Vector128<float> greenBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(gLane));
+            ref Vector128<float> blueBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(bLane));
+
+            ref Vector128<float> cyanBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector128<float> magentaBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector128<float> yellowBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector128<float> keyBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector128<float> max = Vector128.Create(maxValue);
+
+            nuint vectorCount = values.Component0.Vector128Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                Vector128<float> cyan = max - Unsafe.Add(ref redBase, index);
+                Vector128<float> magenta = max - Unsafe.Add(ref greenBase, index);
+                Vector128<float> yellow = max - Unsafe.Add(ref blueBase, index);
+                Vector128<float> key = Vector128.Min(cyan, Vector128.Min(magenta, yellow));
+
+                // Lanes where key == max divide by zero below; the mask forces those lanes to zero.
+                Vector128<float> notFullKey = ~Vector128.Equals(key, max);
+                Vector128<float> reciprocal = Vector128<float>.One / (max - key);
+
+                cyan = ((cyan - key) * reciprocal) & notFullKey;
+                magenta = ((magenta - key) * reciprocal) & notFullKey;
+                yellow = ((yellow - key) * reciprocal) & notFullKey;
+
+                Unsafe.Add(ref cyanBase, index) = cyan * max;
+                Unsafe.Add(ref magentaBase, index) = magenta * max;
+                Unsafe.Add(ref yellowBase, index) = yellow * max;
+                Unsafe.Add(ref keyBase, index) = key;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector256.cs
@ -0,0 +1,99 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    internal sealed class TiffCmykVector256 : JpegColorConverterVector256
+    {
+        public TiffCmykVector256(int precision)
+            : base(JpegColorSpace.TiffCmyk, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlace(in ComponentValues values)
+        {
+            // View each component plane as a sequence of 256-bit float vectors.
+            ref Vector256<float> cyanBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector256<float> magentaBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector256<float> yellowBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector256<float> keyBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector256<float> one = Vector256<float>.One;
+            Vector256<float> normalize = Vector256.Create(1 / this.MaximumValue);
+
+            nuint vectorCount = values.Component0.Vector256Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                // (1 - k) attenuates all three channels.
+                Vector256<float> keyFactor = one - (Unsafe.Add(ref keyBase, index) * normalize);
+
+                ref Vector256<float> cyan = ref Unsafe.Add(ref cyanBase, index);
+                cyan = (one - (cyan * normalize)) * keyFactor;
+
+                ref Vector256<float> magenta = ref Unsafe.Add(ref magentaBase, index);
+                magenta = (one - (magenta * normalize)) * keyFactor;
+
+                ref Vector256<float> yellow = ref Unsafe.Add(ref yellowBase, index);
+                yellow = (one - (yellow * normalize)) * keyFactor;
+            }
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+        {
+            // The ICC path is delegated to the scalar TIFF CMYK implementation.
+            // It must be TiffCmykScalar rather than CmykScalar so that this converter produces
+            // the same result as the TiffCmykVector128 and TiffCmykVector512 converters.
+            TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // Forward to the static routine, supplying this converter's sample maximum.
+            ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
+        }
+
+        /// <summary>
+        /// Converts planar RGB samples to CMYK using 256-bit vectors and writes the four component planes.
+        /// </summary>
+        /// <param name="values">The destination component planes (C, M, Y, K).</param>
+        /// <param name="maxValue">The value the RGB samples are subtracted from and the C, M, Y results are multiplied by.</param>
+        /// <param name="rLane">The red samples.</param>
+        /// <param name="gLane">The green samples.</param>
+        /// <param name="bLane">The blue samples.</param>
+        public static void ConvertFromRgb(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            ref Vector256<float> redBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(rLane));
+            ref Vector256<float> greenBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(gLane));
+            ref Vector256<float> blueBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(bLane));
+
+            ref Vector256<float> cyanBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector256<float> magentaBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector256<float> yellowBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector256<float> keyBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector256<float> max = Vector256.Create(maxValue);
+
+            nuint vectorCount = values.Component0.Vector256Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                Vector256<float> cyan = max - Unsafe.Add(ref redBase, index);
+                Vector256<float> magenta = max - Unsafe.Add(ref greenBase, index);
+                Vector256<float> yellow = max - Unsafe.Add(ref blueBase, index);
+                Vector256<float> key = Vector256.Min(cyan, Vector256.Min(magenta, yellow));
+
+                // Lanes where key == max divide by zero below; the mask forces those lanes to zero.
+                Vector256<float> notFullKey = ~Vector256.Equals(key, max);
+                Vector256<float> reciprocal = Vector256<float>.One / (max - key);
+
+                cyan = ((cyan - key) * reciprocal) & notFullKey;
+                magenta = ((magenta - key) * reciprocal) & notFullKey;
+                yellow = ((yellow - key) * reciprocal) & notFullKey;
+
+                Unsafe.Add(ref cyanBase, index) = cyan * max;
+                Unsafe.Add(ref magentaBase, index) = magenta * max;
+                Unsafe.Add(ref yellowBase, index) = yellow * max;
+                Unsafe.Add(ref keyBase, index) = key;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffCmykVector512.cs
@ -0,0 +1,108 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    internal sealed class TiffCmykVector512 : JpegColorConverterVector512
+    {
+        public TiffCmykVector512(int precision)
+            : base(JpegColorSpace.TiffCmyk, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+        {
+            // The ICC path is delegated to the scalar TIFF CMYK implementation.
+            TiffCmykScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        protected override void ConvertToRgbInPlaceVectorized(in ComponentValues values)
+        {
+            // View each component plane as a sequence of 512-bit float vectors.
+            ref Vector512<float> cyanBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector512<float> magentaBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector512<float> yellowBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector512<float> keyBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector512<float> one = Vector512<float>.One;
+
+            // Reciprocal of the sample maximum, used to normalize every sample.
+            Vector512<float> normalize = Vector512.Create(1 / this.MaximumValue);
+
+            nuint vectorCount = values.Component0.Vector512Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                // (1 - k) attenuates all three channels.
+                Vector512<float> keyFactor = one - (Unsafe.Add(ref keyBase, index) * normalize);
+
+                ref Vector512<float> cyan = ref Unsafe.Add(ref cyanBase, index);
+                cyan = (one - (cyan * normalize)) * keyFactor;
+
+                ref Vector512<float> magenta = ref Unsafe.Add(ref magentaBase, index);
+                magenta = (one - (magenta * normalize)) * keyFactor;
+
+                ref Vector512<float> yellow = ref Unsafe.Add(ref yellowBase, index);
+                yellow = (one - (yellow * normalize)) * keyFactor;
+            }
+        }
+
+        /// <inheritdoc/>
+        protected override void ConvertFromRgbVectorized(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // Forward to the static routine, supplying this converter's sample maximum.
+            ConvertFromRgbVectorized(in values, this.MaximumValue, rLane, gLane, bLane);
+        }
+
+        /// <inheritdoc/>
+        protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values)
+        {
+            // The remainder is handled by the scalar TIFF CMYK implementation.
+            TiffCmykScalar.ConvertToRgbInPlace(values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // The remainder is handled by the scalar TIFF CMYK implementation.
+            TiffCmykScalar.ConvertFromRgb(values, this.MaximumValue, rLane, gLane, bLane);
+        }
+
+        /// <summary>
+        /// Converts planar RGB samples to CMYK using 512-bit vectors and writes the four component planes.
+        /// </summary>
+        /// <param name="values">The destination component planes (C, M, Y, K).</param>
+        /// <param name="maxValue">The value the RGB samples are subtracted from and the C, M, Y results are multiplied by.</param>
+        /// <param name="rLane">The red samples.</param>
+        /// <param name="gLane">The green samples.</param>
+        /// <param name="bLane">The blue samples.</param>
+        internal static void ConvertFromRgbVectorized(in ComponentValues values, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            ref Vector512<float> redBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(rLane));
+            ref Vector512<float> greenBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(gLane));
+            ref Vector512<float> blueBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(bLane));
+
+            ref Vector512<float> cyanBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector512<float> magentaBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector512<float> yellowBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector512<float> keyBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector512<float> max = Vector512.Create(maxValue);
+
+            nuint vectorCount = values.Component0.Vector512Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                Vector512<float> cyan = max - Unsafe.Add(ref redBase, index);
+                Vector512<float> magenta = max - Unsafe.Add(ref greenBase, index);
+                Vector512<float> yellow = max - Unsafe.Add(ref blueBase, index);
+                Vector512<float> key = Vector512.Min(cyan, Vector512.Min(magenta, yellow));
+
+                // Lanes where key == max divide by zero below; the mask forces those lanes to zero.
+                Vector512<float> notFullKey = ~Vector512.Equals(key, max);
+                Vector512<float> reciprocal = Vector512<float>.One / (max - key);
+
+                cyan = ((cyan - key) * reciprocal) & notFullKey;
+                magenta = ((magenta - key) * reciprocal) & notFullKey;
+                yellow = ((yellow - key) * reciprocal) & notFullKey;
+
+                Unsafe.Add(ref cyanBase, index) = cyan * max;
+                Unsafe.Add(ref magentaBase, index) = magenta * max;
+                Unsafe.Add(ref yellowBase, index) = yellow * max;
+                Unsafe.Add(ref keyBase, index) = key;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKScalar.cs
@ -0,0 +1,153 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Buffers;
+using System.Numerics;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.ColorProfiles;
+using SixLabors.ImageSharp.ColorProfiles.Icc;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    /// <summary>
+    /// Color converter for tiff images, which use the jpeg compression and YccK colorspace.
+    /// </summary>
+    internal sealed class TiffYccKScalar : JpegColorConverterScalar
+    {
+        // Derived from ITU-T Rec. T.871
+        internal const float RCrMult = 1.402f;
+        internal const float GCbMult = (float)(0.114 * 1.772 / 0.587);
+        internal const float GCrMult = (float)(0.299 * 1.402 / 0.587);
+        internal const float BCbMult = 1.772f;
+
+        public TiffYccKScalar(int precision)
+            : base(JpegColorSpace.TiffYccK, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlace(in ComponentValues values)
+        {
+            // Forward to the static routine, supplying this converter's sample maximum and chroma offset.
+            ConvertToRgbInPlace(in values, this.MaximumValue, this.HalfValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+        {
+            // Forward to the static ICC routine, supplying this converter's sample maximum.
+            ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            // Note the argument order of the static overload: chroma offset first, then sample maximum.
+            ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
+        }
+
+        /// <summary>
+        /// Converts planar YccK samples to RGB, overwriting the first three component planes.
+        /// </summary>
+        /// <param name="values">The component planes. Planes 0-2 are read and overwritten; plane 3 is only read.</param>
+        /// <param name="maxValue">The divisor used to normalize every incoming sample.</param>
+        /// <param name="halfValue">The chroma offset, in the same units as <paramref name="maxValue"/>.</param>
+        public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue, float halfValue)
+        {
+            Span<float> lumaPlane = values.Component0;
+            Span<float> cbPlane = values.Component1;
+            Span<float> crPlane = values.Component2;
+            Span<float> keyPlane = values.Component3;
+
+            float normalize = 1F / maxValue;
+
+            // The chroma offset is normalized once so it can be subtracted from normalized samples.
+            float chromaOffset = halfValue * normalize;
+
+            int count = values.Component0.Length;
+            for (int index = 0; index < count; index++)
+            {
+                float luma = lumaPlane[index] * normalize;
+                float blueDiff = (cbPlane[index] * normalize) - chromaOffset;
+                float redDiff = (crPlane[index] * normalize) - chromaOffset;
+                float keyFactor = 1 - (keyPlane[index] * normalize);
+
+                // r = y + (1.402F * cr);
+                // g = y - (0.344136F * cb) - (0.714136F * cr);
+                // b = y + (1.772F * cb);
+                float red = (luma + (RCrMult * redDiff)) * keyFactor;
+                float green = (luma - (GCbMult * blueDiff) - (GCrMult * redDiff)) * keyFactor;
+                float blue = (luma + (BCbMult * blueDiff)) * keyFactor;
+
+                lumaPlane[index] = red;
+                cbPlane[index] = green;
+                crPlane[index] = blue;
+            }
+        }
+
+        /// <summary>
+        /// Converts planar RGB samples to YccK and writes the Y, Cb, Cr and K component planes.
+        /// </summary>
+        /// <param name="values">The destination component planes (Y, Cb, Cr, K).</param>
+        /// <param name="halfValue">The chroma offset added to the Cb and Cr results.</param>
+        /// <param name="maxValue">The value the normalized results are scaled to.</param>
+        /// <param name="rLane">The red samples; multiplied by 1/255 before conversion.</param>
+        /// <param name="gLane">The green samples; multiplied by 1/255 before conversion.</param>
+        /// <param name="bLane">The blue samples; multiplied by 1/255 before conversion.</param>
+        public static void ConvertFromRgb(in ComponentValues values, float halfValue, float maxValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            Span<float> y = values.Component0;
+            Span<float> cb = values.Component1;
+            Span<float> cr = values.Component2;
+            Span<float> k = values.Component3;
+
+            for (int i = 0; i < cr.Length; i++)
+            {
+                // Scale down to [0-1]
+                const float divisor = 1F / 255F;
+                float r = rLane[i] * divisor;
+                float g = gLane[i] * divisor;
+                float b = bLane[i] * divisor;
+
+                float ytmp;
+                float cbtmp;
+                float crtmp;
+                float ktmp = 1F - MathF.Max(r, MathF.Max(g, b));
+
+                if (ktmp >= 1F)
+                {
+                    // Pure black: luma is zero and chroma is neutral, i.e. exactly the chroma offset
+                    // in output units (not the normalized constant 0.5).
+                    ytmp = 0F;
+                    cbtmp = halfValue;
+                    crtmp = halfValue;
+                    ktmp = maxValue;
+                }
+                else
+                {
+                    // Remove the key contribution so the largest channel becomes 1.
+                    float kmask = 1F / (1F - ktmp);
+                    r *= kmask;
+                    g *= kmask;
+                    b *= kmask;
+
+                    // Scale to [0-maxValue]
+                    // y  =    0 + (0.299 * r) + (0.587 * g) + (0.114 * b)
+                    // cb = half - (0.168736 * r) - (0.331264 * g) + (0.5 * b)
+                    // cr = half + (0.5 * r) - (0.418688 * g) - (0.081312 * b)
+                    ytmp = ((0.299f * r) + (0.587f * g) + (0.114f * b)) * maxValue;
+                    cbtmp = halfValue + (((-0.168736f * r) - (0.331264f * g) + (0.5f * b)) * maxValue);
+                    crtmp = halfValue + (((0.5f * r) - (0.418688f * g) - (0.081312f * b)) * maxValue);
+                    ktmp *= maxValue;
+                }
+
+                y[i] = ytmp;
+                cb[i] = cbtmp;
+                cr[i] = crtmp;
+                k[i] = ktmp;
+            }
+        }
+
+        /// <summary>
+        /// Converts planar YccK samples to RGB through an ICC profile, overwriting the first three component planes.
+        /// </summary>
+        /// <param name="configuration">Supplies the memory allocator for the temporary interleaved buffer.</param>
+        /// <param name="profile">The source ICC profile applied to the intermediate CMYK values.</param>
+        /// <param name="values">The component planes; planes 0-2 receive the RGB result.</param>
+        /// <param name="maxValue">The normalization value passed to the interleaving helper.</param>
+        public static void ConvertToRgbInPlaceWithIcc(Configuration configuration, IccProfile profile, in ComponentValues values, float maxValue)
+        {
+            // Four floats per pixel: one for each of the four component planes.
+            int pixelCount = values.Component0.Length;
+            using (IMemoryOwner<float> buffer = configuration.MemoryAllocator.Allocate<float>(pixelCount * 4))
+            {
+                Span<float> interleaved = buffer.Memory.Span;
+
+                Span<float> plane0 = values.Component0;
+                Span<float> plane1 = values.Component1;
+                Span<float> plane2 = values.Component2;
+                Span<float> plane3 = values.Component3;
+
+                PackedNormalizeInterleave4(plane0, plane1, plane2, plane3, interleaved, maxValue);
+
+                ColorProfileConverter profileConverter = new();
+                Span<Cmyk> cmykPixels = MemoryMarshal.Cast<float, Cmyk>(interleaved);
+
+                // Step 1: YccK => Cmyk, in place over the same buffer.
+                // YccK is a JPEG-specific encoding rather than a defined ICC color space, while ICC
+                // profiles expect colorimetric CMYK values, so a fixed transform (Rec.601 YCbCr
+                // coefficients, inverted K channel) is applied first. This step does not depend on
+                // the embedded ICC profile, and because the same RGB working space is used for the
+                // conversion to and from XYZ, colorimetric accuracy is preserved.
+                Span<YccK> yccKPixels = MemoryMarshal.Cast<Cmyk, YccK>(cmykPixels);
+                profileConverter.Convert<YccK, Cmyk>(yccKPixels, cmykPixels);
+
+                // Step 2: Cmyk => Rgb through the ICC profiles, again over the same buffer.
+                Span<Rgb> rgbPixels = MemoryMarshal.Cast<float, Rgb>(interleaved).Slice(0, cmykPixels.Length);
+
+                ColorConversionOptions conversionOptions = new()
+                {
+                    SourceIccProfile = profile,
+                    TargetIccProfile = CompactSrgbV4Profile.Profile,
+                };
+                profileConverter = new(conversionOptions);
+                profileConverter.Convert<Cmyk, Rgb>(cmykPixels, rgbPixels);
+
+                Span<Vector3> rgbVectors = MemoryMarshal.Cast<float, Vector3>(interleaved).Slice(0, cmykPixels.Length);
+                UnpackDeinterleave3(rgbVectors, plane0, plane1, plane2);
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector128.cs
@ -0,0 +1,131 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Common.Helpers;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    internal sealed class TiffYccKVector128 : JpegColorConverterVector128
+    {
+        public TiffYccKVector128(int precision)
+            : base(JpegColorSpace.TiffYccK, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlace(in ComponentValues values)
+        {
+            // View each component plane as a sequence of 128-bit float vectors.
+            ref Vector128<float> lumaBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector128<float> cbBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector128<float> crBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector128<float> keyBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector128<float> one = Vector128<float>.One;
+            Vector128<float> normalize = Vector128.Create(1F / this.MaximumValue);
+
+            // The chroma offset is normalized once so it can be subtracted from normalized samples.
+            Vector128<float> normalizedHalf = Vector128.Create(this.HalfValue) * normalize;
+
+            // The green multipliers are negated so every term can be accumulated with MultiplyAdd.
+            Vector128<float> redFromCr = Vector128.Create(YCbCrScalar.RCrMult);
+            Vector128<float> greenFromCb = Vector128.Create(-YCbCrScalar.GCbMult);
+            Vector128<float> greenFromCr = Vector128.Create(-YCbCrScalar.GCrMult);
+            Vector128<float> blueFromCb = Vector128.Create(YCbCrScalar.BCbMult);
+
+            nuint vectorCount = values.Component0.Vector128Count<float>();
+            for (nuint index = 0; index < vectorCount; index++)
+            {
+                ref Vector128<float> plane0 = ref Unsafe.Add(ref lumaBase, index);
+                ref Vector128<float> plane1 = ref Unsafe.Add(ref cbBase, index);
+                ref Vector128<float> plane2 = ref Unsafe.Add(ref crBase, index);
+                ref Vector128<float> plane3 = ref Unsafe.Add(ref keyBase, index);
+
+                Vector128<float> luma = plane0 * normalize;
+                Vector128<float> blueDiff = (plane1 * normalize) - normalizedHalf;
+                Vector128<float> redDiff = (plane2 * normalize) - normalizedHalf;
+                Vector128<float> keyFactor = one - (plane3 * normalize);
+
+                // r = y + (1.402F * cr);
+                // g = y - (0.344136F * cb) - (0.714136F * cr);
+                // b = y + (1.772F * cb);
+                Vector128<float> greenPartial = Vector128_.MultiplyAdd(luma, blueDiff, greenFromCb);
+
+                Vector128<float> red = Vector128_.MultiplyAdd(luma, redDiff, redFromCr) * keyFactor;
+                Vector128<float> green = Vector128_.MultiplyAdd(greenPartial, redDiff, greenFromCr) * keyFactor;
+                Vector128<float> blue = Vector128_.MultiplyAdd(luma, blueDiff, blueFromCb) * keyFactor;
+
+                plane0 = red;
+                plane1 = green;
+                plane2 = blue;
+            }
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+        {
+            // The ICC path is delegated to the scalar TIFF YccK implementation.
+            TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            ref Vector128<float> srcR =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(rLane));
+            ref Vector128<float> srcG =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(gLane));
+            ref Vector128<float> srcB =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(bLane));
+
+            ref Vector128<float> destY =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector128<float> destCb =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector128<float> destCr =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector128<float> destK =
+                ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            // Reciprocal of 255, used to scale the incoming RGB samples down before conversion.
+            Vector128<float> sourceScale = Vector128.Create(1 / 255F);
+            Vector128<float> maxSampleValue = Vector128.Create(this.MaximumValue);
+            Vector128<float> chromaOffset = Vector128.Create(this.HalfValue);
+
+            Vector128<float> f0299 = Vector128.Create(0.299f);
+            Vector128<float> f0587 = Vector128.Create(0.587f);
+            Vector128<float> f0114 = Vector128.Create(0.114f);
+            Vector128<float> fn0168736 = Vector128.Create(-0.168736f);
+            Vector128<float> fn0331264 = Vector128.Create(-0.331264f);
+            Vector128<float> fn0418688 = Vector128.Create(-0.418688f);
+            Vector128<float> fn0081312F = Vector128.Create(-0.081312F);
+            Vector128<float> f05 = Vector128.Create(0.5f);
+
+            nuint n = values.Component0.Vector128Count<float>();
+            for (nuint i = 0; i < n; i++)
+            {
+                Vector128<float> r = Unsafe.Add(ref srcR, i) * sourceScale;
+                Vector128<float> g = Unsafe.Add(ref srcG, i) * sourceScale;
+                Vector128<float> b = Unsafe.Add(ref srcB, i) * sourceScale;
+
+                // k = 1 - max(r, g, b); all three channels must take part in the maximum.
+                Vector128<float> ktmp = Vector128<float>.One - Vector128.Max(r, Vector128.Max(g, b));
+
+                // Lanes where k == 1 are pure black: the division below is by zero there,
+                // and the mask forces r, g and b to zero in those lanes.
+                Vector128<float> kMask = ~Vector128.Equals(ktmp, Vector128<float>.One);
+                Vector128<float> divisor = Vector128<float>.One / (Vector128<float>.One - ktmp);
+
+                r = (r * divisor) & kMask;
+                g = (g * divisor) & kMask;
+                b = (b * divisor) & kMask;
+
+                // Normalized values; the chroma offset is applied once, after scaling, when storing.
+                // y  = (0.299 * r) + (0.587 * g) + (0.114 * b)
+                // cb = -(0.168736 * r) - (0.331264 * g) + (0.5 * b)
+                // cr = (0.5 * r) - (0.418688 * g) - (0.081312 * b)
+                Vector128<float> y = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
+                Vector128<float> cb = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r);
+                Vector128<float> cr = Vector128_.MultiplyAdd(Vector128_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r);
+
+                Unsafe.Add(ref destY, i) = y * maxSampleValue;
+                Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue);
+                Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue);
+                Unsafe.Add(ref destK, i) = ktmp * maxSampleValue;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector256.cs
@ -0,0 +1,131 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Common.Helpers;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    /// <summary>
+    /// <see cref="Vector256{T}"/> based converter for the <see cref="JpegColorSpace.TiffYccK"/> color space.
+    /// </summary>
+    internal sealed class TiffYccKVector256 : JpegColorConverterVector256
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="TiffYccKVector256"/> class.
+        /// </summary>
+        /// <param name="precision">The sample precision that is forwarded to the base converter.</param>
+        public TiffYccKVector256(int precision)
+            : base(JpegColorSpace.TiffYccK, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlace(in ComponentValues values)
+        {
+            ref Vector256<float> c0Base =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector256<float> c1Base =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector256<float> c2Base =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector256<float> c3Base =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            // Samples are normalized by the maximum value; the chroma offset is normalized the same way.
+            Vector256<float> scale = Vector256.Create(1F / this.MaximumValue);
+            Vector256<float> chromaOffset = Vector256.Create(this.HalfValue) * scale;
+            Vector256<float> rCrMult = Vector256.Create(YCbCrScalar.RCrMult);
+            Vector256<float> gCbMult = Vector256.Create(-YCbCrScalar.GCbMult);
+            Vector256<float> gCrMult = Vector256.Create(-YCbCrScalar.GCrMult);
+            Vector256<float> bCbMult = Vector256.Create(YCbCrScalar.BCbMult);
+
+            // Only whole vectors are processed here.
+            // NOTE(review): presumably the caller pads the component spans to a multiple of the vector size - confirm.
+            nuint n = values.Component0.Vector256Count<float>();
+            for (nuint i = 0; i < n; i++)
+            {
+                ref Vector256<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                ref Vector256<float> c1 = ref Unsafe.Add(ref c1Base, i);
+                ref Vector256<float> c2 = ref Unsafe.Add(ref c2Base, i);
+                ref Vector256<float> c3 = ref Unsafe.Add(ref c3Base, i);
+
+                Vector256<float> y = c0 * scale;
+                Vector256<float> cb = (c1 * scale) - chromaOffset;
+                Vector256<float> cr = (c2 * scale) - chromaOffset;
+
+                // Component3 (K) is only read: (1 - k) attenuates the decoded RGB values.
+                Vector256<float> scaledK = Vector256<float>.One - (c3 * scale);
+
+                // r = y + (1.402F * cr);
+                // g = y - (0.344136F * cb) - (0.714136F * cr);
+                // b = y + (1.772F * cb);
+                Vector256<float> r = Vector256_.MultiplyAdd(y, cr, rCrMult) * scaledK;
+                Vector256<float> g = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK;
+                Vector256<float> b = Vector256_.MultiplyAdd(y, cb, bCbMult) * scaledK;
+
+                c0 = r;
+                c1 = g;
+                c2 = b;
+            }
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+            => TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+
+        /// <inheritdoc/>
+        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            ref Vector256<float> srcR =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(rLane));
+            ref Vector256<float> srcG =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(gLane));
+            ref Vector256<float> srcB =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(bLane));
+
+            ref Vector256<float> destY =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector256<float> destCb =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector256<float> destCr =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector256<float> destK =
+                ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector256<float> maxSourceValue = Vector256.Create(255F);
+            Vector256<float> maxSampleValue = Vector256.Create(this.MaximumValue);
+            Vector256<float> chromaOffset = Vector256.Create(this.HalfValue);
+
+            Vector256<float> f0299 = Vector256.Create(0.299f);
+            Vector256<float> f0587 = Vector256.Create(0.587f);
+            Vector256<float> f0114 = Vector256.Create(0.114f);
+            Vector256<float> fn0168736 = Vector256.Create(-0.168736f);
+            Vector256<float> fn0331264 = Vector256.Create(-0.331264f);
+            Vector256<float> fn0418688 = Vector256.Create(-0.418688f);
+            Vector256<float> fn0081312F = Vector256.Create(-0.081312F);
+            Vector256<float> f05 = Vector256.Create(0.5f);
+
+            nuint n = values.Component0.Vector256Count<float>();
+            for (nuint i = 0; i < n; i++)
+            {
+                // Scale the source lanes by 1/255.
+                Vector256<float> r = Unsafe.Add(ref srcR, i) / maxSourceValue;
+                Vector256<float> g = Unsafe.Add(ref srcG, i) / maxSourceValue;
+                Vector256<float> b = Unsafe.Add(ref srcB, i) / maxSourceValue;
+
+                // k = 1 - max(r, g, b). This is the exact inverse of the (1 - k) scaling in ConvertToRgbInPlace.
+                Vector256<float> ktmp = Vector256<float>.One - Vector256.Max(r, Vector256.Max(g, b));
+
+                // Lanes with k == 1 (pure black) would divide by zero, mask them to zero instead.
+                Vector256<float> kMask = ~Vector256.Equals(ktmp, Vector256<float>.One);
+                Vector256<float> divisor = Vector256<float>.One / (Vector256<float>.One - ktmp);
+
+                r = (r * divisor) & kMask;
+                g = (g * divisor) & kMask;
+                b = (b * divisor) & kMask;
+
+                // With max = maxSampleValue and half = chromaOffset:
+                // y  = max * ((0.299 * r) + (0.587 * g) + (0.114 * b))
+                // cb = half + max * (-(0.168736 * r) - (0.331264 * g) + (0.5 * b))
+                // cr = half + max * ((0.5 * r) - (0.418688 * g) - (0.081312 * b))
+                // The chroma offset is added exactly once, after scaling to the sample range.
+                Vector256<float> y = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
+                Vector256<float> cb = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r);
+                Vector256<float> cr = Vector256_.MultiplyAdd(Vector256_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r);
+
+                Unsafe.Add(ref destY, i) = y * maxSampleValue;
+                Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue);
+                Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue);
+                Unsafe.Add(ref destK, i) = ktmp * maxSampleValue;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.TiffYccKVector512.cs
@ -0,0 +1,142 @@
+// Copyright (c) Six Labors.
+// Licensed under the Six Labors Split License.
+
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using SixLabors.ImageSharp.Common.Helpers;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
+
+namespace SixLabors.ImageSharp.Formats.Jpeg.Components;
+
+internal abstract partial class JpegColorConverterBase
+{
+    /// <summary>
+    /// <see cref="Vector512{T}"/> based converter for the <see cref="JpegColorSpace.TiffYccK"/> color space.
+    /// </summary>
+    internal sealed class TiffYccKVector512 : JpegColorConverterVector512
+    {
+        /// <summary>
+        /// Initializes a new instance of the <see cref="TiffYccKVector512"/> class.
+        /// </summary>
+        /// <param name="precision">The sample precision that is forwarded to the base converter.</param>
+        public TiffYccKVector512(int precision)
+            : base(JpegColorSpace.TiffYccK, precision)
+        {
+        }
+
+        /// <inheritdoc/>
+        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
+            => TiffYccKScalar.ConvertToRgbInPlaceWithIcc(configuration, profile, values, this.MaximumValue);
+
+        /// <inheritdoc/>
+        protected override void ConvertToRgbInPlaceVectorized(in ComponentValues values)
+        {
+            ref Vector512<float> c0Base =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector512<float> c1Base =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector512<float> c2Base =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector512<float> c3Base =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            // Samples are normalized by the maximum value; the chroma offset is normalized the same way.
+            Vector512<float> scale = Vector512.Create(1F / this.MaximumValue);
+            Vector512<float> chromaOffset = Vector512.Create(this.HalfValue) * scale;
+            Vector512<float> rCrMult = Vector512.Create(YCbCrScalar.RCrMult);
+            Vector512<float> gCbMult = Vector512.Create(-YCbCrScalar.GCbMult);
+            Vector512<float> gCrMult = Vector512.Create(-YCbCrScalar.GCrMult);
+            Vector512<float> bCbMult = Vector512.Create(YCbCrScalar.BCbMult);
+
+            nuint n = values.Component0.Vector512Count<float>();
+            for (nuint i = 0; i < n; i++)
+            {
+                ref Vector512<float> c0 = ref Unsafe.Add(ref c0Base, i);
+                ref Vector512<float> c1 = ref Unsafe.Add(ref c1Base, i);
+                ref Vector512<float> c2 = ref Unsafe.Add(ref c2Base, i);
+                ref Vector512<float> c3 = ref Unsafe.Add(ref c3Base, i);
+
+                Vector512<float> y = c0 * scale;
+                Vector512<float> cb = (c1 * scale) - chromaOffset;
+                Vector512<float> cr = (c2 * scale) - chromaOffset;
+
+                // Component3 (K) is only read: (1 - k) attenuates the decoded RGB values.
+                Vector512<float> scaledK = Vector512<float>.One - (c3 * scale);
+
+                // r = y + (1.402F * cr);
+                // g = y - (0.344136F * cb) - (0.714136F * cr);
+                // b = y + (1.772F * cb);
+                Vector512<float> r = Vector512_.MultiplyAdd(y, cr, rCrMult) * scaledK;
+                Vector512<float> g = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(y, cb, gCbMult), cr, gCrMult) * scaledK;
+                Vector512<float> b = Vector512_.MultiplyAdd(y, cb, bCbMult) * scaledK;
+
+                c0 = r;
+                c1 = g;
+                c2 = b;
+            }
+        }
+
+        /// <inheritdoc/>
+        protected override void ConvertFromRgbVectorized(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+            => ConvertFromRgbVectorized(in values, this.MaximumValue, this.HalfValue, rLane, gLane, bLane);
+
+        /// <inheritdoc/>
+        protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values)
+             => TiffYccKScalar.ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue);
+
+        /// <inheritdoc/>
+        protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+            => TiffYccKScalar.ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
+
+        /// <summary>
+        /// Converts planar RGB lanes to Y, Cb, Cr and K components, one <see cref="Vector512{T}"/> at a time.
+        /// Only whole vectors of <see cref="ComponentValues.Component0"/> are processed.
+        /// </summary>
+        /// <param name="values">The destination components (Y, Cb, Cr, K).</param>
+        /// <param name="maxValue">The maximum sample value the results are scaled to.</param>
+        /// <param name="halfValue">The offset added to the scaled Cb and Cr results.</param>
+        /// <param name="rLane">The red source lane.</param>
+        /// <param name="gLane">The green source lane.</param>
+        /// <param name="bLane">The blue source lane.</param>
+        /// <remarks>
+        /// NOTE(review): TiffYccKScalar.ConvertFromRgb handles the remainder and is not visible from here;
+        /// confirm it uses the same k = 1 - max(r, g, b) and single chroma offset.
+        /// </remarks>
+        internal static void ConvertFromRgbVectorized(in ComponentValues values, float maxValue, float halfValue, Span<float> rLane, Span<float> gLane, Span<float> bLane)
+        {
+            ref Vector512<float> srcR =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(rLane));
+            ref Vector512<float> srcG =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(gLane));
+            ref Vector512<float> srcB =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(bLane));
+
+            ref Vector512<float> destY =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component0));
+            ref Vector512<float> destCb =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component1));
+            ref Vector512<float> destCr =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component2));
+            ref Vector512<float> destK =
+                ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(values.Component3));
+
+            Vector512<float> maxSourceValue = Vector512.Create(255F);
+            Vector512<float> maxSampleValue = Vector512.Create(maxValue);
+            Vector512<float> chromaOffset = Vector512.Create(halfValue);
+
+            Vector512<float> f0299 = Vector512.Create(0.299f);
+            Vector512<float> f0587 = Vector512.Create(0.587f);
+            Vector512<float> f0114 = Vector512.Create(0.114f);
+            Vector512<float> fn0168736 = Vector512.Create(-0.168736f);
+            Vector512<float> fn0331264 = Vector512.Create(-0.331264f);
+            Vector512<float> fn0418688 = Vector512.Create(-0.418688f);
+            Vector512<float> fn0081312F = Vector512.Create(-0.081312F);
+            Vector512<float> f05 = Vector512.Create(0.5f);
+
+            nuint n = values.Component0.Vector512Count<float>();
+            for (nuint i = 0; i < n; i++)
+            {
+                // Scale the source lanes by 1/255.
+                Vector512<float> r = Unsafe.Add(ref srcR, i) / maxSourceValue;
+                Vector512<float> g = Unsafe.Add(ref srcG, i) / maxSourceValue;
+                Vector512<float> b = Unsafe.Add(ref srcB, i) / maxSourceValue;
+
+                // k = 1 - max(r, g, b). This is the exact inverse of the (1 - k) scaling in ConvertToRgbInPlaceVectorized.
+                Vector512<float> ktmp = Vector512<float>.One - Vector512.Max(r, Vector512.Max(g, b));
+
+                // Lanes with k == 1 (pure black) would divide by zero, mask them to zero instead.
+                Vector512<float> kMask = ~Vector512.Equals(ktmp, Vector512<float>.One);
+                Vector512<float> divisor = Vector512<float>.One / (Vector512<float>.One - ktmp);
+
+                r = (r * divisor) & kMask;
+                g = (g * divisor) & kMask;
+                b = (b * divisor) & kMask;
+
+                // With max = maxSampleValue and half = chromaOffset:
+                // y  = max * ((0.299 * r) + (0.587 * g) + (0.114 * b))
+                // cb = half + max * (-(0.168736 * r) - (0.331264 * g) + (0.5 * b))
+                // cr = half + max * ((0.5 * r) - (0.418688 * g) - (0.081312 * b))
+                // The chroma offset is added exactly once, after scaling to the sample range.
+                Vector512<float> y = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(f0114 * b, f0587, g), f0299, r);
+                Vector512<float> cb = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(f05 * b, fn0331264, g), fn0168736, r);
+                Vector512<float> cr = Vector512_.MultiplyAdd(Vector512_.MultiplyAdd(fn0081312F * b, fn0418688, g), f05, r);
+
+                Unsafe.Add(ref destY, i) = y * maxSampleValue;
+                Unsafe.Add(ref destCb, i) = chromaOffset + (cb * maxSampleValue);
+                Unsafe.Add(ref destCr, i) = chromaOffset + (cr * maxSampleValue);
+                Unsafe.Add(ref destK, i) = ktmp * maxSampleValue;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKScalar.cs
@ -14,7 +14,7 @@ internal abstract partial class JpegColorConverterBase
 {
    internal sealed class YccKScalar : JpegColorConverterScalar
    {
-        // derived from ITU-T Rec. T.871
+        // Derived from ITU-T Rec. T.871
        internal const float RCrMult = 1.402f;
        internal const float GCbMult = (float)(0.114 * 1.772 / 0.587);
        internal const float GCrMult = (float)(0.299 * 1.402 / 0.587);
@ -27,7 +27,7 @@ internal abstract partial class JpegColorConverterBase

        /// <inheritdoc/>
        public override void ConvertToRgbInPlace(in ComponentValues values)
-            => ConvertToRgpInPlace(values, this.MaximumValue, this.HalfValue);
+            => ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue);

        /// <inheritdoc/>
        public override void ConvertToRgbInPlaceWithIcc(Configuration configuration, in ComponentValues values, IccProfile profile)
@ -37,7 +37,7 @@ internal abstract partial class JpegColorConverterBase
        public override void ConvertFromRgb(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
            => ConvertFromRgb(values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);

-        public static void ConvertToRgpInPlace(in ComponentValues values, float maxValue, float halfValue)
+        public static void ConvertToRgbInPlace(in ComponentValues values, float maxValue, float halfValue)
        {
            Span<float> c0 = values.Component0;
            Span<float> c1 = values.Component1;
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverter.YccKVector512.cs
@ -82,7 +82,7 @@ internal abstract partial class JpegColorConverterBase

        /// <inheritdoc/>
        protected override void ConvertToRgbInPlaceScalarRemainder(in ComponentValues values)
-            => YccKScalar.ConvertToRgpInPlace(values, this.MaximumValue, this.HalfValue);
+            => YccKScalar.ConvertToRgbInPlace(values, this.MaximumValue, this.HalfValue);

        /// <inheritdoc/>
        protected override void ConvertFromRgbVectorized(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
@ -138,12 +138,6 @@ internal abstract partial class JpegColorConverterBase

        /// <inheritdoc/>
        protected override void ConvertFromRgbScalarRemainder(in ComponentValues values, Span<float> rLane, Span<float> gLane, Span<float> bLane)
-        {
-            // rgb -> cmyk
-            CmykScalar.ConvertFromRgb(in values, this.MaximumValue, rLane, gLane, bLane);
-
-            // cmyk -> ycck
-            YccKScalar.ConvertFromRgb(in values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
-        }
+            => YccKScalar.ConvertFromRgb(in values, this.HalfValue, this.MaximumValue, rLane, gLane, bLane);
    }
 }
--- a/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/ColorConverters/JpegColorConverterBase.cs
@ -153,6 +153,39 @@ internal abstract partial class JpegColorConverterBase
        }
    }

+    /// <summary>
+    /// Scales four planar lanes by <c>1 / maxValue</c> and interleaves them into
+    /// <paramref name="packed"/> as consecutive x, y, z, w quadruples.
+    /// </summary>
+    /// <param name="xLane">The lane written to offset 0 of each quadruple.</param>
+    /// <param name="yLane">The lane written to offset 1 of each quadruple.</param>
+    /// <param name="zLane">The lane written to offset 2 of each quadruple.</param>
+    /// <param name="wLane">The lane written to offset 3 of each quadruple.</param>
+    /// <param name="packed">The destination buffer; its length must be divisible by 4.</param>
+    /// <param name="maxValue">The value the lane samples are divided by.</param>
+    public static void PackedNormalizeInterleave4(
+        ReadOnlySpan<float> xLane,
+        ReadOnlySpan<float> yLane,
+        ReadOnlySpan<float> zLane,
+        ReadOnlySpan<float> wLane,
+        Span<float> packed,
+        float maxValue)
+    {
+        DebugGuard.IsTrue(packed.Length % 4 == 0, "Packed length must be divisible by 4.");
+        DebugGuard.IsTrue(yLane.Length == xLane.Length, nameof(yLane), "Channels must be of same size!");
+        DebugGuard.IsTrue(zLane.Length == xLane.Length, nameof(zLane), "Channels must be of same size!");
+        DebugGuard.IsTrue(wLane.Length == xLane.Length, nameof(wLane), "Channels must be of same size!");
+        DebugGuard.MustBeLessThanOrEqualTo(packed.Length / 4, xLane.Length, nameof(packed));
+
+        float scale = 1F / maxValue;
+
+        // The guard above allows 'packed' to hold fewer quadruples than the lanes have samples.
+        // The loop below uses unchecked references, so it is bounded by the smaller of the two
+        // counts to never write past the end of 'packed' nor read past the end of the lanes.
+        int quadCapacity = packed.Length / 4;
+        nuint count = (nuint)(quadCapacity < xLane.Length ? quadCapacity : xLane.Length);
+
+        // TODO: Investigate SIMD version of this.
+        ref float xLaneRef = ref MemoryMarshal.GetReference(xLane);
+        ref float yLaneRef = ref MemoryMarshal.GetReference(yLane);
+        ref float zLaneRef = ref MemoryMarshal.GetReference(zLane);
+        ref float wLaneRef = ref MemoryMarshal.GetReference(wLane);
+        ref float packedRef = ref MemoryMarshal.GetReference(packed);
+
+        for (nuint i = 0; i < count; i++)
+        {
+            nuint baseIdx = i * 4;
+            Unsafe.Add(ref packedRef, baseIdx) = Unsafe.Add(ref xLaneRef, i) * scale;
+            Unsafe.Add(ref packedRef, baseIdx + 1) = Unsafe.Add(ref yLaneRef, i) * scale;
+            Unsafe.Add(ref packedRef, baseIdx + 2) = Unsafe.Add(ref zLaneRef, i) * scale;
+            Unsafe.Add(ref packedRef, baseIdx + 3) = Unsafe.Add(ref wLaneRef, i) * scale;
+        }
+    }
+
    public static void PackedInvertNormalizeInterleave4(
        ReadOnlySpan<float> xLane,
        ReadOnlySpan<float> yLane,
@ -198,6 +231,8 @@ internal abstract partial class JpegColorConverterBase
            GetCmykConverter(8),
            GetGrayScaleConverter(8),
            GetRgbConverter(8),
+            GetTiffCmykConverter(8),
+            GetTiffYccKConverter(8),

            // 12-bit converters
            GetYCbCrConverter(12),
@ -205,6 +240,8 @@ internal abstract partial class JpegColorConverterBase
            GetCmykConverter(12),
            GetGrayScaleConverter(12),
            GetRgbConverter(12),
+            GetTiffCmykConverter(12),
+            GetTiffYccKConverter(12),
        ];

    /// <summary>
@ -327,6 +364,46 @@ internal abstract partial class JpegColorConverterBase
        return new RgbScalar(precision);
    }

+    /// <summary>
+    /// Creates the TiffCmyk converter for the given precision, probing the 512, 256 and 128 bit
+    /// vector implementations in that order and falling back to the scalar one.
+    /// </summary>
+    /// <param name="precision">The precision passed to the converter constructor.</param>
+    /// <returns>The selected <see cref="JpegColorConverterBase"/>.</returns>
+    private static JpegColorConverterBase GetTiffCmykConverter(int precision)
+    {
+        JpegColorConverterBase converter;
+        if (JpegColorConverterVector512.IsSupported)
+        {
+            converter = new TiffCmykVector512(precision);
+        }
+        else if (JpegColorConverterVector256.IsSupported)
+        {
+            converter = new TiffCmykVector256(precision);
+        }
+        else if (JpegColorConverterVector128.IsSupported)
+        {
+            converter = new TiffCmykVector128(precision);
+        }
+        else
+        {
+            converter = new TiffCmykScalar(precision);
+        }
+
+        return converter;
+    }
+
+    /// <summary>
+    /// Creates the TiffYccK converter for the given precision, probing the 512, 256 and 128 bit
+    /// vector implementations in that order and falling back to the scalar one.
+    /// </summary>
+    /// <param name="precision">The precision passed to the converter constructor.</param>
+    /// <returns>The selected <see cref="JpegColorConverterBase"/>.</returns>
+    private static JpegColorConverterBase GetTiffYccKConverter(int precision)
+    {
+        JpegColorConverterBase converter;
+        if (JpegColorConverterVector512.IsSupported)
+        {
+            converter = new TiffYccKVector512(precision);
+        }
+        else if (JpegColorConverterVector256.IsSupported)
+        {
+            converter = new TiffYccKVector256(precision);
+        }
+        else if (JpegColorConverterVector128.IsSupported)
+        {
+            converter = new TiffYccKVector128(precision);
+        }
+        else
+        {
+            converter = new TiffYccKScalar(precision);
+        }
+
+        return converter;
+    }
+
    /// <summary>
    /// A stack-only struct to reference the input buffers using <see cref="ReadOnlySpan{T}"/>-s.
    /// </summary>
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/SpectralConverter.cs
@ -83,7 +83,8 @@ internal abstract class SpectralConverter
    /// <param name="frame">The jpeg frame with the color space to convert to.</param>
    /// <param name="jpegData">The raw JPEG data.</param>
    /// <returns>The color converter.</returns>
-    protected virtual JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData) => JpegColorConverterBase.GetConverter(jpegData.ColorSpace, frame.Precision);
+    protected virtual JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData)
+        => JpegColorConverterBase.GetConverter(jpegData.ColorSpace, frame.Precision);

    /// <summary>
    /// Calculates image size with optional scaling.
--- a/src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Encoder/SpectralConverter{TPixel}.cs
@ -2,7 +2,6 @@
 // Licensed under the Six Labors Split License.

 using System.Buffers;
-using SixLabors.ImageSharp.Advanced;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -109,6 +108,8 @@ internal class SpectralConverter<TPixel> : SpectralConverter, IDisposable
            int y = yy - this.pixelRowCounter;

            // Unpack TPixel to r/g/b planes
+            // TODO: The individual implementation code would be much easier here if
+            // we scaled to [0-1] before passing to the individual converters.
            int srcIndex = Math.Min(yy, pixelBufferLastVerticalIndex);
            Span<TPixel> sourceRow = this.pixelBuffer.DangerousGetRowSpan(srcIndex);
            PixelOperations<TPixel>.Instance.UnpackIntoRgbPlanes(rLane, gLane, bLane, sourceRow);
--- a/src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/JpegColorSpace.cs
@ -23,6 +23,16 @@ internal enum JpegColorSpace
    /// </summary>
    Cmyk,

+    /// <summary>
+    /// YccK color space with 4 components, used by TIFF images that use JPEG compression.
+    /// </summary>
+    TiffYccK,
+
+    /// <summary>
+    /// Cmyk color space with 4 components, used by TIFF images that use JPEG compression.
+    /// </summary>
+    TiffCmyk,
+
    /// <summary>
    /// Color space with 3 components.
    /// </summary>
--- a/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
+++ b/src/ImageSharp/Formats/Jpeg/JpegDecoderCore.cs
@ -115,12 +115,14 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
    /// Initializes a new instance of the <see cref="JpegDecoderCore"/> class.
    /// </summary>
    /// <param name="options">The decoder options.</param>
-    public JpegDecoderCore(JpegDecoderOptions options)
+    /// <param name="iccProfile">The ICC profile to use for color conversion.</param>
+    public JpegDecoderCore(JpegDecoderOptions options, IccProfile iccProfile = null)
        : base(options.GeneralOptions)
    {
        this.resizeMode = options.ResizeMode;
        this.configuration = options.GeneralOptions.Configuration;
        this.skipMetadata = options.GeneralOptions.SkipMetadata;
+        this.SetIccMetadata(iccProfile);
    }

    /// <summary>
@ -231,7 +233,7 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
    /// <param name="scanDecoder">The scan decoder.</param>
    public void LoadTables(byte[] tableBytes, IJpegScanDecoder scanDecoder)
    {
-        this.Metadata = new ImageMetadata();
+        this.Metadata ??= new ImageMetadata();
        this.QuantizationTables = new Block8x8F[4];
        this.scanDecoder = scanDecoder;
        if (tableBytes.Length < 4)
@ -314,7 +316,7 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData

        this.scanDecoder ??= new HuffmanScanDecoder(stream, spectralConverter, cancellationToken);

-        this.Metadata = new ImageMetadata();
+        this.Metadata ??= new ImageMetadata();

        Span<byte> markerBuffer = stackalloc byte[2];

@ -678,6 +680,16 @@ internal sealed class JpegDecoderCore : ImageDecoderCore, IRawJpegData
        }
    }

+    /// <summary>
+    /// Stores the given ICC profile in the metadata and flags the decoder as having an ICC profile.
+    /// Nothing happens when metadata is skipped, the profile is null, or the profile is not valid.
+    /// </summary>
+    /// <param name="profile">The ICC profile to store; may be null.</param>
+    private void SetIccMetadata(IccProfile profile)
+    {
+        if (this.skipMetadata || profile is null || !profile.CheckIsValid())
+        {
+            return;
+        }
+
+        this.hasIcc = true;
+        this.Metadata ??= new ImageMetadata();
+        this.Metadata.IccProfile = profile;
+    }
+
    /// <summary>
    /// Initializes the IPTC profile.
    /// </summary>
--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/JpegTiffCompression.cs
@ -6,6 +6,7 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder;
 using SixLabors.ImageSharp.Formats.Tiff.Constants;
 using SixLabors.ImageSharp.IO;
 using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.Metadata.Profiles.Icc;
 using SixLabors.ImageSharp.PixelFormats;

@ -22,6 +23,8 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor

    private readonly TiffPhotometricInterpretation photometricInterpretation;

+    private readonly ImageFrameMetadata metadata;
+
    /// <summary>
    /// Initializes a new instance of the <see cref="JpegTiffCompression"/> class.
    /// </summary>
@ -29,6 +32,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
    /// <param name="memoryAllocator">The memoryAllocator to use for buffer allocations.</param>
    /// <param name="width">The image width.</param>
    /// <param name="bitsPerPixel">The bits per pixel.</param>
+    /// <param name="metadata">The image frame metadata.</param>
    /// <param name="jpegTables">The JPEG tables containing the quantization and/or Huffman tables.</param>
    /// <param name="photometricInterpretation">The photometric interpretation.</param>
    public JpegTiffCompression(
@ -36,11 +40,13 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor
        MemoryAllocator memoryAllocator,
        int width,
        int bitsPerPixel,
+        ImageFrameMetadata metadata,
        byte[] jpegTables,
        TiffPhotometricInterpretation photometricInterpretation)
        : base(memoryAllocator, width, bitsPerPixel)
    {
        this.options = options;
+        this.metadata = metadata;
        this.jpegTables = jpegTables;
        this.photometricInterpretation = photometricInterpretation;
    }
@ -61,7 +67,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor

    private void DecodeJpegData(BufferedReadStream stream, Span<byte> buffer, CancellationToken cancellationToken)
    {
-        using JpegDecoderCore jpegDecoder = new(this.options);
+        using JpegDecoderCore jpegDecoder = new(this.options, this.metadata.IccProfile);
        Configuration configuration = this.options.GeneralOptions.Configuration;
        switch (this.photometricInterpretation)
        {
@ -85,6 +91,7 @@ internal sealed class JpegTiffCompression : TiffBaseDecompressor

            case TiffPhotometricInterpretation.YCbCr:
            case TiffPhotometricInterpretation.Rgb:
+            case TiffPhotometricInterpretation.Separated:
            {
                using SpectralConverter<Rgb24> spectralConverter = new TiffJpegSpectralConverter<Rgb24>(configuration, this.photometricInterpretation);
                HuffmanScanDecoder scanDecoder = new(stream, spectralConverter, cancellationToken);
--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/OldJpegTiffCompression.cs
@ -6,6 +6,7 @@ using SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder;
 using SixLabors.ImageSharp.Formats.Tiff.Constants;
 using SixLabors.ImageSharp.IO;
 using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.Metadata.Profiles.Icc;
 using SixLabors.ImageSharp.PixelFormats;

@ -17,6 +18,8 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor

    private readonly uint startOfImageMarker;

+    private readonly ImageFrameMetadata metadata;
+
    private readonly TiffPhotometricInterpretation photometricInterpretation;

    public OldJpegTiffCompression(
@ -24,12 +27,14 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor
        MemoryAllocator memoryAllocator,
        int width,
        int bitsPerPixel,
+        ImageFrameMetadata metadata,
        uint startOfImageMarker,
        TiffPhotometricInterpretation photometricInterpretation)
        : base(memoryAllocator, width, bitsPerPixel)
    {
        this.options = options;
        this.startOfImageMarker = startOfImageMarker;
+        this.metadata = metadata;
        this.photometricInterpretation = photometricInterpretation;
    }

@ -47,7 +52,7 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor

    private void DecodeJpegData(BufferedReadStream stream, Span<byte> buffer, CancellationToken cancellationToken)
    {
-        using JpegDecoderCore jpegDecoder = new(this.options);
+        using JpegDecoderCore jpegDecoder = new(this.options, this.metadata.IccProfile);
        Configuration configuration = this.options.GeneralOptions.Configuration;
        switch (this.photometricInterpretation)
        {
@ -71,6 +76,7 @@ internal sealed class OldJpegTiffCompression : TiffBaseDecompressor

            case TiffPhotometricInterpretation.YCbCr:
            case TiffPhotometricInterpretation.Rgb:
+            case TiffPhotometricInterpretation.Separated:
            {
                using SpectralConverter<Rgb24> spectralConverter = new TiffOldJpegSpectralConverter<Rgb24>(configuration, this.photometricInterpretation);

--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffJpegSpectralConverter{TPixel}.cs
@ -31,19 +31,30 @@ internal sealed class TiffJpegSpectralConverter<TPixel> : SpectralConverter<TPix
    /// <inheritdoc/>
    protected override JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData)
    {
-        JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation);
+        JpegColorSpace colorSpace = GetJpegColorSpace(this.photometricInterpretation, jpegData);
        return JpegColorConverterBase.GetConverter(colorSpace, frame.Precision);
    }

    /// <summary>
-    /// This converter must be used only for RGB and YCbCr color spaces for performance reasons.
+    /// Photometric interpretation Rgb and YCbCr will be mapped to RGB colorspace, which means the jpeg decompression will leave the data as is (no color conversion).
+    /// The color conversion will be done after the decompression. For Separated/CMYK/YCCK, the jpeg color converter will handle the color conversion,
+    /// since the jpeg color converter needs to return RGB data and cannot return 4 component data.
    /// For grayscale images <see cref="GrayJpegSpectralConverter{TPixel}"/> must be used.
    /// </summary>
-    private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation)
-        => interpretation switch
-        {
-            TiffPhotometricInterpretation.Rgb => JpegColorSpace.RGB,
-            TiffPhotometricInterpretation.YCbCr => JpegColorSpace.RGB,
-            _ => throw new InvalidImageContentException($"Invalid tiff photometric interpretation for jpeg encoding: {interpretation}"),
-        };
+    /// <param name="interpretation">
+    /// The <see cref="TiffPhotometricInterpretation"/> to convert to a <see cref="JpegColorSpace"/>.
+    /// </param>
+    /// <param name="data">
+    /// The <see cref="IRawJpegData"/> containing the color space information.
+    /// </param>
+    /// <exception cref="InvalidImageContentException">
+    /// Thrown when the <paramref name="interpretation"/> is not supported for JPEG compressed TIFF data.
+    /// </exception>
+    private static JpegColorSpace GetJpegColorSpace(TiffPhotometricInterpretation interpretation, IRawJpegData data) => interpretation switch
+    {
+        TiffPhotometricInterpretation.Rgb => JpegColorSpace.RGB,
+        TiffPhotometricInterpretation.Separated => data.ColorSpace == JpegColorSpace.Ycck ? JpegColorSpace.TiffYccK : JpegColorSpace.TiffCmyk,
+        TiffPhotometricInterpretation.YCbCr => JpegColorSpace.RGB, // TODO: Why doesn't this use the YCbCr color space?
+        _ => throw new InvalidImageContentException($"Invalid TIFF photometric interpretation for JPEG encoding: {interpretation}"),
+    };
 }
--- a/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/Decompressors/TiffOldJpegSpectralConverter{TPixel}.cs
@ -30,15 +30,16 @@ internal sealed class TiffOldJpegSpectralConverter<TPixel> : SpectralConverter<T
    /// <inheritdoc/>
    protected override JpegColorConverterBase GetColorConverter(JpegFrame frame, IRawJpegData jpegData)
    {
-        JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation);
+        JpegColorSpace colorSpace = GetJpegColorSpaceFromPhotometricInterpretation(this.photometricInterpretation, jpegData);
        return JpegColorConverterBase.GetConverter(colorSpace, frame.Precision);
    }

-    private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation)
+    private static JpegColorSpace GetJpegColorSpaceFromPhotometricInterpretation(TiffPhotometricInterpretation interpretation, IRawJpegData data)
        => interpretation switch
        {
            // Like libtiff: Always treat the pixel data as YCbCr when the data is compressed with old jpeg compression.
            TiffPhotometricInterpretation.Rgb => JpegColorSpace.YCbCr,
+            TiffPhotometricInterpretation.Separated => data.ColorSpace == JpegColorSpace.Ycck ? JpegColorSpace.TiffYccK : JpegColorSpace.TiffCmyk,
            TiffPhotometricInterpretation.YCbCr => JpegColorSpace.YCbCr,
            _ => throw new InvalidImageContentException($"Invalid tiff photometric interpretation for jpeg encoding: {interpretation}"),
        };
--- a/src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs
+++ b/src/ImageSharp/Formats/Tiff/Compression/TiffDecompressorsFactory.cs
@ -5,6 +5,7 @@ using SixLabors.ImageSharp.Formats.Tiff.Compression.Decompressors;
 using SixLabors.ImageSharp.Formats.Tiff.Constants;
 using SixLabors.ImageSharp.Formats.Tiff.PhotometricInterpretation;
 using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.Metadata;

 namespace SixLabors.ImageSharp.Formats.Tiff.Compression;

@ -17,6 +18,7 @@ internal static class TiffDecompressorsFactory
        TiffPhotometricInterpretation photometricInterpretation,
        int width,
        int bitsPerPixel,
+        ImageFrameMetadata metadata,
        TiffColorType colorType,
        TiffPredictor predictor,
        FaxCompressionOptions faxOptions,
@ -62,11 +64,11 @@ internal static class TiffDecompressorsFactory

            case TiffDecoderCompressionType.Jpeg:
                DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
-                return new JpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, jpegTables, photometricInterpretation);
+                return new JpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, metadata, jpegTables, photometricInterpretation);

            case TiffDecoderCompressionType.OldJpeg:
                DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
-                return new OldJpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, oldJpegStartOfImageMarker, photometricInterpretation);
+                return new OldJpegTiffCompression(new() { GeneralOptions = options }, allocator, width, bitsPerPixel, metadata, oldJpegStartOfImageMarker, photometricInterpretation);

            case TiffDecoderCompressionType.Webp:
                DebugGuard.IsTrue(predictor == TiffPredictor.None, "Predictor should only be used with lzw or deflate compression");
--- a/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs
+++ b/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/CmykTiffColor{TPixel}.cs
@ -1,7 +1,9 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

+using System.Numerics;
 using SixLabors.ImageSharp.ColorProfiles;
+using SixLabors.ImageSharp.Formats.Tiff.Compression;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -13,10 +15,31 @@ internal class CmykTiffColor<TPixel> : TiffBaseColorDecoder<TPixel>
    private static readonly ColorProfileConverter ColorProfileConverter = new();
    private const float Inv255 = 1f / 255f;

+    private readonly TiffDecoderCompressionType compression;
+
+    public CmykTiffColor(TiffDecoderCompressionType compression) => this.compression = compression;
+
    /// <inheritdoc/>
    public override void Decode(ReadOnlySpan<byte> data, Buffer2D<TPixel> pixels, int left, int top, int width, int height)
    {
        int offset = 0;
+
+        if (this.compression == TiffDecoderCompressionType.Jpeg)
+        {
+            for (int y = top; y < top + height; y++)
+            {
+                Span<TPixel> pixelRow = pixels.DangerousGetRowSpan(y).Slice(left, width);
+                for (int x = 0; x < pixelRow.Length; x++)
+                {
+                    pixelRow[x] = TPixel.FromVector4(new Vector4(data[offset] * Inv255, data[offset + 1] * Inv255, data[offset + 2] * Inv255, 1.0f));
+
+                    offset += 3;
+                }
+            }
+
+            return;
+        }
+
        for (int y = top; y < top + height; y++)
        {
            Span<TPixel> pixelRow = pixels.DangerousGetRowSpan(y).Slice(left, width);
--- a/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs
+++ b/src/ImageSharp/Formats/Tiff/PhotometricInterpretation/TiffColorDecoderFactory{TPixel}.cs
@ -1,6 +1,7 @@
 // Copyright (c) Six Labors.
 // Licensed under the Six Labors Split License.

+using SixLabors.ImageSharp.Formats.Tiff.Compression;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -19,6 +20,7 @@ internal static class TiffColorDecoderFactory<TPixel>
        Rational[] referenceBlackAndWhite,
        Rational[] ycbcrCoefficients,
        ushort[] ycbcrSubSampling,
+        TiffDecoderCompressionType compression,
        ByteOrder byteOrder)
    {
        switch (colorType)
@ -410,7 +412,7 @@ internal static class TiffColorDecoderFactory<TPixel>
                    && bitsPerSample.Channel1 == 8
                    && bitsPerSample.Channel0 == 8,
                    "bitsPerSample");
-                return new CmykTiffColor<TPixel>();
+                return new CmykTiffColor<TPixel>(compression);

            default:
                throw TiffThrowHelper.InvalidColorType(colorType.ToString());
--- a/src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs
+++ b/src/ImageSharp/Formats/Tiff/TiffDecoderCore.cs
@ -11,6 +11,7 @@ using SixLabors.ImageSharp.IO;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.Metadata.Profiles.Exif;
+using SixLabors.ImageSharp.Metadata.Profiles.Icc;
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp.Formats.Tiff;
@ -280,6 +281,12 @@ internal class TiffDecoderCore : ImageDecoderCore
        if (!this.skipMetadata)
        {
            imageFrameMetaData.ExifProfile = tags;
+
+            // We resolve the ICC profile early so that we can use it for color conversion if needed.
+            if (tags.TryGetValue(ExifTag.IccProfile, out IExifValue<byte[]> iccProfileBytes))
+            {
+                imageFrameMetaData.IccProfile = new IccProfile(iccProfileBytes.Value);
+            }
        }

        TiffFrameMetadata tiffMetadata = TiffFrameMetadata.Parse(tags);
@ -438,7 +445,7 @@ internal class TiffDecoderCore : ImageDecoderCore
                stripBuffers[stripIndex] = this.memoryAllocator.Allocate<byte>(uncompressedStripSize);
            }

-            using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel);
+            using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel, frame.Metadata);
            TiffBasePlanarColorDecoder<TPixel> colorDecoder = this.CreatePlanarColorDecoder<TPixel>();

            for (int i = 0; i < stripsPerPlane; i++)
@ -507,7 +514,7 @@ internal class TiffDecoderCore : ImageDecoderCore
        Span<byte> stripBufferSpan = stripBuffer.GetSpan();
        Buffer2D<TPixel> pixels = frame.PixelBuffer;

-        using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel);
+        using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(width, bitsPerPixel, frame.Metadata);
        TiffBaseColorDecoder<TPixel> colorDecoder = this.CreateChunkyColorDecoder<TPixel>();

        for (int stripIndex = 0; stripIndex < stripOffsets.Length; stripIndex++)
@ -578,7 +585,7 @@ internal class TiffDecoderCore : ImageDecoderCore
                tilesBuffers[i] = this.memoryAllocator.Allocate<byte>(uncompressedTilesSize, AllocationOptions.Clean);
            }

-            using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel);
+            using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel, frame.Metadata);
            TiffBasePlanarColorDecoder<TPixel> colorDecoder = this.CreatePlanarColorDecoder<TPixel>();

            int tileIndex = 0;
@ -679,7 +686,7 @@ internal class TiffDecoderCore : ImageDecoderCore
        using IMemoryOwner<byte> tileBuffer = this.memoryAllocator.Allocate<byte>(bytesPerTileRow * tileLength, AllocationOptions.Clean);
        Span<byte> tileBufferSpan = tileBuffer.GetSpan();

-        using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel, true, tileWidth, tileLength);
+        using TiffBaseDecompressor decompressor = this.CreateDecompressor<TPixel>(frame.Width, bitsPerPixel, frame.Metadata, true, tileWidth, tileLength);
        TiffBaseColorDecoder<TPixel> colorDecoder = this.CreateChunkyColorDecoder<TPixel>();

        int tileIndex = 0;
@ -733,6 +740,7 @@ internal class TiffDecoderCore : ImageDecoderCore
            this.ReferenceBlackAndWhite,
            this.YcbcrCoefficients,
            this.YcbcrSubSampling,
+            this.CompressionType,
            this.byteOrder);

    private TiffBasePlanarColorDecoder<TPixel> CreatePlanarColorDecoder<TPixel>()
@ -747,7 +755,13 @@ internal class TiffDecoderCore : ImageDecoderCore
            this.YcbcrSubSampling,
            this.byteOrder);

-    private TiffBaseDecompressor CreateDecompressor<TPixel>(int frameWidth, int bitsPerPixel, bool isTiled = false, int tileWidth = 0, int tileHeight = 0)
+    private TiffBaseDecompressor CreateDecompressor<TPixel>(
+        int frameWidth,
+        int bitsPerPixel,
+        ImageFrameMetadata metadata,
+        bool isTiled = false,
+        int tileWidth = 0,
+        int tileHeight = 0)
        where TPixel : unmanaged, IPixel<TPixel> =>
        TiffDecompressorsFactory.Create(
            this.Options,
@ -756,6 +770,7 @@ internal class TiffDecoderCore : ImageDecoderCore
            this.PhotometricInterpretation,
            frameWidth,
            bitsPerPixel,
+            metadata,
            this.ColorType,
            this.Predictor,
            this.FaxCompressionOptions,
--- a/src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs
+++ b/src/ImageSharp/Formats/Tiff/TiffDecoderMetadataCreator.cs
@ -5,7 +5,6 @@
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Metadata;
 using SixLabors.ImageSharp.Metadata.Profiles.Exif;
-using SixLabors.ImageSharp.Metadata.Profiles.Icc;
 using SixLabors.ImageSharp.Metadata.Profiles.Iptc;
 using SixLabors.ImageSharp.Metadata.Profiles.Xmp;

@ -29,6 +28,8 @@ internal static class TiffDecoderMetadataCreator
        {
            for (int i = 0; i < frames.Count; i++)
            {
+                // ICC profile data has already been resolved in the frame metadata,
+                // as it is required for color conversion.
                ImageFrameMetadata frameMetaData = frames[i];
                if (TryGetIptc(frameMetaData.ExifProfile.Values, out byte[] iptcBytes))
                {
@ -39,11 +40,6 @@ internal static class TiffDecoderMetadataCreator
                {
                    frameMetaData.XmpProfile = new XmpProfile(xmpProfileBytes.Value);
                }
-
-                if (frameMetaData.ExifProfile.TryGetValue(ExifTag.IccProfile, out IExifValue<byte[]> iccProfileBytes))
-                {
-                    frameMetaData.IccProfile = new IccProfile(iccProfileBytes.Value);
-                }
            }
        }

--- a/src/ImageSharp/Formats/Webp/AlphaDecoder.cs
+++ b/src/ImageSharp/Formats/Webp/AlphaDecoder.cs
@ -6,7 +6,6 @@ using System.Diagnostics.CodeAnalysis;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.Arm;
 using System.Runtime.Intrinsics.X86;
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Formats.Webp.BitReader;
@ -314,7 +313,7 @@ internal class AlphaDecoder : IDisposable

    private static void HorizontalUnfilter(Span<byte> prev, Span<byte> input, Span<byte> dst, int width)
    {
-        if ((Sse2.IsSupported || AdvSimd.IsSupported) && width >= 9)
+        if (Vector128.IsHardwareAccelerated && width >= 9)
        {
            dst[0] = (byte)(input[0] + (prev.IsEmpty ? 0 : prev[0]));
            nuint i;
@ -362,7 +361,7 @@ internal class AlphaDecoder : IDisposable
        {
            HorizontalUnfilter(null, input, dst, width);
        }
-        else if (Avx2.IsSupported)
+        else if (Vector256.IsHardwareAccelerated)
        {
            ref byte inputRef = ref MemoryMarshal.GetReference(input);
            ref byte prevRef = ref MemoryMarshal.GetReference(prev);
@ -374,7 +373,7 @@ internal class AlphaDecoder : IDisposable
            {
                Vector256<int> a0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref inputRef, i));
                Vector256<int> b0 = Unsafe.As<byte, Vector256<int>>(ref Unsafe.Add(ref prevRef, i));
-                Vector256<byte> c0 = Avx2.Add(a0.AsByte(), b0.AsByte());
+                Vector256<byte> c0 = a0.AsByte() + b0.AsByte();
                ref byte outputRef = ref Unsafe.Add(ref dstRef, i);
                Unsafe.As<byte, Vector256<byte>>(ref outputRef) = c0;
            }
--- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
@ -4,7 +4,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless;

@ -12,17 +12,20 @@ internal static class ColorSpaceTransformUtils
 {
    public static void CollectColorBlueTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
    {
-        if (Avx2.IsSupported && tileWidth >= 16)
+        if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
        {
            const int span = 16;
            Span<ushort> values = stackalloc ushort[span];
-            var collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
-            var collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
-            var collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-            var collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-            var collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-            var multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
-            var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
+
+            // These shuffle masks are safe for use with Avx2.Shuffle because every index stays within the 128-bit lane of the byte it produces (0–15 in the lower lane, 16–31 in the upper lane),
+            // and every unused byte position is set to 0xFF, which zeroes that output byte per the vpshufb specification. This guarantees lane-local shuffling with no cross-lane violations.
+            Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
+            Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
+            Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+            Vector256<byte> collectColorBlueTransformsGreenMask256 = Vector256.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+            Vector256<byte> collectColorBlueTransformsBlueMask256 = Vector256.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+            Vector256<short> multsr = Vector256.Create(LosslessUtils.Cst5b(redToBlue));
+            Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToBlue));
            for (int y = 0; y < tileHeight; y++)
            {
                Span<uint> srcSpan = bgra[(y * stride)..];
@ -33,18 +36,18 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector256<byte> r0 = Avx2.Shuffle(input0, collectColorBlueTransformsShuffleLowMask256);
-                    Vector256<byte> r1 = Avx2.Shuffle(input1, collectColorBlueTransformsShuffleHighMask256);
-                    Vector256<byte> r = Avx2.Or(r0, r1);
-                    Vector256<byte> gb0 = Avx2.And(input0, collectColorBlueTransformsGreenBlueMask256);
-                    Vector256<byte> gb1 = Avx2.And(input1, collectColorBlueTransformsGreenBlueMask256);
-                    Vector256<ushort> gb = Avx2.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
-                    Vector256<byte> g = Avx2.And(gb.AsByte(), collectColorBlueTransformsGreenMask256);
-                    Vector256<short> a = Avx2.MultiplyHigh(r.AsInt16(), multsr);
-                    Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg);
-                    Vector256<byte> c = Avx2.Subtract(gb.AsByte(), b.AsByte());
-                    Vector256<byte> d = Avx2.Subtract(c, a.AsByte());
-                    Vector256<byte> e = Avx2.And(d, collectColorBlueTransformsBlueMask256);
+                    Vector256<byte> r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256);
+                    Vector256<byte> r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256);
+                    Vector256<byte> r = r0 | r1;
+                    Vector256<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
+                    Vector256<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
+                    Vector256<ushort> gb = Vector256_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
+                    Vector256<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask256;
+                    Vector256<short> a = Vector256_.MultiplyHigh(r.AsInt16(), multsr);
+                    Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg);
+                    Vector256<byte> c = gb.AsByte() - b.AsByte();
+                    Vector256<byte> d = c - a.AsByte();
+                    Vector256<byte> e = d & collectColorBlueTransformsBlueMask256;

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = e.AsUInt16();
@ -59,20 +62,20 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
+                CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
            }
        }
-        else if (Sse41.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
        {
            const int span = 8;
            Span<ushort> values = stackalloc ushort[span];
-            var collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
-            var collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
-            var collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
-            var collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
-            var collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
-            var multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
-            var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
+            Vector128<byte> collectColorBlueTransformsShuffleLowMask = Vector128.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255);
+            Vector128<byte> collectColorBlueTransformsShuffleHighMask = Vector128.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14);
+            Vector128<byte> collectColorBlueTransformsGreenBlueMask = Vector128.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
+            Vector128<byte> collectColorBlueTransformsGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
+            Vector128<byte> collectColorBlueTransformsBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
+            Vector128<short> multsr = Vector128.Create(LosslessUtils.Cst5b(redToBlue));
+            Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToBlue));
            for (int y = 0; y < tileHeight; y++)
            {
                Span<uint> srcSpan = bgra[(y * stride)..];
@ -83,18 +86,18 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector128<byte> r0 = Ssse3.Shuffle(input0, collectColorBlueTransformsShuffleLowMask);
-                    Vector128<byte> r1 = Ssse3.Shuffle(input1, collectColorBlueTransformsShuffleHighMask);
-                    Vector128<byte> r = Sse2.Or(r0, r1);
-                    Vector128<byte> gb0 = Sse2.And(input0, collectColorBlueTransformsGreenBlueMask);
-                    Vector128<byte> gb1 = Sse2.And(input1, collectColorBlueTransformsGreenBlueMask);
-                    Vector128<ushort> gb = Sse41.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
-                    Vector128<byte> g = Sse2.And(gb.AsByte(), collectColorBlueTransformsGreenMask);
-                    Vector128<short> a = Sse2.MultiplyHigh(r.AsInt16(), multsr);
-                    Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg);
-                    Vector128<byte> c = Sse2.Subtract(gb.AsByte(), b.AsByte());
-                    Vector128<byte> d = Sse2.Subtract(c, a.AsByte());
-                    Vector128<byte> e = Sse2.And(d, collectColorBlueTransformsBlueMask);
+                    Vector128<byte> r0 = Vector128_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask);
+                    Vector128<byte> r1 = Vector128_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask);
+                    Vector128<byte> r = r0 | r1;
+                    Vector128<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask;
+                    Vector128<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask;
+                    Vector128<ushort> gb = Vector128_.PackUnsignedSaturate(gb0.AsInt32(), gb1.AsInt32());
+                    Vector128<byte> g = gb.AsByte() & collectColorBlueTransformsGreenMask;
+                    Vector128<short> a = Vector128_.MultiplyHigh(r.AsInt16(), multsr);
+                    Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg);
+                    Vector128<byte> c = gb.AsByte() - b.AsByte();
+                    Vector128<byte> d = c - a.AsByte();
+                    Vector128<byte> e = d & collectColorBlueTransformsBlueMask;

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = e.AsUInt16();
@ -109,16 +112,16 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorBlueTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
+                CollectColorBlueTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToBlue, redToBlue, histo);
            }
        }
        else
        {
-            CollectColorBlueTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
+            CollectColorBlueTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToBlue, redToBlue, histo);
        }
    }

-    private static void CollectColorBlueTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
+    private static void CollectColorBlueTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToBlue, int redToBlue, Span<int> histo)
    {
        int pos = 0;
        while (tileHeight-- > 0)
@ -135,11 +138,11 @@ internal static class ColorSpaceTransformUtils

    public static void CollectColorRedTransforms(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
    {
-        if (Avx2.IsSupported && tileWidth >= 16)
+        if (Vector256.IsHardwareAccelerated && tileWidth >= 16)
        {
            Vector256<byte> collectColorRedTransformsGreenMask256 = Vector256.Create(0x00ff00).AsByte();
            Vector256<byte> collectColorRedTransformsAndMask256 = Vector256.Create((short)0xff).AsByte();
-            var multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
+            Vector256<short> multsg = Vector256.Create(LosslessUtils.Cst5b(greenToRed));
            const int span = 16;
            Span<ushort> values = stackalloc ushort[span];
            for (int y = 0; y < tileHeight; y++)
@ -152,15 +155,15 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector256<byte> g0 = Avx2.And(input0, collectColorRedTransformsGreenMask256); // 0 0  | g 0
-                    Vector256<byte> g1 = Avx2.And(input1, collectColorRedTransformsGreenMask256);
-                    Vector256<ushort> g = Avx2.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
-                    Vector256<int> a0 = Avx2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
-                    Vector256<int> a1 = Avx2.ShiftRightLogical(input1.AsInt32(), 16);
-                    Vector256<ushort> a = Avx2.PackUnsignedSaturate(a0, a1); // x r
-                    Vector256<short> b = Avx2.MultiplyHigh(g.AsInt16(), multsg); // x dr
-                    Vector256<byte> c = Avx2.Subtract(a.AsByte(), b.AsByte()); // x r'
-                    Vector256<byte> d = Avx2.And(c, collectColorRedTransformsAndMask256); // 0 r'
+                    Vector256<byte> g0 = input0 & collectColorRedTransformsGreenMask256; // 0 0  | g 0
+                    Vector256<byte> g1 = input1 & collectColorRedTransformsGreenMask256;
+                    Vector256<ushort> g = Vector256_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+                    Vector256<int> a0 = Vector256.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
+                    Vector256<int> a1 = Vector256.ShiftRightLogical(input1.AsInt32(), 16);
+                    Vector256<ushort> a = Vector256_.PackUnsignedSaturate(a0, a1); // x r
+                    Vector256<short> b = Vector256_.MultiplyHigh(g.AsInt16(), multsg); // x dr
+                    Vector256<byte> c = a.AsByte() - b.AsByte(); // x r'
+                    Vector256<byte> d = c & collectColorRedTransformsAndMask256; // 0 r'

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector256<ushort>>(ref outputRef) = d.AsUInt16();
@ -175,14 +178,14 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
+                CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
            }
        }
-        else if (Sse41.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
        {
            Vector128<byte> collectColorRedTransformsGreenMask = Vector128.Create(0x00ff00).AsByte();
            Vector128<byte> collectColorRedTransformsAndMask = Vector128.Create((short)0xff).AsByte();
-            var multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
+            Vector128<short> multsg = Vector128.Create(LosslessUtils.Cst5b(greenToRed));
            const int span = 8;
            Span<ushort> values = stackalloc ushort[span];
            for (int y = 0; y < tileHeight; y++)
@ -195,15 +198,15 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector128<byte> input0 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector128<byte> input1 = Unsafe.As<uint, Vector128<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector128<byte> g0 = Sse2.And(input0, collectColorRedTransformsGreenMask); // 0 0  | g 0
-                    Vector128<byte> g1 = Sse2.And(input1, collectColorRedTransformsGreenMask);
-                    Vector128<ushort> g = Sse41.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
-                    Vector128<int> a0 = Sse2.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
-                    Vector128<int> a1 = Sse2.ShiftRightLogical(input1.AsInt32(), 16);
-                    Vector128<ushort> a = Sse41.PackUnsignedSaturate(a0, a1); // x r
-                    Vector128<short> b = Sse2.MultiplyHigh(g.AsInt16(), multsg); // x dr
-                    Vector128<byte> c = Sse2.Subtract(a.AsByte(), b.AsByte()); // x r'
-                    Vector128<byte> d = Sse2.And(c, collectColorRedTransformsAndMask); // 0 r'
+                    Vector128<byte> g0 = input0 & collectColorRedTransformsGreenMask; // 0 0  | g 0
+                    Vector128<byte> g1 = input1 & collectColorRedTransformsGreenMask;
+                    Vector128<ushort> g = Vector128_.PackUnsignedSaturate(g0.AsInt32(), g1.AsInt32()); // g 0
+                    Vector128<int> a0 = Vector128.ShiftRightLogical(input0.AsInt32(), 16); // 0 0  | x r
+                    Vector128<int> a1 = Vector128.ShiftRightLogical(input1.AsInt32(), 16);
+                    Vector128<ushort> a = Vector128_.PackUnsignedSaturate(a0, a1); // x r
+                    Vector128<short> b = Vector128_.MultiplyHigh(g.AsInt16(), multsg); // x dr
+                    Vector128<byte> c = a.AsByte() - b.AsByte(); // x r'
+                    Vector128<byte> d = c & collectColorRedTransformsAndMask; // 0 r'

                    ref ushort outputRef = ref MemoryMarshal.GetReference(values);
                    Unsafe.As<ushort, Vector128<ushort>>(ref outputRef) = d.AsUInt16();
@ -218,16 +221,16 @@ internal static class ColorSpaceTransformUtils
            int leftOver = tileWidth & (span - 1);
            if (leftOver > 0)
            {
-                CollectColorRedTransformsNoneVectorized(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
+                CollectColorRedTransformsScalar(bgra[(tileWidth - leftOver)..], stride, leftOver, tileHeight, greenToRed, histo);
            }
        }
        else
        {
-            CollectColorRedTransformsNoneVectorized(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
+            CollectColorRedTransformsScalar(bgra, stride, tileWidth, tileHeight, greenToRed, histo);
        }
    }

-    private static void CollectColorRedTransformsNoneVectorized(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
+    private static void CollectColorRedTransformsScalar(Span<uint> bgra, int stride, int tileWidth, int tileHeight, int greenToRed, Span<int> histo)
    {
        int pos = 0;
        while (tileHeight-- > 0)
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@ -6,6 +6,7 @@ using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossless;
@ -94,17 +95,20 @@ internal static unsafe class LosslessUtils
    /// <param name="pixelData">The pixel data to apply the transformation.</param>
    public static void AddGreenToBlueAndRed(Span<uint> pixelData)
    {
-        if (Avx2.IsSupported && pixelData.Length >= 8)
+        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
-            Vector256<byte> addGreenToBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
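+            // The `255` entries have the high bit (0x80) set, so the shuffle writes zero to the green and alpha bytes of each pixel; adding the result leaves G and A unchanged.
+            // The remaining entries copy each pixel's green byte (index 1, 5, 9, ...) into its blue and red positions, always from within the same 128-bit lane (0–15 and 16–31).
+            // Upper-lane indices (17–29) are >= 16 but have the high bit clear; a per-lane byte shuffle such as AVX2 vpshufb uses only their low four bits, which select the intended byte of that lane.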
+            Vector256<byte> addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Avx2.Shuffle(input, addGreenToBlueAndRedMaskAvx2);
-                Vector256<byte> output = Avx2.Add(input, in0g0g);
+                Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask);
+                Vector256<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
            }
@ -115,39 +119,17 @@ internal static unsafe class LosslessUtils
                AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
            }
        }
-        else if (Ssse3.IsSupported && pixelData.Length >= 4)
-        {
-            Vector128<byte> addGreenToBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
-            nuint numPixels = (uint)pixelData.Length;
-            nuint i = 0;
-            do
-            {
-                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
-                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<byte> in0g0g = Ssse3.Shuffle(input, addGreenToBlueAndRedMaskSsse3);
-                Vector128<byte> output = Sse2.Add(input, in0g0g);
-                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
-                i += 4;
-            }
-            while (i <= numPixels - 4);
-
-            if (i != numPixels)
-            {
-                AddGreenToBlueAndRedScalar(pixelData[(int)i..]);
-            }
-        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
+            Vector128<byte> addGreenToBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
-                Vector128<byte> output = Sse2.Add(input.AsByte(), c.AsByte());
+                Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, addGreenToBlueAndRedMask);
+                Vector128<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                i += 4;
            }
@ -180,17 +162,17 @@ internal static unsafe class LosslessUtils

    public static void SubtractGreenFromBlueAndRed(Span<uint> pixelData)
    {
-        if (Avx2.IsSupported && pixelData.Length >= 8)
+        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
-            Vector256<byte> subtractGreenFromBlueAndRedMaskAvx2 = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
+            Vector256<byte> subtractGreenFromBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Avx2.Shuffle(input, subtractGreenFromBlueAndRedMaskAvx2);
-                Vector256<byte> output = Avx2.Subtract(input, in0g0g);
+                Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask);
+                Vector256<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
            }
@ -201,39 +183,17 @@ internal static unsafe class LosslessUtils
                SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
            }
        }
-        else if (Ssse3.IsSupported && pixelData.Length >= 4)
-        {
-            Vector128<byte> subtractGreenFromBlueAndRedMaskSsse3 = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
-            nuint numPixels = (uint)pixelData.Length;
-            nuint i = 0;
-            do
-            {
-                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
-                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<byte> in0g0g = Ssse3.Shuffle(input, subtractGreenFromBlueAndRedMaskSsse3);
-                Vector128<byte> output = Sse2.Subtract(input, in0g0g);
-                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
-                i += 4;
-            }
-            while (i <= numPixels - 4);
-
-            if (i != numPixels)
-            {
-                SubtractGreenFromBlueAndRedScalar(pixelData[(int)i..]);
-            }
-        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
+            Vector128<byte> subtractGreenFromBlueAndRedMask = Vector128.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
            do
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector128<byte> input = Unsafe.As<uint, Vector128<uint>>(ref pos).AsByte();
-                Vector128<ushort> a = Sse2.ShiftRightLogical(input.AsUInt16(), 8); // 0 a 0 g
-                Vector128<ushort> b = Sse2.ShuffleLow(a, SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<ushort> c = Sse2.ShuffleHigh(b, SimdUtils.Shuffle.MMShuffle2200); // 0g0g
-                Vector128<byte> output = Sse2.Subtract(input.AsByte(), c.AsByte());
+                Vector128<byte> in0g0g = Vector128_.ShuffleNative(input, subtractGreenFromBlueAndRedMask);
+                Vector128<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                i += 4;
            }
@ -412,7 +372,7 @@ internal static unsafe class LosslessUtils
                TransformColorScalar(m, pixelData[(int)idx..], numPixels - (int)idx);
            }
        }
-        else if (Sse2.IsSupported && numPixels >= 4)
+        else if (Vector128.IsHardwareAccelerated && numPixels >= 4)
        {
            Vector128<byte> transformColorAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
            Vector128<byte> transformColorRedBlueMask = Vector128.Create(255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0);
@ -423,16 +383,16 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
                Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
-                Vector128<byte> a = Sse2.And(input.AsByte(), transformColorAlphaGreenMask);
-                Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
-                Vector128<short> e = Sse2.ShiftLeftLogical(input.AsInt16(), 8);
-                Vector128<short> f = Sse2.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
-                Vector128<int> g = Sse2.ShiftRightLogical(f.AsInt32(), 16);
-                Vector128<byte> h = Sse2.Add(g.AsByte(), d.AsByte());
-                Vector128<byte> i = Sse2.And(h, transformColorRedBlueMask);
-                Vector128<byte> output = Sse2.Subtract(input.AsByte(), i);
+                Vector128<byte> a = input.AsByte() & transformColorAlphaGreenMask;
+                Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+                Vector128<short> e = Vector128_.ShiftLeftLogical(input.AsInt16(), 8);
+                Vector128<short> f = Vector128_.MultiplyHigh(e.AsInt16(), multsb2.AsInt16());
+                Vector128<int> g = Vector128.ShiftRightLogical(f.AsInt32(), 16);
+                Vector128<byte> h = g.AsByte() + d.AsByte();
+                Vector128<byte> i = h & transformColorRedBlueMask;
+                Vector128<byte> output = input.AsByte() - i;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
                idx += 4;
            }
@ -503,7 +463,7 @@ internal static unsafe class LosslessUtils
                TransformColorInverseScalar(m, pixelData[(int)idx..]);
            }
        }
-        else if (Sse2.IsSupported && pixelData.Length >= 4)
+        else if (Vector128.IsHardwareAccelerated && pixelData.Length >= 4)
        {
            Vector128<byte> transformColorInverseAlphaGreenMask = Vector128.Create(0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255, 0, 255);
            Vector128<int> multsrb = MkCst16(Cst5b(m.GreenToRed), Cst5b(m.GreenToBlue));
@ -514,17 +474,17 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), idx);
                Vector128<uint> input = Unsafe.As<uint, Vector128<uint>>(ref pos);
-                Vector128<byte> a = Sse2.And(input.AsByte(), transformColorInverseAlphaGreenMask);
-                Vector128<short> b = Sse2.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> c = Sse2.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
-                Vector128<short> d = Sse2.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
-                Vector128<byte> e = Sse2.Add(input.AsByte(), d.AsByte());
-                Vector128<short> f = Sse2.ShiftLeftLogical(e.AsInt16(), 8);
-                Vector128<short> g = Sse2.MultiplyHigh(f, multsb2.AsInt16());
-                Vector128<int> h = Sse2.ShiftRightLogical(g.AsInt32(), 8);
-                Vector128<byte> i = Sse2.Add(h.AsByte(), f.AsByte());
-                Vector128<short> j = Sse2.ShiftRightLogical(i.AsInt16(), 8);
-                Vector128<byte> output = Sse2.Or(j.AsByte(), a);
+                Vector128<byte> a = input.AsByte() & transformColorInverseAlphaGreenMask;
+                Vector128<short> b = Vector128_.ShuffleLow(a.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> c = Vector128_.ShuffleHigh(b.AsInt16(), SimdUtils.Shuffle.MMShuffle2200);
+                Vector128<short> d = Vector128_.MultiplyHigh(c.AsInt16(), multsrb.AsInt16());
+                Vector128<byte> e = input.AsByte() + d.AsByte();
+                Vector128<short> f = Vector128_.ShiftLeftLogical(e.AsInt16(), 8);
+                Vector128<short> g = Vector128_.MultiplyHigh(f, multsb2.AsInt16());
+                Vector128<int> h = Vector128.ShiftRightLogical(g.AsInt32(), 8);
+                Vector128<byte> i = h.AsByte() + f.AsByte();
+                Vector128<short> j = Vector128.ShiftRightLogical(i.AsInt16(), 8);
+                Vector128<byte> output = j.AsByte() | a;
                Unsafe.As<uint, Vector128<uint>>(ref pos) = output.AsUInt32();
            }

@ -1401,15 +1361,15 @@ internal static unsafe class LosslessUtils

    private static uint ClampedAddSubtractFull(uint c0, uint c1, uint c2)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c2Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-            Vector128<short> v1 = Sse2.Add(c0Vec.AsInt16(), c1Vec.AsInt16());
-            Vector128<short> v2 = Sse2.Subtract(v1, c2Vec.AsInt16());
-            Vector128<byte> b = Sse2.PackUnsignedSaturate(v2, v2);
-            return Sse2.ConvertToUInt32(b.AsUInt32());
+            Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c2Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
+            Vector128<short> v1 = c0Vec.AsInt16() + c1Vec.AsInt16();
+            Vector128<short> v2 = v1 - c2Vec.AsInt16();
+            Vector128<byte> b = Vector128_.PackUnsignedSaturate(v2, v2);
+            return b.AsUInt32().ToScalar();
        }

        {
@ -1432,20 +1392,20 @@ internal static unsafe class LosslessUtils

    private static uint ClampedAddSubtractHalf(uint c0, uint c1, uint c2)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            Vector128<byte> c0Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c0).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> c1Vec = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c1).AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b0 = Sse2.UnpackLow(Sse2.ConvertScalarToVector128UInt32(c2).AsByte(), Vector128<byte>.Zero);
-            Vector128<short> avg = Sse2.Add(c1Vec.AsInt16(), c0Vec.AsInt16());
-            Vector128<short> a0 = Sse2.ShiftRightLogical(avg, 1);
-            Vector128<short> a1 = Sse2.Subtract(a0, b0.AsInt16());
-            Vector128<short> bgta = Sse2.CompareGreaterThan(b0.AsInt16(), a0.AsInt16());
-            Vector128<short> a2 = Sse2.Subtract(a1, bgta);
-            Vector128<short> a3 = Sse2.ShiftRightArithmetic(a2, 1);
-            Vector128<short> a4 = Sse2.Add(a0, a3).AsInt16();
-            Vector128<byte> a5 = Sse2.PackUnsignedSaturate(a4, a4);
-            return Sse2.ConvertToUInt32(a5.AsUInt32());
+            Vector128<byte> c0Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c0).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> c1Vec = Vector128_.UnpackLow(Vector128.CreateScalar(c1).AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b0 = Vector128_.UnpackLow(Vector128.CreateScalar(c2).AsByte(), Vector128<byte>.Zero);
+            Vector128<short> avg = c1Vec.AsInt16() + c0Vec.AsInt16();
+            Vector128<short> a0 = Vector128.ShiftRightLogical(avg, 1);
+            Vector128<short> a1 = a0 - b0.AsInt16();
+            Vector128<short> bgta = Vector128.GreaterThan(b0.AsInt16(), a0.AsInt16());
+            Vector128<short> a2 = a1 - bgta;
+            Vector128<short> a3 = Vector128.ShiftRightArithmetic(a2, 1);
+            Vector128<short> a4 = (a0 + a3).AsInt16();
+            Vector128<byte> a5 = Vector128_.PackUnsignedSaturate(a4, a4);
+            return a5.AsUInt32().ToScalar();
        }

        {
@ -1475,23 +1435,23 @@ internal static unsafe class LosslessUtils

    private static uint Select(uint a, uint b, uint c, Span<short> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            fixed (short* ptr = &MemoryMarshal.GetReference(scratch))
            {
-                Vector128<byte> a0 = Sse2.ConvertScalarToVector128UInt32(a).AsByte();
-                Vector128<byte> b0 = Sse2.ConvertScalarToVector128UInt32(b).AsByte();
-                Vector128<byte> c0 = Sse2.ConvertScalarToVector128UInt32(c).AsByte();
-                Vector128<byte> ac0 = Sse2.SubtractSaturate(a0, c0);
-                Vector128<byte> ca0 = Sse2.SubtractSaturate(c0, a0);
-                Vector128<byte> bc0 = Sse2.SubtractSaturate(b0, c0);
-                Vector128<byte> cb0 = Sse2.SubtractSaturate(c0, b0);
-                Vector128<byte> ac = Sse2.Or(ac0, ca0);
-                Vector128<byte> bc = Sse2.Or(bc0, cb0);
-                Vector128<byte> pa = Sse2.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
-                Vector128<byte> pb = Sse2.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
-                Vector128<ushort> diff = Sse2.Subtract(pb.AsUInt16(), pa.AsUInt16());
-                Sse2.Store((ushort*)ptr, diff);
+                Vector128<byte> a0 = Vector128.CreateScalar(a).AsByte();
+                Vector128<byte> b0 = Vector128.CreateScalar(b).AsByte();
+                Vector128<byte> c0 = Vector128.CreateScalar(c).AsByte();
+                Vector128<byte> ac0 = Vector128_.SubtractSaturate(a0, c0);
+                Vector128<byte> ca0 = Vector128_.SubtractSaturate(c0, a0);
+                Vector128<byte> bc0 = Vector128_.SubtractSaturate(b0, c0);
+                Vector128<byte> cb0 = Vector128_.SubtractSaturate(c0, b0);
+                Vector128<byte> ac = ac0 | ca0;
+                Vector128<byte> bc = bc0 | cb0;
+                Vector128<byte> pa = Vector128_.UnpackLow(ac, Vector128<byte>.Zero); // |a - c|
+                Vector128<byte> pb = Vector128_.UnpackLow(bc, Vector128<byte>.Zero); // |b - c|
+                Vector128<ushort> diff = pb.AsUInt16() - pa.AsUInt16();
+                diff.Store((ushort*)ptr);
                int paMinusPb = ptr[3] + ptr[2] + ptr[1] + ptr[0];
                return (paMinusPb <= 0) ? a : b;
            }
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@ -2,10 +2,11 @@
 // Licensed under the Six Labors Split License.

 using System.Buffers.Binary;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;

 namespace SixLabors.ImageSharp.Formats.Webp.Lossy;

@ -78,7 +79,7 @@ internal static unsafe class Vp8Encoding
    // Does two inverse transforms.
    public static void ITransformTwo(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            // This implementation makes use of 16-bit fixed point versions of two
            // multiply constants:
@ -116,10 +117,10 @@ internal static unsafe class Vp8Encoding
            Vector128<long> inb2 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 24)), 0);
            Vector128<long> inb3 = Vector128.Create(Unsafe.As<short, long>(ref Unsafe.Add(ref inputRef, 28)), 0);

-            in0 = Sse2.UnpackLow(in0, inb0);
-            in1 = Sse2.UnpackLow(in1, inb1);
-            in2 = Sse2.UnpackLow(in2, inb2);
-            in3 = Sse2.UnpackLow(in3, inb3);
+            in0 = Vector128_.UnpackLow(in0, inb0);
+            in1 = Vector128_.UnpackLow(in1, inb1);
+            in2 = Vector128_.UnpackLow(in2, inb2);
+            in3 = Vector128_.UnpackLow(in3, inb3);

            // a00 a10 a20 a30   b00 b10 b20 b30
            // a01 a11 a21 a31   b01 b11 b21 b31
@ -128,49 +129,45 @@ internal static unsafe class Vp8Encoding

            // Vertical pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
+            InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

            // Horizontal pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
+            InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);

            // Add inverse transform to 'ref' and store.
            // Load the reference(s).
-            Vector128<byte> ref0 = Vector128<byte>.Zero;
-            Vector128<byte> ref1 = Vector128<byte>.Zero;
-            Vector128<byte> ref2 = Vector128<byte>.Zero;
-            Vector128<byte> ref3 = Vector128<byte>.Zero;
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);

            // Load eight bytes/pixels per line.
-            ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
-            ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
-            ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
-            ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
+            Vector128<byte> ref0 = Vector128.Create(Unsafe.As<byte, long>(ref referenceRef), 0).AsByte();
+            Vector128<byte> ref1 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
+            Vector128<byte> ref2 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
+            Vector128<byte> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();

            // Convert to 16b.
-            ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
-            ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
-            ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
-            ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+            ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
+            ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
+            ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
+            ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);

            // Add the inverse transform(s).
-            Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
-            Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
-            Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
-            Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+            Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
+            Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
+            Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
+            Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();

            // Unsigned saturate to 8b.
-            ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
-            ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
-            ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
-            ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+            ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+            ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+            ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+            ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);

            // Store eight bytes/pixels per line.
            ref byte outputRef = ref MemoryMarshal.GetReference(dst);
@ -188,7 +185,7 @@ internal static unsafe class Vp8Encoding

    public static void ITransformOne(Span<byte> reference, Span<short> input, Span<byte> dst, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            // Load and concatenate the transform coefficients (we'll do two inverse
            // transforms in parallel). In the case of only one inverse transform, the
@ -207,63 +204,59 @@ internal static unsafe class Vp8Encoding

            // Vertical pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
+            InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);

            // Horizontal pass and subsequent transpose.
            // First pass, c and d calculations are longer because of the "trick" multiplications.
-            InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
+            InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);

            // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);

            // Add inverse transform to 'ref' and store.
            // Load the reference(s).
-            Vector128<byte> ref0 = Vector128<byte>.Zero;
-            Vector128<byte> ref1 = Vector128<byte>.Zero;
-            Vector128<byte> ref2 = Vector128<byte>.Zero;
-            Vector128<byte> ref3 = Vector128<byte>.Zero;
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);

            // Load four bytes/pixels per line.
-            ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref referenceRef)).AsByte();
-            ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
-            ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
-            ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
+            Vector128<byte> ref0 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref referenceRef)).AsByte();
+            Vector128<byte> ref1 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
+            Vector128<byte> ref2 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
+            Vector128<byte> ref3 = Vector128.CreateScalar(Unsafe.ReadUnaligned<int>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();

            // Convert to 16b.
-            ref0 = Sse2.UnpackLow(ref0, Vector128<byte>.Zero);
-            ref1 = Sse2.UnpackLow(ref1, Vector128<byte>.Zero);
-            ref2 = Sse2.UnpackLow(ref2, Vector128<byte>.Zero);
-            ref3 = Sse2.UnpackLow(ref3, Vector128<byte>.Zero);
+            ref0 = Vector128_.UnpackLow(ref0, Vector128<byte>.Zero);
+            ref1 = Vector128_.UnpackLow(ref1, Vector128<byte>.Zero);
+            ref2 = Vector128_.UnpackLow(ref2, Vector128<byte>.Zero);
+            ref3 = Vector128_.UnpackLow(ref3, Vector128<byte>.Zero);

            // Add the inverse transform(s).
-            Vector128<short> ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
-            Vector128<short> ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
-            Vector128<short> ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
-            Vector128<short> ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+            Vector128<short> ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
+            Vector128<short> ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
+            Vector128<short> ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
+            Vector128<short> ref3InvAdded = ref3.AsInt16() + t3.AsInt16();

            // Unsigned saturate to 8b.
-            ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
-            ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
-            ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
-            ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+            ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+            ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+            ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+            ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);

            // Get a reference to the start of the destination buffer.
            ref byte outputRef = ref MemoryMarshal.GetReference(dst);

            // Store four bytes/pixels per line.
-            int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
-            int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
-            int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
-            int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
-
-            Unsafe.As<byte, int>(ref outputRef) = output0;
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
-            Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3)) = output3;
+            int output0 = ref0.AsInt32().ToScalar();
+            int output1 = ref1.AsInt32().ToScalar();
+            int output2 = ref2.AsInt32().ToScalar();
+            int output3 = ref3.AsInt32().ToScalar();
+
+            Unsafe.WriteUnaligned(ref outputRef, output0);
+            Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps), output1);
+            Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2), output2);
+            Unsafe.WriteUnaligned(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 3), output3);
        }
        else
        {
@ -302,72 +295,72 @@ internal static unsafe class Vp8Encoding
        }
    }

-    private static void InverseTransformVerticalPass(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
+    private static void InverseTransformVerticalPassVector128(Vector128<long> in0, Vector128<long> in2, Vector128<long> in1, Vector128<long> in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3)
    {
-        Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-        Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+        Vector128<short> a = in0.AsInt16() + in2.AsInt16();
+        Vector128<short> b = in0.AsInt16() - in2.AsInt16();

        Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
        Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();

        // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-        Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
-        Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
-        Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-        Vector128<short> c4 = Sse2.Subtract(c1, c2);
-        Vector128<short> c = Sse2.Add(c3, c4);
+        Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
+        Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
+        Vector128<short> c3 = in1.AsInt16() - in3.AsInt16();
+        Vector128<short> c4 = c1 - c2;
+        Vector128<short> c = c3 + c4;

        // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-        Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
-        Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
-        Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-        Vector128<short> d4 = Sse2.Add(d1, d2);
-        Vector128<short> d = Sse2.Add(d3, d4);
+        Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
+        Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
+        Vector128<short> d3 = in1.AsInt16() + in3.AsInt16();
+        Vector128<short> d4 = d1 + d2;
+        Vector128<short> d = d3 + d4;

        // Second pass.
-        tmp0 = Sse2.Add(a, d);
-        tmp1 = Sse2.Add(b, c);
-        tmp2 = Sse2.Subtract(b, c);
-        tmp3 = Sse2.Subtract(a, d);
+        tmp0 = a + d;
+        tmp1 = b + c;
+        tmp2 = b - c;
+        tmp3 = a - d;
    }

-    private static void InverseTransformHorizontalPass(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
+    private static void InverseTransformHorizontalPassVector128(Vector128<long> t0, Vector128<long> t2, Vector128<long> t1, Vector128<long> t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3)
    {
-        Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
-        Vector128<short> a = Sse2.Add(dc, t2.AsInt16());
-        Vector128<short> b = Sse2.Subtract(dc, t2.AsInt16());
+        Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4);
+        Vector128<short> a = dc + t2.AsInt16();
+        Vector128<short> b = dc - t2.AsInt16();

        Vector128<short> k1 = Vector128.Create((short)20091).AsInt16();
        Vector128<short> k2 = Vector128.Create((short)-30068).AsInt16();

        // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-        Vector128<short> c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
-        Vector128<short> c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
-        Vector128<short> c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-        Vector128<short> c4 = Sse2.Subtract(c1, c2);
-        Vector128<short> c = Sse2.Add(c3, c4);
+        Vector128<short> c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
+        Vector128<short> c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
+        Vector128<short> c3 = t1.AsInt16() - t3.AsInt16();
+        Vector128<short> c4 = c1 - c2;
+        Vector128<short> c = c3 + c4;

        // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-        Vector128<short> d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
-        Vector128<short> d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
-        Vector128<short> d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-        Vector128<short> d4 = Sse2.Add(d1, d2);
-        Vector128<short> d = Sse2.Add(d3, d4);
+        Vector128<short> d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
+        Vector128<short> d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
+        Vector128<short> d3 = t1.AsInt16() + t3.AsInt16();
+        Vector128<short> d4 = d1 + d2;
+        Vector128<short> d = d3 + d4;

        // Second pass.
-        Vector128<short> tmp0 = Sse2.Add(a, d);
-        Vector128<short> tmp1 = Sse2.Add(b, c);
-        Vector128<short> tmp2 = Sse2.Subtract(b, c);
-        Vector128<short> tmp3 = Sse2.Subtract(a, d);
-        shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-        shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-        shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-        shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+        Vector128<short> tmp0 = a + d;
+        Vector128<short> tmp1 = b + c;
+        Vector128<short> tmp2 = b - c;
+        Vector128<short> tmp3 = a - d;
+        shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
+        shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
+        shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
+        shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
    }

    public static void FTransform2(Span<byte> src, Span<byte> reference, Span<short> output, Span<short> output2, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            ref byte srcRef = ref MemoryMarshal.GetReference(src);
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -385,38 +378,38 @@ internal static unsafe class Vp8Encoding
            Vector128<long> ref3 = Vector128.Create(Unsafe.As<byte, long>(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);

            // Convert both to 16 bit.
-            Vector128<byte> srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow0 = Vector128_.UnpackLow(ref0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow1 = Vector128_.UnpackLow(ref1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow2 = Vector128_.UnpackLow(ref2.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> refLow3 = Vector128_.UnpackLow(ref3.AsByte(), Vector128<byte>.Zero);

            // Compute difference. -> 00 01 02 03  00' 01' 02' 03'
-            Vector128<short> diff0 = Sse2.Subtract(srcLow0.AsInt16(), refLow0.AsInt16());
-            Vector128<short> diff1 = Sse2.Subtract(srcLow1.AsInt16(), refLow1.AsInt16());
-            Vector128<short> diff2 = Sse2.Subtract(srcLow2.AsInt16(), refLow2.AsInt16());
-            Vector128<short> diff3 = Sse2.Subtract(srcLow3.AsInt16(), refLow3.AsInt16());
+            Vector128<short> diff0 = srcLow0.AsInt16() - refLow0.AsInt16();
+            Vector128<short> diff1 = srcLow1.AsInt16() - refLow1.AsInt16();
+            Vector128<short> diff2 = srcLow2.AsInt16() - refLow2.AsInt16();
+            Vector128<short> diff3 = srcLow3.AsInt16() - refLow3.AsInt16();

            // Unpack and shuffle.
            // 00 01 02 03   0 0 0 0
            // 10 11 12 13   0 0 0 0
            // 20 21 22 23   0 0 0 0
            // 30 31 32 33   0 0 0 0
-            Vector128<int> shuf01l = Sse2.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
-            Vector128<int> shuf23l = Sse2.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
-            Vector128<int> shuf01h = Sse2.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
-            Vector128<int> shuf23h = Sse2.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());
+            Vector128<int> shuf01l = Vector128_.UnpackLow(diff0.AsInt32(), diff1.AsInt32());
+            Vector128<int> shuf23l = Vector128_.UnpackLow(diff2.AsInt32(), diff3.AsInt32());
+            Vector128<int> shuf01h = Vector128_.UnpackHigh(diff0.AsInt32(), diff1.AsInt32());
+            Vector128<int> shuf23h = Vector128_.UnpackHigh(diff2.AsInt32(), diff3.AsInt32());

            // First pass.
-            FTransformPass1SSE2(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
-            FTransformPass1SSE2(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);
+            FTransformPass1Vector128(shuf01l.AsInt16(), shuf23l.AsInt16(), out Vector128<int> v01l, out Vector128<int> v32l);
+            FTransformPass1Vector128(shuf01h.AsInt16(), shuf23h.AsInt16(), out Vector128<int> v01h, out Vector128<int> v32h);

            // Second pass.
-            FTransformPass2SSE2(v01l, v32l, output);
-            FTransformPass2SSE2(v01h, v32h, output2);
+            FTransformPass2Vector128(v01l, v32l, output);
+            FTransformPass2Vector128(v01h, v32h, output2);
        }
        else
        {
@ -427,7 +420,7 @@ internal static unsafe class Vp8Encoding

    public static void FTransform(Span<byte> src, Span<byte> reference, Span<short> output, Span<int> scratch)
    {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
            ref byte srcRef = ref MemoryMarshal.GetReference(src);
            ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@ -449,29 +442,29 @@ internal static unsafe class Vp8Encoding
            // 20 21 22 23 *
            // 30 31 32 33 *
            // Shuffle.
-            Vector128<short> srcLow0 = Sse2.UnpackLow(src0.AsInt16(), src1.AsInt16());
-            Vector128<short> srcLow1 = Sse2.UnpackLow(src2.AsInt16(), src3.AsInt16());
-            Vector128<short> refLow0 = Sse2.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
-            Vector128<short> refLow1 = Sse2.UnpackLow(ref2.AsInt16(), ref3.AsInt16());
+            Vector128<short> srcLow0 = Vector128_.UnpackLow(src0.AsInt16(), src1.AsInt16());
+            Vector128<short> srcLow1 = Vector128_.UnpackLow(src2.AsInt16(), src3.AsInt16());
+            Vector128<short> refLow0 = Vector128_.UnpackLow(ref0.AsInt16(), ref1.AsInt16());
+            Vector128<short> refLow1 = Vector128_.UnpackLow(ref2.AsInt16(), ref3.AsInt16());

            // 00 01 10 11 02 03 12 13 * * ...
            // 20 21 30 31 22 23 32 33 * * ...

            // Convert both to 16 bit.
-            Vector128<byte> src0_16b = Sse2.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> src1_16b = Sse2.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> ref0_16b = Sse2.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> ref1_16b = Sse2.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> src0_16b = Vector128_.UnpackLow(srcLow0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> src1_16b = Vector128_.UnpackLow(srcLow1.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> ref0_16b = Vector128_.UnpackLow(refLow0.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> ref1_16b = Vector128_.UnpackLow(refLow1.AsByte(), Vector128<byte>.Zero);

            // Compute the difference.
-            Vector128<short> row01 = Sse2.Subtract(src0_16b.AsInt16(), ref0_16b.AsInt16());
-            Vector128<short> row23 = Sse2.Subtract(src1_16b.AsInt16(), ref1_16b.AsInt16());
+            Vector128<short> row01 = src0_16b.AsInt16() - ref0_16b.AsInt16();
+            Vector128<short> row23 = src1_16b.AsInt16() - ref1_16b.AsInt16();

            // First pass.
-            FTransformPass1SSE2(row01, row23, out Vector128<int> v01, out Vector128<int> v32);
+            FTransformPass1Vector128(row01, row23, out Vector128<int> v01, out Vector128<int> v32);

            // Second pass.
-            FTransformPass2SSE2(v01, v32, output);
+            FTransformPass2Vector128(v01, v32, output);
        }
        else
        {
@ -517,88 +510,88 @@ internal static unsafe class Vp8Encoding
        }
    }

-    public static void FTransformPass1SSE2(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
+    public static void FTransformPass1Vector128(Vector128<short> row01, Vector128<short> row23, out Vector128<int> out01, out Vector128<int> out32)
    {
        // row01 = 00 01 10 11 02 03 12 13
        // row23 = 20 21 30 31 22 23 32 33
-        Vector128<short> shuf01_p = Sse2.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
-        Vector128<short> shuf32_p = Sse2.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);
+        Vector128<short> shuf01_p = Vector128_.ShuffleHigh(row01, SimdUtils.Shuffle.MMShuffle2301);
+        Vector128<short> shuf32_p = Vector128_.ShuffleHigh(row23, SimdUtils.Shuffle.MMShuffle2301);

        // 00 01 10 11 03 02 13 12
        // 20 21 30 31 23 22 33 32
-        Vector128<long> s01 = Sse2.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
-        Vector128<long> s32 = Sse2.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+        Vector128<long> s01 = Vector128_.UnpackLow(shuf01_p.AsInt64(), shuf32_p.AsInt64());
+        Vector128<long> s32 = Vector128_.UnpackHigh(shuf01_p.AsInt64(), shuf32_p.AsInt64());

        // 00 01 10 11 20 21 30 31
        // 03 02 13 12 23 22 33 32
-        Vector128<short> a01 = Sse2.Add(s01.AsInt16(), s32.AsInt16());
-        Vector128<short> a32 = Sse2.Subtract(s01.AsInt16(), s32.AsInt16());
+        Vector128<short> a01 = s01.AsInt16() + s32.AsInt16();
+        Vector128<short> a32 = s01.AsInt16() - s32.AsInt16();

        // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
        // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

        // [ (a0 + a1) << 3, ... ]
-        Vector128<int> tmp0 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p
+        Vector128<int> tmp0 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0).AsInt16()); // K88p

        // [ (a0 - a1) << 3, ... ]
-        Vector128<int> tmp2 = Sse2.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16());        // K88m
-        Vector128<int> tmp11 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16());   // K5352_2217p
-        Vector128<int> tmp31 = Sse2.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16());   // K5352_2217m
-        Vector128<int> tmp12 = Sse2.Add(tmp11, Vector128.Create(1812));
-        Vector128<int> tmp32 = Sse2.Add(tmp31, Vector128.Create(937));
-        Vector128<int> tmp1 = Sse2.ShiftRightArithmetic(tmp12, 9);
-        Vector128<int> tmp3 = Sse2.ShiftRightArithmetic(tmp32, 9);
-        Vector128<short> s03 = Sse2.PackSignedSaturate(tmp0, tmp2);
-        Vector128<short> s12 = Sse2.PackSignedSaturate(tmp1, tmp3);
-        Vector128<short> slo = Sse2.UnpackLow(s03, s12); // 0 1 0 1 0 1...
-        Vector128<short> shi = Sse2.UnpackHigh(s03, s12); // 2 3 2 3 2 3
-        Vector128<int> v23 = Sse2.UnpackHigh(slo.AsInt32(), shi.AsInt32());
-        out01 = Sse2.UnpackLow(slo.AsInt32(), shi.AsInt32());
-        out32 = Sse2.Shuffle(v23, SimdUtils.Shuffle.MMShuffle1032);
+        Vector128<int> tmp2 = Vector128_.MultiplyAddAdjacent(a01, Vector128.Create(8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255, 8, 0, 248, 255).AsInt16());        // K88m
+        Vector128<int> tmp11 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8).AsInt16());   // K5352_2217p
+        Vector128<int> tmp31 = Vector128_.MultiplyAddAdjacent(a32, Vector128.Create(169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235).AsInt16());   // K5352_2217m
+        Vector128<int> tmp12 = tmp11 + Vector128.Create(1812);
+        Vector128<int> tmp32 = tmp31 + Vector128.Create(937);
+        Vector128<int> tmp1 = Vector128.ShiftRightArithmetic(tmp12, 9);
+        Vector128<int> tmp3 = Vector128.ShiftRightArithmetic(tmp32, 9);
+        Vector128<short> s03 = Vector128_.PackSignedSaturate(tmp0, tmp2);
+        Vector128<short> s12 = Vector128_.PackSignedSaturate(tmp1, tmp3);
+        Vector128<short> slo = Vector128_.UnpackLow(s03, s12); // 0 1 0 1 0 1...
+        Vector128<short> shi = Vector128_.UnpackHigh(s03, s12); // 2 3 2 3 2 3
+        Vector128<int> v23 = Vector128_.UnpackHigh(slo.AsInt32(), shi.AsInt32());
+        out01 = Vector128_.UnpackLow(slo.AsInt32(), shi.AsInt32());
+        out32 = Vector128_.ShuffleNative(v23, SimdUtils.Shuffle.MMShuffle1032);
    }

-    public static void FTransformPass2SSE2(Vector128<int> v01, Vector128<int> v32, Span<short> output)
+    public static void FTransformPass2Vector128(Vector128<int> v01, Vector128<int> v32, Span<short> output)
    {
        // Same operations are done on the (0,3) and (1,2) pairs.
        // a3 = v0 - v3
        // a2 = v1 - v2
-        Vector128<short> a32 = Sse2.Subtract(v01.AsInt16(), v32.AsInt16());
-        Vector128<long> a22 = Sse2.UnpackHigh(a32.AsInt64(), a32.AsInt64());
+        Vector128<short> a32 = v01.AsInt16() - v32.AsInt16();
+        Vector128<long> a22 = Vector128_.UnpackHigh(a32.AsInt64(), a32.AsInt64());

-        Vector128<short> b23 = Sse2.UnpackLow(a22.AsInt16(), a32.AsInt16());
-        Vector128<int> c1 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16());  // K5352_2217
-        Vector128<int> c3 = Sse2.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16());  // K2217_5352
-        Vector128<int> d1 = Sse2.Add(c1, Vector128.Create(12000 + (1 << 16)));  // K12000PlusOne
-        Vector128<int> d3 = Sse2.Add(c3, Vector128.Create(51000));
-        Vector128<int> e1 = Sse2.ShiftRightArithmetic(d1, 16);
-        Vector128<int> e3 = Sse2.ShiftRightArithmetic(d3, 16);
+        Vector128<short> b23 = Vector128_.UnpackLow(a22.AsInt16(), a32.AsInt16());
+        Vector128<int> c1 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20, 169, 8, 232, 20).AsInt16());  // K5352_2217
+        Vector128<int> c3 = Vector128_.MultiplyAddAdjacent(b23, Vector128.Create(24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8, 24, 235, 169, 8).AsInt16());  // K2217_5352
+        Vector128<int> d1 = c1 + Vector128.Create(12000 + (1 << 16));  // K12000PlusOne
+        Vector128<int> d3 = c3 + Vector128.Create(51000);
+        Vector128<int> e1 = Vector128.ShiftRightArithmetic(d1, 16);
+        Vector128<int> e3 = Vector128.ShiftRightArithmetic(d3, 16);

        // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
        // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
-        Vector128<short> f1 = Sse2.PackSignedSaturate(e1, e1);
-        Vector128<short> f3 = Sse2.PackSignedSaturate(e3, e3);
+        Vector128<short> f1 = Vector128_.PackSignedSaturate(e1, e1);
+        Vector128<short> f3 = Vector128_.PackSignedSaturate(e3, e3);

        // g1 = f1 + (a3 != 0);
        // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
        // desired (0, 1), we add one earlier through k12000_plus_one.
        // -> g1 = f1 + 1 - (a3 == 0)
-        Vector128<short> g1 = Sse2.Add(f1, Sse2.CompareEqual(a32, Vector128<short>.Zero));
+        Vector128<short> g1 = f1 + Vector128.Equals(a32, Vector128<short>.Zero);

        // a0 = v0 + v3
        // a1 = v1 + v2
-        Vector128<short> a01 = Sse2.Add(v01.AsInt16(), v32.AsInt16());
-        Vector128<short> a01Plus7 = Sse2.Add(a01.AsInt16(), Vector128.Create((short)7));
-        Vector128<short> a11 = Sse2.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
-        Vector128<short> c0 = Sse2.Add(a01Plus7, a11);
-        Vector128<short> c2 = Sse2.Subtract(a01Plus7, a11);
+        Vector128<short> a01 = v01.AsInt16() + v32.AsInt16();
+        Vector128<short> a01Plus7 = a01.AsInt16() + Vector128.Create((short)7);
+        Vector128<short> a11 = Vector128_.UnpackHigh(a01.AsInt64(), a01.AsInt64()).AsInt16();
+        Vector128<short> c0 = a01Plus7 + a11;
+        Vector128<short> c2 = a01Plus7 - a11;

        // d0 = (a0 + a1 + 7) >> 4;
        // d2 = (a0 - a1 + 7) >> 4;
-        Vector128<short> d0 = Sse2.ShiftRightArithmetic(c0, 4);
-        Vector128<short> d2 = Sse2.ShiftRightArithmetic(c2, 4);
+        Vector128<short> d0 = Vector128.ShiftRightArithmetic(c0, 4);
+        Vector128<short> d2 = Vector128.ShiftRightArithmetic(c2, 4);

-        Vector128<long> d0g1 = Sse2.UnpackLow(d0.AsInt64(), g1.AsInt64());
-        Vector128<long> d2f3 = Sse2.UnpackLow(d2.AsInt64(), f3.AsInt64());
+        Vector128<long> d0g1 = Vector128_.UnpackLow(d0.AsInt64(), g1.AsInt64());
+        Vector128<long> d2f3 = Vector128_.UnpackLow(d2.AsInt64(), f3.AsInt64());

        ref short outputRef = ref MemoryMarshal.GetReference(output);
        Unsafe.As<short, Vector128<short>>(ref outputRef) = d0g1.AsInt16();
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@ -5,7 +5,7 @@ using System.Buffers;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;

@ -29,9 +29,9 @@ internal static class YuvConversion
    //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
    public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
    {
-        if (Sse41.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
        {
-            UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+            UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
        }
        else
        {
@ -107,7 +107,7 @@ internal static class YuvConversion
    //
    // Then m can be written as
    // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
-    private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
+    private static void UpSampleVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
    {
        const int xStep = 3;
        Array.Clear(uvBuffer);
@ -138,18 +138,18 @@ internal static class YuvConversion
        {
            for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
            {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
            }
        }
        else
        {
            for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
            {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
            }
        }

@ -161,18 +161,18 @@ internal static class YuvConversion
            Span<byte> tmpBottomDst = tmpTopDst[(4 * 32)..];
            Span<byte> tmpTop = tmpBottomDst[(4 * 32)..];
            Span<byte> tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
-            UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
-            UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
+            UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
+            UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);

            topY[pos..len].CopyTo(tmpTop);
            if (!bottomY.IsEmpty)
            {
                bottomY[pos..len].CopyTo(tmpBottom);
-                ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
            }
            else
            {
-                ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
            }

            tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@ -184,7 +184,7 @@ internal static class YuvConversion
    }

    // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
-    private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
+    private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span<byte> output)
    {
        // Load inputs.
        Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
@ -192,28 +192,28 @@ internal static class YuvConversion
        Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
        Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));

-        Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
-        Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
-        Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
+        Vector128<byte> s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
+        Vector128<byte> t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
+        Vector128<byte> st = s ^ t; // st = s^t

-        Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
-        Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
+        Vector128<byte> ad = a ^ d; // ad = a^d
+        Vector128<byte> bc = b ^ c; // bc = b^c

-        Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
-        Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
-        Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
-        Vector128<byte> t4 = Sse2.Average(s, t);
-        Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+        Vector128<byte> t1 = ad | bc; // (a^d) | (b^c)
+        Vector128<byte> t2 = t1 | st; // (a^d) | (b^c) | (s^t)
+        Vector128<byte> t3 = t2 & Vector128.Create((byte)1); // ((a^d) | (b^c) | (s^t)) & 1
+        Vector128<byte> t4 = Vector128_.Average(s, t);
+        Vector128<byte> k = t4 - t3; // k = (a + b + c + d) / 4

-        Vector128<byte> diag1 = GetM(k, st, bc, t);
-        Vector128<byte> diag2 = GetM(k, st, ad, s);
+        Vector128<byte> diag1 = GetMVector128(k, st, bc, t);
+        Vector128<byte> diag2 = GetMVector128(k, st, ad, s);

        // Pack the alternate pixels.
-        PackAndStore(a, b, diag1, diag2, output); // store top.
-        PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
+        PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
+        PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
    }

-    private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
+    private static void UpSampleLastBlockVector128(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
    {
        Span<byte> r1 = stackalloc byte[17];
        Span<byte> r2 = stackalloc byte[17];
@ -230,27 +230,27 @@ internal static class YuvConversion

        ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
        ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
-        UpSample32Pixels(ref r1Ref, ref r2Ref, output);
+        UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
    }

    // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
-    private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
+    private static Vector128<byte> GetMVector128(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
    {
-        Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
-        Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
-        Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
-        Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
-        Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
+        Vector128<byte> tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
+        Vector128<byte> tmp1 = ij & st; // (ij) & (s^t)
+        Vector128<byte> tmp2 = k ^ input; // (k^in)
+        Vector128<byte> tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
+        Vector128<byte> tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction

-        return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
+        return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
    }

-    private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
+    private static void PackAndStoreVector128(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
    {
-        Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
-        Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
-        Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
-        Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
+        Vector128<byte> ta = Vector128_.Average(a, da); // (9a + 3b + 3c +  d + 8) / 16
+        Vector128<byte> tb = Vector128_.Average(b, db); // (3a + 9b +  c + 3d + 8) / 16
+        Vector128<byte> t1 = Vector128_.UnpackLow(ta, tb);
+        Vector128<byte> t2 = Vector128_.UnpackHigh(ta, tb);

        ref byte output0Ref = ref MemoryMarshal.GetReference(output);
        ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@ -562,41 +562,42 @@ internal static class YuvConversion
    }

    [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
+    private static void ConvertYuvToBgrVector128(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);

    [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+    private static void ConvertYuvToBgrWithBottomYVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
    {
-        YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
-        YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
+        YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
+        YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
    }

-    private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
+    private static void YuvToBgrVector128(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
    {
        ref byte yRef = ref MemoryMarshal.GetReference(y);
        ref byte uRef = ref MemoryMarshal.GetReference(u);
        ref byte vRef = ref MemoryMarshal.GetReference(v);
-        ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
+        ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);

        // Cast to 8b and store as BBBBGGGGRRRR.
-        Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
-        Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
-        Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
-        Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
-        Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
-        Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
+        Vector128<byte> bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
+        Vector128<byte> bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
+        Vector128<byte> bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
+        Vector128<byte> bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
+        Vector128<byte> bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
+        Vector128<byte> bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);

        // Pack as BGRBGRBGRBGR.
-        PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
+        PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
    }

    // Pack the planar buffers
    // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-    private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
+    private static void PlanarTo24bVector128(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
    {
        // The input is 6 registers of sixteen 8b but for the sake of explanation,
        // let's take 6 registers of four 8b values.
@ -612,7 +613,7 @@ internal static class YuvConversion
        //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7

        // Process R.
-        ChannelMixing(
+        ChannelMixingVector128(
            input0,
            input1,
            Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5),        // PlanarTo24Shuffle0
@ -627,7 +628,7 @@ internal static class YuvConversion

        // Process G.
        // Same as before, just shifted to the left by one and including the right padding.
-        ChannelMixing(
+        ChannelMixingVector128(
            input2,
            input3,
            Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255),      // PlanarTo24Shuffle3
@ -641,7 +642,7 @@ internal static class YuvConversion
            out Vector128<byte> g5);

        // Process B.
-        ChannelMixing(
+        ChannelMixingVector128(
            input4,
            input5,
            Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255),     // PlanarTo24Shuffle6
@ -655,24 +656,24 @@ internal static class YuvConversion
            out Vector128<byte> b5);

        // OR the different channels.
-        Vector128<byte> rg0 = Sse2.Or(r0, g0);
-        Vector128<byte> rg1 = Sse2.Or(r1, g1);
-        Vector128<byte> rg2 = Sse2.Or(r2, g2);
-        Vector128<byte> rg3 = Sse2.Or(r3, g3);
-        Vector128<byte> rg4 = Sse2.Or(r4, g4);
-        Vector128<byte> rg5 = Sse2.Or(r5, g5);
+        Vector128<byte> rg0 = r0 | g0;
+        Vector128<byte> rg1 = r1 | g1;
+        Vector128<byte> rg2 = r2 | g2;
+        Vector128<byte> rg3 = r3 | g3;
+        Vector128<byte> rg4 = r4 | g4;
+        Vector128<byte> rg5 = r5 | g5;

        ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
-        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = rg0 | b0;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
    }

    // Shuffles the input buffer as A0 0 0 A1 0 0 A2
-    private static void ChannelMixing(
+    private static void ChannelMixingVector128(
        Vector128<byte> input0,
        Vector128<byte> input1,
        Vector128<byte> shuffle0,
@ -685,53 +686,53 @@ internal static class YuvConversion
        out Vector128<byte> output4,
        out Vector128<byte> output5)
    {
-        output0 = Ssse3.Shuffle(input0, shuffle0);
-        output1 = Ssse3.Shuffle(input0, shuffle1);
-        output2 = Ssse3.Shuffle(input0, shuffle2);
-        output3 = Ssse3.Shuffle(input1, shuffle0);
-        output4 = Ssse3.Shuffle(input1, shuffle1);
-        output5 = Ssse3.Shuffle(input1, shuffle2);
+        output0 = Vector128_.ShuffleNative(input0, shuffle0);
+        output1 = Vector128_.ShuffleNative(input0, shuffle1);
+        output2 = Vector128_.ShuffleNative(input0, shuffle2);
+        output3 = Vector128_.ShuffleNative(input1, shuffle0);
+        output4 = Vector128_.ShuffleNative(input1, shuffle1);
+        output5 = Vector128_.ShuffleNative(input1, shuffle2);
    }

    // Convert 32 samples of YUV444 to B/G/R
-    private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
+    private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
    {
        // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
        Vector128<byte> y0 = Unsafe.As<byte, Vector128<byte>>(ref y);
        Vector128<byte> u0 = Unsafe.As<byte, Vector128<byte>>(ref u);
        Vector128<byte> v0 = Unsafe.As<byte, Vector128<byte>>(ref v);
-        y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
-        u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
-        v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
+        y0 = Vector128_.UnpackLow(Vector128<byte>.Zero, y0);
+        u0 = Vector128_.UnpackLow(Vector128<byte>.Zero, u0);
+        v0 = Vector128_.UnpackLow(Vector128<byte>.Zero, v0);

        // These constants are 14b fixed-point version of ITU-R BT.601 constants.
        // R = (19077 * y             + 26149 * v - 14234) >> 6
        // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
        // B = (19077 * y + 33050 * u             - 17685) >> 6
-        var k19077 = Vector128.Create((ushort)19077);
-        var k26149 = Vector128.Create((ushort)26149);
-        var k14234 = Vector128.Create((ushort)14234);
+        Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
+        Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
+        Vector128<ushort> k14234 = Vector128.Create((ushort)14234);

-        Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
-        Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
-        Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
-        Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
+        Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
+        Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
+        Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
+        Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));

-        Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
-        Vector128<ushort> r2 = Sse2.Add(r1, r0);
+        Vector128<ushort> r1 = y1.AsUInt16() - k14234;
+        Vector128<ushort> r2 = r1 + r0;

-        Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
-        Vector128<ushort> g3 = Sse2.Add(g0, g1);
-        Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+        Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
+        Vector128<ushort> g3 = g0 + g1;
+        Vector128<ushort> g4 = g2 - g3;

-        Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
-        Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
-        Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
+        Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
+        Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
+        Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));

        // Use logical shift for B2, which can be larger than 32767.
-        r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
-        g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
-        b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
+        r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+        g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+        b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
    }

    [MethodImpl(InliningOptions.ShortMethod)]
--- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
+++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
@ -3,7 +3,7 @@

 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.PixelFormats;

 namespace SixLabors.ImageSharp.Formats.Webp;
@ -20,7 +20,7 @@ internal static class WebpCommonUtils
    /// <returns>Returns true if alpha has non-0xff values.</returns>
    public static unsafe bool CheckNonOpaque(ReadOnlySpan<Bgra32> row)
    {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
        {
            ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
            int i = 0;
@ -32,20 +32,20 @@ internal static class WebpCommonUtils

                for (; i + 128 <= length; i += 128)
                {
-                    Vector256<byte> a0 = Avx.LoadVector256(src + i).AsByte();
-                    Vector256<byte> a1 = Avx.LoadVector256(src + i + 32).AsByte();
-                    Vector256<byte> a2 = Avx.LoadVector256(src + i + 64).AsByte();
-                    Vector256<byte> a3 = Avx.LoadVector256(src + i + 96).AsByte();
-                    Vector256<int> b0 = Avx2.And(a0, alphaMaskVector256).AsInt32();
-                    Vector256<int> b1 = Avx2.And(a1, alphaMaskVector256).AsInt32();
-                    Vector256<int> b2 = Avx2.And(a2, alphaMaskVector256).AsInt32();
-                    Vector256<int> b3 = Avx2.And(a3, alphaMaskVector256).AsInt32();
-                    Vector256<short> c0 = Avx2.PackSignedSaturate(b0, b1).AsInt16();
-                    Vector256<short> c1 = Avx2.PackSignedSaturate(b2, b3).AsInt16();
-                    Vector256<byte> d = Avx2.PackSignedSaturate(c0, c1).AsByte();
-                    Vector256<byte> bits = Avx2.CompareEqual(d, all0x80Vector256);
-                    int mask = Avx2.MoveMask(bits);
-                    if (mask != -1)
+                    Vector256<byte> a0 = Vector256.Load(src + i).AsByte();
+                    Vector256<byte> a1 = Vector256.Load(src + i + 32).AsByte();
+                    Vector256<byte> a2 = Vector256.Load(src + i + 64).AsByte();
+                    Vector256<byte> a3 = Vector256.Load(src + i + 96).AsByte();
+                    Vector256<int> b0 = (a0 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b1 = (a1 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b2 = (a2 & alphaMaskVector256).AsInt32();
+                    Vector256<int> b3 = (a3 & alphaMaskVector256).AsInt32();
+                    Vector256<short> c0 = Vector256_.PackSignedSaturate(b0, b1).AsInt16();
+                    Vector256<short> c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16();
+                    Vector256<byte> d = Vector256_.PackSignedSaturate(c0, c1).AsByte();
+                    Vector256<byte> bits = Vector256.Equals(d, all0x80Vector256);
+                    uint mask = bits.ExtractMostSignificantBits();
+                    if (mask != 0xFFFF_FFFF)
                    {
                        return true;
                    }
@ -53,7 +53,7 @@ internal static class WebpCommonUtils

                for (; i + 64 <= length; i += 64)
                {
-                    if (IsNoneOpaque64Bytes(src, i))
+                    if (IsNoneOpaque64BytesVector128(src, i))
                    {
                        return true;
                    }
@ -61,7 +61,7 @@ internal static class WebpCommonUtils

                for (; i + 32 <= length; i += 32)
                {
-                    if (IsNoneOpaque32Bytes(src, i))
+                    if (IsNonOpaque32BytesVector128(src, i))
                    {
                        return true;
                    }
@ -76,7 +76,7 @@ internal static class WebpCommonUtils
                }
            }
        }
-        else if (Sse2.IsSupported)
+        else if (Vector128.IsHardwareAccelerated)
        {
            ReadOnlySpan<byte> rowBytes = MemoryMarshal.AsBytes(row);
            int i = 0;
@ -85,7 +85,7 @@ internal static class WebpCommonUtils
            {
                for (; i + 64 <= length; i += 64)
                {
-                    if (IsNoneOpaque64Bytes(src, i))
+                    if (IsNoneOpaque64BytesVector128(src, i))
                    {
                        return true;
                    }
@ -93,7 +93,7 @@ internal static class WebpCommonUtils

                for (; i + 32 <= length; i += 32)
                {
-                    if (IsNoneOpaque32Bytes(src, i))
+                    if (IsNonOpaque32BytesVector128(src, i))
                    {
                        return true;
                    }
@ -122,38 +122,38 @@ internal static class WebpCommonUtils
        return false;
    }

-    private static unsafe bool IsNoneOpaque64Bytes(byte* src, int i)
+    private static unsafe bool IsNoneOpaque64BytesVector128(byte* src, int i)
    {
        Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);

-        Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
-        Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-        Vector128<byte> a2 = Sse2.LoadVector128(src + i + 32).AsByte();
-        Vector128<byte> a3 = Sse2.LoadVector128(src + i + 48).AsByte();
-        Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-        Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-        Vector128<int> b2 = Sse2.And(a2, alphaMask).AsInt32();
-        Vector128<int> b3 = Sse2.And(a3, alphaMask).AsInt32();
-        Vector128<short> c0 = Sse2.PackSignedSaturate(b0, b1).AsInt16();
-        Vector128<short> c1 = Sse2.PackSignedSaturate(b2, b3).AsInt16();
-        Vector128<byte> d = Sse2.PackSignedSaturate(c0, c1).AsByte();
-        Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Sse2.MoveMask(bits);
+        Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
+        Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
+        Vector128<byte> a2 = Vector128.Load(src + i + 32).AsByte();
+        Vector128<byte> a3 = Vector128.Load(src + i + 48).AsByte();
+        Vector128<int> b0 = (a0 & alphaMask).AsInt32();
+        Vector128<int> b1 = (a1 & alphaMask).AsInt32();
+        Vector128<int> b2 = (a2 & alphaMask).AsInt32();
+        Vector128<int> b3 = (a3 & alphaMask).AsInt32();
+        Vector128<short> c0 = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
+        Vector128<short> c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16();
+        Vector128<byte> d = Vector128_.PackSignedSaturate(c0, c1).AsByte();
+        Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
+        uint mask = bits.ExtractMostSignificantBits();
        return mask != 0xFFFF;
    }

-    private static unsafe bool IsNoneOpaque32Bytes(byte* src, int i)
+    private static unsafe bool IsNonOpaque32BytesVector128(byte* src, int i)
    {
        Vector128<byte> alphaMask = Vector128.Create(0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255);

-        Vector128<byte> a0 = Sse2.LoadVector128(src + i).AsByte();
-        Vector128<byte> a1 = Sse2.LoadVector128(src + i + 16).AsByte();
-        Vector128<int> b0 = Sse2.And(a0, alphaMask).AsInt32();
-        Vector128<int> b1 = Sse2.And(a1, alphaMask).AsInt32();
-        Vector128<short> c = Sse2.PackSignedSaturate(b0, b1).AsInt16();
-        Vector128<byte> d = Sse2.PackSignedSaturate(c, c).AsByte();
-        Vector128<byte> bits = Sse2.CompareEqual(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Sse2.MoveMask(bits);
+        Vector128<byte> a0 = Vector128.Load(src + i).AsByte();
+        Vector128<byte> a1 = Vector128.Load(src + i + 16).AsByte();
+        Vector128<int> b0 = (a0 & alphaMask).AsInt32();
+        Vector128<int> b1 = (a1 & alphaMask).AsInt32();
+        Vector128<short> c = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
+        Vector128<byte> d = Vector128_.PackSignedSaturate(c, c).AsByte();
+        Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
+        uint mask = bits.ExtractMostSignificantBits();
        return mask != 0xFFFF;
    }
 }
--- a/src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs
+++ b/src/ImageSharp/Metadata/Profiles/IPTC/IptcProfile.cs
@ -3,7 +3,6 @@

 using System.Buffers.Binary;
 using System.Collections.ObjectModel;
-using System.Diagnostics.CodeAnalysis;
 using System.Globalization;
 using System.Text;
 using SixLabors.ImageSharp.Metadata.Profiles.IPTC;
--- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
+++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs
@ -210,6 +210,8 @@ public partial class PixelOperations<TPixel>
    {
        GuardUnpackIntoRgbPlanes(redChannel, greenChannel, blueChannel, source);

+        // TODO: This can be much faster.
+        // Convert to Rgba32 first using pixel operations then use the R, G, B properties.
        int count = source.Length;

        ref float r = ref MemoryMarshal.GetReference(redChannel);
--- a/tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Tiff/TiffDecoderTests.cs
@ -341,16 +341,46 @@ public class TiffDecoderTests : TiffDecoderBaseTester
    [Theory]
    [WithFile(Cmyk, PixelTypes.Rgba32)]
    [WithFile(CmykLzwPredictor, PixelTypes.Rgba32)]
+    [WithFile(CmykJpeg, PixelTypes.Rgba32)]
    public void TiffDecoder_CanDecode_Cmyk<TPixel>(TestImageProvider<TPixel> provider)
        where TPixel : unmanaged, IPixel<TPixel>
    {
        // Note: The image from MagickReferenceDecoder does not look right. Maybe we are doing something wrong
        // when converting the CMYK pixel data from Magick.NET to our format?
-        using Image<TPixel> image = provider.GetImage();
+        using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance);
        image.DebugSave(provider);
        image.CompareToReferenceOutput(ImageComparer.Exact, provider);
    }

+    [Theory]
+    [WithFile(Issues2454_A, PixelTypes.Rgba32)]
+    [WithFile(Issues2454_B, PixelTypes.Rgba32)]
+    public void TiffDecoder_CanDecode_YccK<TPixel>(TestImageProvider<TPixel> provider)
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance);
+        image.DebugSave(provider);
+        image.CompareToReferenceOutput(ImageComparer.Exact, provider);
+    }
+
+    [Theory]
+    [WithFile(Issues2454_A, PixelTypes.Rgba32)]
+    [WithFile(Issues2454_B, PixelTypes.Rgba32)]
+    public void TiffDecoder_CanDecode_YccK_ICC<TPixel>(TestImageProvider<TPixel> provider)
+        where TPixel : unmanaged, IPixel<TPixel>
+    {
+        DecoderOptions options = new()
+        {
+            ColorProfileHandling = ColorProfileHandling.Convert,
+        };
+
+        using Image<TPixel> image = provider.GetImage(TiffDecoder.Instance, options);
+        image.DebugSave(provider);
+
+        // Linux reports a 0.0000% difference (presumably a tiny non-zero delta that rounds to zero),
+        // which is not an exact match, so we use a tolerant comparer here.
+        image.CompareToReferenceOutput(ImageComparer.TolerantPercentage(0.0001F), provider);
+    }
+
    [Theory]
    [WithFile(FlowerRgb101010Contiguous, PixelTypes.Rgba32)]
    [WithFile(FlowerRgb101010Planar, PixelTypes.Rgba32)]
--- a/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Formats/WebP/ColorSpaceTransformUtilsTests.cs
@ -71,17 +71,17 @@ public class ColorSpaceTransformUtilsTests
    public void CollectColorBlueTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.AllowAll);

    [Fact]
-    public void CollectColorBlueTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);
+    public void CollectColorBlueTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableSSE41);

    [Fact]
-    public void CollectColorBlueTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);
+    public void CollectColorBlueTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorBlueTransformsTest, HwIntrinsics.DisableAVX2);

    [Fact]
    public void CollectColorRedTransforms_WithHardwareIntrinsics_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.AllowAll);

    [Fact]
-    public void CollectColorRedTransforms_WithoutSSE41_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);
+    public void CollectColorRedTransforms_WithoutVector128_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableSSE41);

    [Fact]
-    public void CollectColorRedTransforms_WithoutAvx2_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
+    public void CollectColorRedTransforms_WithoutVector256_Works() => FeatureTestRunner.RunWithHwIntrinsicsFeature(RunCollectColorRedTransformsTest, HwIntrinsics.DisableAVX2);
 }
--- a/tests/ImageSharp.Tests/TestImages.cs
+++ b/tests/ImageSharp.Tests/TestImages.cs
@ -1122,6 +1122,7 @@ public static class TestImages
        public const string Cmyk = "Tiff/Cmyk.tiff";
        public const string Cmyk64BitDeflate = "Tiff/cmyk_deflate_64bit.tiff";
        public const string CmykLzwPredictor = "Tiff/Cmyk-lzw-predictor.tiff";
+        public const string CmykJpeg = "Tiff/Cmyk-jpeg.tiff";

        public const string Issues1716Rgb161616BitLittleEndian = "Tiff/Issues/Issue1716.tiff";
        public const string Issues1891 = "Tiff/Issues/Issue1891.tiff";
@ -1129,6 +1130,8 @@ public static class TestImages
        public const string Issues2149 = "Tiff/Issues/Group4CompressionWithStrips.tiff";
        public const string Issues2255 = "Tiff/Issues/Issue2255.png";
        public const string Issues2435 = "Tiff/Issues/Issue2435.tiff";
+        public const string Issues2454_A = "Tiff/Issues/Issue2454_A.tif";
+        public const string Issues2454_B = "Tiff/Issues/Issue2454_B.tif";
        public const string Issues2587 = "Tiff/Issues/Issue2587.tiff";
        public const string Issues2679 = "Tiff/Issues/Issue2679.tiff";
        public const string JpegCompressedGray0000539558 = "Tiff/Issues/JpegCompressedGray-0000539558.tiff";
--- a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_Cmyk_Rgba32_Cmyk-jpeg.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f68db78d765a7f36570cd7b57a1f06cfca24c3b4916d0692a4aa051209ec327
+size 616
--- a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_A.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c4f77673028643af0ac02a8f6a1e2db14052177e3401c369391a8ff7e943770c
+size 7679254
--- a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_ICC_Rgba32_Issue2454_B.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e616895c21fd8b19a216e8a3ef4968bd413589b5875efdac29860f019a710527
+size 7517284
--- a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_A.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d7911e059049c427229136479740fd62e2e09907549ec3e1421a6a60da6167cc
+size 7840892
--- a/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png
+++ b/tests/Images/External/ReferenceOutput/TiffDecoderTests/TiffDecoder_CanDecode_YccK_Rgba32_Issue2454_B.png
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:291f2033a7b4cfc10fb3301283c167b3fbc288bc173c95b21bc726bf076865af
+size 7649213
--- a/tests/Images/Input/Tiff/Cmyk-jpeg.tiff
+++ b/tests/Images/Input/Tiff/Cmyk-jpeg.tiff
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb923e457acc31a7f18c46a7d58fc5a42f5c3d197236403921e3ee623fa4fac
+size 2046
--- a/tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff
+++ b/tests/Images/Input/Tiff/Cmyk-planar-jpg.tiff
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:abb923e457acc31a7f18c46a7d58fc5a42f5c3d197236403921e3ee623fa4fac
+size 2046
--- a/tests/Images/Input/Tiff/Issues/Issue2454_A.tif
+++ b/tests/Images/Input/Tiff/Issues/Issue2454_A.tif
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:868fbf7fc7a61bc6b1226160c8dc3bb1faebd8d4a2a6fe9494962f3fbe3a7fdc
+size 5024256
--- a/tests/Images/Input/Tiff/Issues/Issue2454_B.tif
+++ b/tests/Images/Input/Tiff/Issues/Issue2454_B.tif
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:867851192f540742ba1481f503834f8aa77caa03ac59f8204d098bf940b0bb3a
+size 4387646