Browse Source

Optimize and cleanup ByteToNormalizedFloatReduce

pull/2654/head
James Jackson-South 2 years ago
parent
commit
c6758df08b
  1. 20
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 77
      src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs
  3. 38
      src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
  4. 83
      src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
  5. 190
      src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
  6. 97
      src/ImageSharp/Common/Helpers/SimdUtils.cs
  7. 13
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  8. 42
      src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs
  9. 75
      src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs
  10. 25
      tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs
  11. 4
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs
  12. 4
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs
  13. 63
      tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs
  14. 2
      tests/ImageSharp.Benchmarks/LoadResizeSave/README.md
  15. 36
      tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

20
src/ImageSharp/Common/Helpers/Numerics.cs

@ -1069,4 +1069,24 @@ internal static class Numerics
public static nuint Vector256Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector256<TVector>.Count;
/// <summary>
/// Gets the count of vectors that safely fit into the given span.
/// </summary>
/// <typeparam name="TVector">The type of the vector.</typeparam>
/// <param name="span">The given span.</param>
/// <returns>Count of vectors that safely fit into the span.</returns>
public static nuint Vector512Count<TVector>(this Span<float> span)
where TVector : struct
=> (uint)span.Length / (uint)Vector512<TVector>.Count;
/// <summary>
/// Gets the count of vectors that safely fit into length.
/// </summary>
/// <typeparam name="TVector">The type of the vector.</typeparam>
/// <param name="length">The given length.</param>
/// <returns>Count of vectors that safely fit into the length.</returns>
public static nuint Vector512Count<TVector>(int length)
where TVector : struct
=> (uint)length / (uint)Vector512<TVector>.Count;
}

77
src/ImageSharp/Common/Helpers/SimdUtils.Convert.cs

@ -0,0 +1,77 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="destination">The destination span of floats</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> destination)
{
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref destination);
if (source.Length > 0)
{
ConvertByteToNormalizedFloatRemainder(source, destination);
}
}
/// <summary>
/// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'destination' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="destination">The destination span of bytes</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> destination)
{
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination);
if (source.Length > 0)
{
ConvertNormalizedFloatToByteRemainder(source, destination);
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> destination)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref float dBase = ref MemoryMarshal.GetReference(destination);
for (int i = 0; i < source.Length; i++)
{
Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> destination)
{
ref float sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(destination);
for (int i = 0; i < source.Length; i++)
{
Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i));
}
}
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static byte ConvertToByte(float f) => (byte)Numerics.Clamp((f * 255F) + 0.5F, 0, 255F);
}

38
src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs

@ -1,38 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
/// Only accelerated on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+, .NET 4.7.2+)
/// See:
/// https://github.com/dotnet/coreclr/pull/10662
/// API Proposal:
/// https://github.com/dotnet/corefx/issues/15957
/// </summary>
public static class ExtendedIntrinsics
{
public static bool IsAvailable { get; } = Vector.IsHardwareAccelerated;
/// <summary>
/// Widen and convert a vector of <see cref="short"/> values into 2 vectors of <see cref="float"/>-s.
/// </summary>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ConvertToSingle(
Vector<short> source,
out Vector<float> dest1,
out Vector<float> dest2)
{
Vector.Widen(source, out Vector<int> i1, out Vector<int> i2);
dest1 = Vector.ConvertToSingle(i1);
dest2 = Vector.ConvertToSingle(i2);
}
}
}

83
src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs

@ -1,83 +0,0 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// ReSharper disable MemberHidesStaticFromOuterClass
namespace SixLabors.ImageSharp;
internal static partial class SimdUtils
{
/// <summary>
/// Fallback implementation based on <see cref="Vector4"/> (128bit).
/// For <see cref="Vector4"/>, efficient software fallback implementations are present,
/// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P
/// </summary>
public static class FallbackIntrinsics128
{
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
int remainder = Numerics.Modulo4(source.Length);
int adjustedCount = source.Length - remainder;
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
source = source[adjustedCount..];
dest = dest[adjustedCount..];
}
}
/// <summary>
/// Implementation of <see cref="SimdUtils.ByteToNormalizedFloat"/> using <see cref="Vector4"/>.
/// </summary>
[MethodImpl(InliningOptions.ColdPath)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
DebugVerifySpanInput(source, dest, 4);
uint count = (uint)dest.Length / 4;
if (count == 0)
{
return;
}
ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));
const float scale = 1f / 255f;
Vector4 d = default;
for (nuint i = 0; i < count; i++)
{
ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
d.X = s.X;
d.Y = s.Y;
d.Z = s.Z;
d.W = s.W;
d *= scale;
Unsafe.Add(ref dBase, i) = d;
}
}
[StructLayout(LayoutKind.Sequential)]
private struct ByteVector4
{
public byte X;
public byte Y;
public byte Z;
public byte W;
}
}
}

190
src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs

@ -752,17 +752,23 @@ internal static partial class SimdUtils
/// <summary>
/// <see cref="ByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloatReduce(
ref ReadOnlySpan<byte> source,
ref Span<float> dest)
ref Span<float> destination)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
if (Avx2.IsSupported || Sse2.IsSupported)
{
int remainder;
if (Avx2.IsSupported)
if (Vector512.IsHardwareAccelerated && Avx512F.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count);
}
else if (Avx2.IsSupported)
{
remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count);
}
@ -775,10 +781,10 @@ internal static partial class SimdUtils
if (adjustedCount > 0)
{
ByteToNormalizedFloat(source[..adjustedCount], dest[..adjustedCount]);
ByteToNormalizedFloat(source[..adjustedCount], destination[..adjustedCount]);
source = source[adjustedCount..];
dest = dest[adjustedCount..];
destination = destination[adjustedCount..];
}
}
}
@ -786,97 +792,127 @@ internal static partial class SimdUtils
/// <summary>
/// Implementation <see cref="SimdUtils.ByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
/// </summary>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <remarks>
/// Implementation is based on MagicScaler code:
/// https://github.com/saucecontrol/PhotoSauce/blob/b5811908041200488aa18fdfd17df5fc457415dc/src/MagicScaler/Magic/Processors/ConvertersFloat.cs#L80-L182
/// </remarks>
internal static unsafe void ByteToNormalizedFloat(
ReadOnlySpan<byte> source,
Span<float> dest)
Span<float> destination)
{
fixed (byte* sourceBase = source)
if (Avx512F.IsSupported)
{
if (Avx2.IsSupported)
{
DebugVerifySpanInput(source, dest, Vector256<byte>.Count);
nuint n = dest.Vector256Count<byte>();
DebugVerifySpanInput(source, destination, Vector512<byte>.Count);
ref Vector256<float> destBase =
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
nuint n = destination.Vector512Count<byte>();
Vector256<float> scale = Vector256.Create(1 / (float)byte.MaxValue);
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination));
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector256<byte>.Count * i;
Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si);
Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count);
Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2));
Vector256<int> i3 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 3));
Vector256<float> f0 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i0));
Vector256<float> f1 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i1));
Vector256<float> f2 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i2));
Vector256<float> f3 = Avx.Multiply(scale, Avx.ConvertToVector256Single(i3));
ref Vector256<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector512<byte>.Count * i;
Vector512<int> i0 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si));
Vector512<int> i1 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector512<int>.Count));
Vector512<int> i2 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 2)));
Vector512<int> i3 = Avx512F.ConvertToVector512Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector512<int>.Count * 3)));
// Declare multiplier on each line. Codegen is better.
Vector512<float> f0 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i0);
Vector512<float> f1 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i1);
Vector512<float> f2 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i2);
Vector512<float> f3 = Vector512.Create(1 / (float)byte.MaxValue) * Avx512F.ConvertToVector512Single(i3);
ref Vector512<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
else
}
else if (Avx2.IsSupported)
{
DebugVerifySpanInput(source, destination, Vector256<byte>.Count);
nuint n = destination.Vector256Count<byte>();
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination));
for (nuint i = 0; i < n; i++)
{
// Sse
DebugVerifySpanInput(source, dest, Vector128<byte>.Count);
nuint si = (uint)Vector256<byte>.Count * i;
Vector256<int> i0 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si));
Vector256<int> i1 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)Vector256<int>.Count));
Vector256<int> i2 = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sourceBase, si + (nuint)(Vector256<int>.Count * 2)));
// Ensure overreads past 16 byte boundary do not happen in debug due to lack of containment.
ref ulong refULong = ref Unsafe.As<byte, ulong>(ref Unsafe.Add(ref sourceBase, si));
Vector256<int> i3 = Avx2.ConvertToVector256Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refULong, 3)).AsByte());
// Declare multiplier on each line. Codegen is better.
Vector256<float> f0 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i0);
Vector256<float> f1 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i1);
Vector256<float> f2 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i2);
Vector256<float> f3 = Vector256.Create(1 / (float)byte.MaxValue) * Avx.ConvertToVector256Single(i3);
ref Vector256<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
else if (Sse2.IsSupported || AdvSimd.IsSupported)
{
DebugVerifySpanInput(source, destination, Vector128<byte>.Count);
nuint n = dest.Vector128Count<byte>();
nuint n = destination.Vector128Count<byte>();
ref byte sourceBase = ref MemoryMarshal.GetReference(source);
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination));
ref Vector128<float> destBase =
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
Vector128<byte> zero = Vector128<byte>.Zero;
Vector128<float> scale = Vector128.Create(1 / (float)byte.MaxValue);
Vector128<byte> zero = Vector128<byte>.Zero;
for (nuint i = 0; i < n; i++)
{
nuint si = (uint)Vector128<byte>.Count * i;
for (nuint i = 0; i < n; i++)
Vector128<int> i0, i1, i2, i3;
if (Sse41.IsSupported)
{
nuint si = (uint)Vector128<byte>.Count * i;
Vector128<int> i0, i1, i2, i3;
if (Sse41.IsSupported)
{
i0 = Sse41.ConvertToVector128Int32(sourceBase + si);
i1 = Sse41.ConvertToVector128Int32(sourceBase + si + Vector128<int>.Count);
i2 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 2));
i3 = Sse41.ConvertToVector128Int32(sourceBase + si + (Vector128<int>.Count * 3));
}
else
{
Vector128<byte> b = Sse2.LoadVector128(sourceBase + si);
Vector128<short> s0 = Sse2.UnpackLow(b, zero).AsInt16();
Vector128<short> s1 = Sse2.UnpackHigh(b, zero).AsInt16();
i0 = Sse2.UnpackLow(s0, zero.AsInt16()).AsInt32();
i1 = Sse2.UnpackHigh(s0, zero.AsInt16()).AsInt32();
i2 = Sse2.UnpackLow(s1, zero.AsInt16()).AsInt32();
i3 = Sse2.UnpackHigh(s1, zero.AsInt16()).AsInt32();
}
Vector128<float> f0 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i0));
Vector128<float> f1 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i1));
Vector128<float> f2 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i2));
Vector128<float> f3 = Sse.Multiply(scale, Sse2.ConvertToVector128Single(i3));
ref Vector128<float> d = ref Unsafe.Add(ref destBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
ref int refInt = ref Unsafe.As<byte, int>(ref Unsafe.Add(ref sourceBase, si));
i0 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(refInt).AsByte());
i1 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 1)).AsByte());
i2 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 2)).AsByte());
i3 = Sse41.ConvertToVector128Int32(Vector128.CreateScalarUnsafe(Unsafe.Add(ref refInt, 3)).AsByte());
}
else
{
// Sse2, AdvSimd
Vector128<byte> b = Vector128.LoadUnsafe(ref sourceBase, si);
(Vector128<ushort> s0, Vector128<ushort> s1) = Vector128.Widen(b);
(i0, i1) = Vector128.Widen(s0.AsInt16());
(i2, i3) = Vector128.Widen(s1.AsInt16());
}
Vector128<float> f0 = scale * Vector128.ConvertToSingle(i0);
Vector128<float> f1 = scale * Vector128.ConvertToSingle(i1);
Vector128<float> f2 = scale * Vector128.ConvertToSingle(i2);
Vector128<float> f3 = scale * Vector128.ConvertToSingle(i3);
ref Vector128<float> d = ref Unsafe.Add(ref destinationBase, i * 4);
d = f0;
Unsafe.Add(ref d, 1) = f1;
Unsafe.Add(ref d, 2) = f2;
Unsafe.Add(ref d, 3) = f3;
}
}
}

97
src/ImageSharp/Common/Helpers/SimdUtils.cs

@ -22,13 +22,6 @@ internal static partial class SimdUtils
public static bool HasVector8 { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
/// <summary>
/// Gets a value indicating whether <see cref="Vector{T}"/> code is being JIT-ed to SSE instructions
/// where float and integer registers are of size 128 byte.
/// </summary>
public static bool HasVector4 { get; } =
Vector.IsHardwareAccelerated && Vector<float>.Count == 4;
/// <summary>
/// Transform all scalars in 'v' in a way that converting them to <see cref="int"/> would have rounding semantics.
/// </summary>
@ -69,96 +62,6 @@ internal static partial class SimdUtils
}
}
/// <summary>
/// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
/// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of bytes</param>
/// <param name="dest">The destination span of floats</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void ByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
{
DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.ByteToNormalizedFloatReduce(ref source, ref dest);
// Also deals with the remainder from previous conversions:
FallbackIntrinsics128.ByteToNormalizedFloatReduce(ref source, ref dest);
// Deal with the remainder:
if (source.Length > 0)
{
ConvertByteToNormalizedFloatRemainder(source, dest);
}
}
/// <summary>
/// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'destination' buffer of <see cref="byte"/>.
/// The values are scaled up into [0-255] and rounded, overflows are clamped.
/// <paramref name="source"/> should be of the same size as <paramref name="destination"/>,
/// but there are no restrictions on the span's length.
/// </summary>
/// <param name="source">The source span of floats</param>
/// <param name="destination">The destination span of bytes</param>
[MethodImpl(InliningOptions.ShortMethod)]
internal static void NormalizedFloatToByteSaturate(ReadOnlySpan<float> source, Span<byte> destination)
{
DebugGuard.IsTrue(source.Length == destination.Length, nameof(source), "Input spans must be of same length!");
HwIntrinsics.NormalizedFloatToByteSaturateReduce(ref source, ref destination);
// Deal with the remainder:
if (source.Length > 0)
{
ConvertNormalizedFloatToByteRemainder(source, destination);
}
}
[MethodImpl(InliningOptions.ColdPath)]
private static void ConvertByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> destination)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref float dBase = ref MemoryMarshal.GetReference(destination);
// There are at most 3 elements at this point, having a for loop is overkill.
// Let's minimize the no. of instructions!
switch (source.Length)
{
case 3:
Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2) / 255f;
goto case 2;
case 2:
Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1) / 255f;
goto case 1;
case 1:
dBase = sBase / 255f;
break;
}
}
[MethodImpl(MethodImplOptions.NoInlining)]
private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> destination)
{
ref float sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(destination);
for (int i = 0; i < source.Length; i++)
{
Unsafe.Add(ref dBase, i) = ConvertToByte(Unsafe.Add(ref sBase, i));
}
}
[MethodImpl(InliningOptions.ShortMethod)]
private static byte ConvertToByte(float f) => (byte)Numerics.Clamp((f * 255F) + 0.5F, 0, 255F);
[Conditional("DEBUG")]
private static void VerifyHasVector8(string operation)
{
if (!HasVector8)
{
throw new NotSupportedException($"{operation} is supported only on AVX2 CPU!");
}
}
[Conditional("DEBUG")]
private static void DebugVerifySpanInput(ReadOnlySpan<byte> source, Span<float> dest, int shouldBeDivisibleBy)
{

13
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -26,7 +26,7 @@ internal static class Vector128Utilities
public static bool SupportsShuffleFloat
{
[MethodImpl(MethodImplOptions.AggressiveInlining)]
get => Sse.IsSupported || AdvSimd.IsSupported;
get => Sse.IsSupported;
}
/// <summary>
@ -70,17 +70,6 @@ internal static class Vector128Utilities
return Sse.Shuffle(vector, vector, control);
}
if (AdvSimd.IsSupported)
{
#pragma warning disable CA1857 // A constant is expected for the parameter
Vector128<float> result = Vector128.Create(AdvSimd.Extract(vector, (byte)(control & 0x3)));
result = AdvSimd.Insert(result, 1, AdvSimd.Extract(vector, (byte)((control >> 2) & 0x3)));
result = AdvSimd.Insert(result, 2, AdvSimd.Extract(vector, (byte)((control >> 4) & 0x3)));
result = AdvSimd.Insert(result, 3, AdvSimd.Extract(vector, (byte)((control >> 6) & 0x3)));
#pragma warning restore CA1857 // A constant is expected for the parameter
return result;
}
ThrowUnreachableException();
return default;
}

42
src/ImageSharp/Formats/Jpeg/Components/Block8x8F.cs

@ -386,29 +386,33 @@ internal partial struct Block8x8F : IEquatable<Block8x8F>
public void LoadFromInt16ExtendedAvx2(ref Block8x8 source)
{
DebugGuard.IsTrue(
SimdUtils.HasVector8,
Avx2.IsSupported,
"LoadFromUInt16ExtendedAvx2 only works on AVX2 compatible architecture!");
ref Vector<short> sRef = ref Unsafe.As<Block8x8, Vector<short>>(ref source);
ref Vector<float> dRef = ref Unsafe.As<Block8x8F, Vector<float>>(ref this);
ref short sRef = ref Unsafe.As<Block8x8, short>(ref source);
ref Vector256<float> dRef = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this);
// Vector<ushort>.Count == 16 on AVX2
// Vector256<ushort>.Count == 16 on AVX2
// We can process 2 block rows in a single step
SimdUtils.ExtendedIntrinsics.ConvertToSingle(sRef, out Vector<float> top, out Vector<float> bottom);
dRef = top;
Unsafe.Add(ref dRef, 1) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 1), out top, out bottom);
Unsafe.Add(ref dRef, 2) = top;
Unsafe.Add(ref dRef, 3) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 2), out top, out bottom);
Unsafe.Add(ref dRef, 4) = top;
Unsafe.Add(ref dRef, 5) = bottom;
SimdUtils.ExtendedIntrinsics.ConvertToSingle(Unsafe.Add(ref sRef, 3), out top, out bottom);
Unsafe.Add(ref dRef, 6) = top;
Unsafe.Add(ref dRef, 7) = bottom;
Vector256<int> top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef));
Vector256<int> bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)Vector256<int>.Count));
dRef = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 1) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 2)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 3)));
Unsafe.Add(ref dRef, 2) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 3) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 4)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 5)));
Unsafe.Add(ref dRef, 4) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 5) = Avx.ConvertToVector256Single(bottom);
top = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 6)));
bottom = Avx2.ConvertToVector256Int32(Vector128.LoadUnsafe(ref sRef, (nuint)(Vector256<int>.Count * 7)));
Unsafe.Add(ref dRef, 6) = Avx.ConvertToVector256Single(top);
Unsafe.Add(ref dRef, 7) = Avx.ConvertToVector256Single(bottom);
}
/// <summary>

75
src/ImageSharp/PixelFormats/Utils/Vector4Converters.RgbaCompatible.cs

@ -5,6 +5,7 @@ using System.Buffers;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
namespace SixLabors.ImageSharp.PixelFormats.Utils;
@ -31,74 +32,86 @@ internal static partial class Vector4Converters
/// Provides an efficient default implementation for <see cref="PixelOperations{TPixel}.ToVector4(Configuration,ReadOnlySpan{TPixel},Span{Vector4},PixelConversionModifiers)"/>
/// The method works by internally converting to a <see cref="Rgba32"/> therefore it's not applicable for that type!
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
/// <typeparam name="TPixel">The type of pixel format.</typeparam>
/// <param name="configuration">The configuration.</param>
/// <param name="pixelOperations">The pixel operations instance.</param>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <param name="modifiers">The conversion modifier flags.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void ToVector4<TPixel>(
Configuration configuration,
PixelOperations<TPixel> pixelOperations,
ReadOnlySpan<TPixel> sourcePixels,
Span<Vector4> destVectors,
ReadOnlySpan<TPixel> source,
Span<Vector4> destination,
PixelConversionModifiers modifiers)
where TPixel : unmanaged, IPixel<TPixel>
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourcePixels, destVectors, nameof(destVectors));
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
int count = sourcePixels.Length;
int count = source.Length;
// Not worth for small buffers:
if (count < Vector4ConversionThreshold)
{
Default.UnsafeToVector4(sourcePixels, destVectors, modifiers);
Default.UnsafeToVector4(source, destination, modifiers);
return;
}
// Using the last quarter of 'destVectors' as a temporary buffer to avoid allocation:
// Using the last quarter of 'destination' as a temporary buffer to avoid allocation:
int countWithoutLastItem = count - 1;
ReadOnlySpan<TPixel> reducedSource = sourcePixels[..countWithoutLastItem];
Span<Rgba32> lastQuarterOfDestBuffer = MemoryMarshal.Cast<Vector4, Rgba32>(destVectors).Slice((3 * count) + 1, countWithoutLastItem);
pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestBuffer);
ReadOnlySpan<TPixel> reducedSource = source[..countWithoutLastItem];
Span<Rgba32> lastQuarterOfDestination = MemoryMarshal.Cast<Vector4, Rgba32>(destination).Slice((3 * count) + 1, countWithoutLastItem);
pixelOperations.ToRgba32(configuration, reducedSource, lastQuarterOfDestination);
// 'destVectors' and 'lastQuarterOfDestBuffer' are overlapping buffers,
// 'destination' and 'lastQuarterOfDestination' are overlapping buffers,
// but we are always reading/writing at different positions:
SimdUtils.ByteToNormalizedFloat(
MemoryMarshal.Cast<Rgba32, byte>(lastQuarterOfDestBuffer),
MemoryMarshal.Cast<Vector4, float>(destVectors[..countWithoutLastItem]));
MemoryMarshal.Cast<Rgba32, byte>(lastQuarterOfDestination),
MemoryMarshal.Cast<Vector4, float>(destination[..countWithoutLastItem]));
destVectors[countWithoutLastItem] = sourcePixels[countWithoutLastItem].ToVector4();
destination[countWithoutLastItem] = source[countWithoutLastItem].ToVector4();
// TODO: Investigate optimized 1-pass approach!
ApplyForwardConversionModifiers(destVectors, modifiers);
ApplyForwardConversionModifiers(destination, modifiers);
}
/// <summary>
/// Provides an efficient default implementation for <see cref="PixelOperations{TPixel}.FromVector4Destructive(Configuration,Span{Vector4},Span{TPixel},PixelConversionModifiers)"/>
/// The method works by internally converting to a <see cref="Rgba32"/>, therefore it's not applicable for that type!
/// </summary>
[MethodImpl(InliningOptions.ShortMethod)]
/// <typeparam name="TPixel">The type of pixel format.</typeparam>
/// <param name="configuration">The configuration.</param>
/// <param name="pixelOperations">The pixel operations instance.</param>
/// <param name="source">The source buffer.</param>
/// <param name="destination">The destination buffer.</param>
/// <param name="modifiers">The conversion modifier flags.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static void FromVector4<TPixel>(
Configuration configuration,
PixelOperations<TPixel> pixelOperations,
Span<Vector4> sourceVectors,
Span<TPixel> destPixels,
Span<Vector4> source,
Span<TPixel> destination,
PixelConversionModifiers modifiers)
where TPixel : unmanaged, IPixel<TPixel>
{
Guard.NotNull(configuration, nameof(configuration));
Guard.DestinationShouldNotBeTooShort(sourceVectors, destPixels, nameof(destPixels));
Guard.DestinationShouldNotBeTooShort(source, destination, nameof(destination));
int count = sourceVectors.Length;
int count = source.Length;
// Not worth for small buffers:
if (count < Vector4ConversionThreshold)
{
Default.UnsafeFromVector4(sourceVectors, destPixels, modifiers);
Default.UnsafeFromVector4(source, destination, modifiers);
return;
}
// TODO: Investigate optimized 1-pass approach!
ApplyBackwardConversionModifiers(sourceVectors, modifiers);
ApplyBackwardConversionModifiers(source, modifiers);
// For the opposite direction it's not easy to implement the trick used in RunRgba32CompatibleToVector4Conversion,
// so let's allocate a temporary buffer as usually:
@ -106,20 +119,30 @@ internal static partial class Vector4Converters
Span<Rgba32> tempSpan = tempBuffer.Memory.Span;
SimdUtils.NormalizedFloatToByteSaturate(
MemoryMarshal.Cast<Vector4, float>(sourceVectors),
MemoryMarshal.Cast<Vector4, float>(source),
MemoryMarshal.Cast<Rgba32, byte>(tempSpan));
pixelOperations.FromRgba32(configuration, tempSpan, destPixels);
pixelOperations.FromRgba32(configuration, tempSpan, destination);
}
private static int CalculateVector4ConversionThreshold()
{
if (!Vector.IsHardwareAccelerated)
if (!Vector128.IsHardwareAccelerated)
{
return int.MaxValue;
}
return SimdUtils.ExtendedIntrinsics.IsAvailable && SimdUtils.HasVector8 ? 256 : 128;
if (Vector512.IsHardwareAccelerated)
{
return 512;
}
if (Vector256.IsHardwareAccelerated)
{
return 256;
}
return 128;
}
}
}

25
tests/ImageSharp.Benchmarks/Bulk/ToVector4.cs

@ -14,9 +14,9 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
public abstract class ToVector4<TPixel>
where TPixel : unmanaged, IPixel<TPixel>
{
protected IMemoryOwner<TPixel> source;
protected IMemoryOwner<TPixel> Source { get; set; }
protected IMemoryOwner<Vector4> destination;
protected IMemoryOwner<Vector4> Destination { get; set; }
protected Configuration Configuration => Configuration.Default;
@ -26,22 +26,22 @@ public abstract class ToVector4<TPixel>
[GlobalSetup]
public void Setup()
{
this.source = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.destination = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
this.Source = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
this.Destination = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);
}
[GlobalCleanup]
public void Cleanup()
{
this.source.Dispose();
this.destination.Dispose();
this.Source.Dispose();
this.Destination.Dispose();
}
// [Benchmark]
public void Naive()
{
Span<TPixel> s = this.source.GetSpan();
Span<Vector4> d = this.destination.GetSpan();
Span<TPixel> s = this.Source.GetSpan();
Span<Vector4> d = this.Destination.GetSpan();
for (int i = 0; i < this.Count; i++)
{
@ -50,11 +50,8 @@ public abstract class ToVector4<TPixel>
}
[Benchmark]
public void PixelOperations_Specialized()
{
PixelOperations<TPixel>.Instance.ToVector4(
public void PixelOperations_Specialized() => PixelOperations<TPixel>.Instance.ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
}
this.Source.GetSpan(),
this.Destination.GetSpan());
}

4
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Bgra32.cs

@ -16,8 +16,8 @@ public class ToVector4_Bgra32 : ToVector4<Bgra32>
{
new PixelOperations<Bgra32>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
}
// RESULTS:

4
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgb24.cs

@ -16,8 +16,8 @@ public class ToVector4_Rgb24 : ToVector4<Rgb24>
{
new PixelOperations<Rgb24>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
}
}

63
tests/ImageSharp.Benchmarks/Bulk/ToVector4_Rgba32.cs

@ -14,27 +14,18 @@ namespace SixLabors.ImageSharp.Benchmarks.Bulk;
[Config(typeof(Config.Short))]
public class ToVector4_Rgba32 : ToVector4<Rgba32>
{
[Benchmark]
public void FallbackIntrinsics128()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(sBytes, dFloats);
}
[Benchmark]
public void PixelOperations_Base()
=> new PixelOperations<Rgba32>().ToVector4(
this.Configuration,
this.source.GetSpan(),
this.destination.GetSpan());
this.Source.GetSpan(),
this.Destination.GetSpan());
[Benchmark]
public void HwIntrinsics()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
SimdUtils.HwIntrinsics.ByteToNormalizedFloat(sBytes, dFloats);
}
@ -42,8 +33,8 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
nuint n = (uint)dFloats.Length / (uint)Vector<byte>.Count;
@ -67,14 +58,14 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
}
n = (uint)(dFloats.Length / Vector<float>.Count);
var scale = new Vector<float>(1f / 255f);
Vector<float> scale = new(1f / 255f);
for (nuint i = 0; i < n; i++)
{
ref Vector<float> dRef = ref Unsafe.Add(ref destBase, i);
var du = Vector.AsVectorInt32(dRef);
var v = Vector.ConvertToSingle(du);
Vector<int> du = Vector.AsVectorInt32(dRef);
Vector<float> v = Vector.ConvertToSingle(du);
v *= scale;
dRef = v;
@ -84,14 +75,14 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
// [Benchmark]
public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
{
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.Source.GetSpan());
Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.Destination.GetSpan());
nuint n = (uint)dFloats.Length / (uint)Vector<byte>.Count;
ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference((ReadOnlySpan<byte>)sBytes));
ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dFloats));
var scale = new Vector<float>(1f / 255f);
Vector<float> scale = new(1f / 255f);
for (nuint i = 0; i < n; i++)
{
@ -117,8 +108,8 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector<float> ConvertToNormalizedSingle(Vector<uint> u, Vector<float> scale)
{
var vi = Vector.AsVectorInt32(u);
var v = Vector.ConvertToSingle(vi);
Vector<int> vi = Vector.AsVectorInt32(u);
Vector<float> v = Vector.ConvertToSingle(vi);
v *= scale;
return v;
}
@ -151,4 +142,30 @@ public class ToVector4_Rgba32 : ToVector4<Rgba32>
PixelOperations_Base | Core | 2048 | 6,752.68 ns | 272.820 ns | 15.4148 ns | 1.67 | 0.02 | - | 24 B |
PixelOperations_Specialized | Core | 2048 | 1,126.13 ns | 79.192 ns | 4.4745 ns |!! 0.28 | 0.00 | - | 0 B | <--- ExtendedIntrinsics rock!
*/
/*
BenchmarkDotNet v0.13.10, Windows 11 (10.0.22631.3085/23H2/2023Update/SunValley3)
11th Gen Intel Core i7-11370H 3.30GHz, 1 CPU, 8 logical and 4 physical cores
.NET SDK 8.0.200-preview.23624.5
[Host] : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Job-DFEQJT : .NET 8.0.1 (8.0.123.58001), X64 RyuJIT AVX2
Runtime=.NET 8.0 Arguments=/p:DebugType=portable IterationCount=3
LaunchCount=1 WarmupCount=3
| Method | Count | Mean | Error | StdDev | Allocated |
|---------------------------- |------ |------------:|-----------:|----------:|----------:|
| FallbackIntrinsics128 | 64 | 139.66 ns | 27.429 ns | 1.503 ns | - |
| PixelOperations_Base | 64 | 124.65 ns | 29.653 ns | 1.625 ns | - |
| HwIntrinsics | 64 | 18.16 ns | 4.731 ns | 0.259 ns | - |
| PixelOperations_Specialized | 64 | 27.94 ns | 15.220 ns | 0.834 ns | - |
| FallbackIntrinsics128 | 256 | 525.07 ns | 34.397 ns | 1.885 ns | - |
| PixelOperations_Base | 256 | 464.17 ns | 46.897 ns | 2.571 ns | - |
| HwIntrinsics | 256 | 43.88 ns | 4.525 ns | 0.248 ns | - |
| PixelOperations_Specialized | 256 | 55.57 ns | 14.587 ns | 0.800 ns | - |
| FallbackIntrinsics128 | 2048 | 4,148.44 ns | 476.583 ns | 26.123 ns | - |
| PixelOperations_Base | 2048 | 3,608.42 ns | 66.293 ns | 3.634 ns | - |
| HwIntrinsics | 2048 | 361.42 ns | 35.576 ns | 1.950 ns | - |
| PixelOperations_Specialized | 2048 | 374.82 ns | 33.371 ns | 1.829 ns | - |
*/
}

2
tests/ImageSharp.Benchmarks/LoadResizeSave/README.md

@ -1,4 +1,4 @@
The benchmarks have been adapted from the
The benchmarks have been adapted from the
[PhotoSauce's MemoryStress project](https://github.com/saucecontrol/core-imaging-playground/tree/beeees/MemoryStress).
### Setup

36
tests/ImageSharp.Tests/Common/SimdUtilsTests.cs

@ -3,6 +3,7 @@
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Tests.TestUtilities;
@ -117,16 +118,10 @@ public partial class SimdUtilsTests
public static readonly TheoryData<int> ArbitraryArraySizes = new() { 0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520 };
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy4))]
public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count) => TestImpl_BulkConvertByteToNormalizedFloat(
count,
(s, d) => SimdUtils.FallbackIntrinsics128.ByteToNormalizedFloat(s.Span, d.Span));
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy32))]
[MemberData(nameof(ArraySizesDivisibleBy64))]
public void HwIntrinsics_BulkConvertByteToNormalizedFloat(int count)
{
if (!Sse2.IsSupported)
if (!Sse2.IsSupported && !AdvSimd.IsSupported)
{
return;
}
@ -138,7 +133,7 @@ public partial class SimdUtilsTests
FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunTest,
count,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
}
[Theory]
@ -160,32 +155,11 @@ public partial class SimdUtilsTests
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
}
[Theory]
[InlineData(1234)]
public void ExtendedIntrinsics_ConvertToSingle(short scale)
{
int n = Vector<float>.Count;
short[] sData = new Random(scale).GenerateRandomInt16Array(2 * n, (short)-scale, scale);
float[] fData = sData.Select(u => (float)u).ToArray();
Vector<short> source = new(sData);
Vector<float> expected1 = new(fData, 0);
Vector<float> expected2 = new(fData, n);
// Act:
SimdUtils.ExtendedIntrinsics.ConvertToSingle(source, out Vector<float> actual1, out Vector<float> actual2);
// Assert:
Assert.Equal(expected1, actual1);
Assert.Equal(expected2, actual2);
}
[Theory]
[MemberData(nameof(ArraySizesDivisibleBy64))]
public void HwIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
{
if (!Sse2.IsSupported)
if (!Sse2.IsSupported && !AdvSimd.IsSupported)
{
return;
}

Loading…
Cancel
Save