Bulk conversion of arbitrary-sized Span-s of scalars

8 years ago · 81c57a812d
13 changed files with 537 additions and 290 deletions
--- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@ -0,0 +1,212 @@
+// Copyright (c) Six Labors and contributors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+using SixLabors.ImageSharp.Tuples;
+
+// ReSharper disable MemberHidesStaticFromOuterClass
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
+        /// </summary>
+        public static class BasicIntrinsics256
+        {
+            public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
+
+            /// <summary>
+            /// Converts the longest prefix of <paramref name="source"/> whose length is a multiple of 8 by calling
+            /// <see cref="BulkConvertByteToNormalizedFloat"/>, then re-slices both spans so that only the unconverted tail remains.
+            /// Leaves both spans untouched when <see cref="IsAvailable"/> is false.
+            /// </summary>
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+                if (!IsAvailable)
+                {
+                    return;
+                }
+
+                // Largest multiple of 8 that fits into the input.
+                int bulkLength = source.Length - (source.Length % 8);
+                if (bulkLength <= 0)
+                {
+                    return;
+                }
+
+                ReadOnlySpan<byte> bulkSource = source.Slice(0, bulkLength);
+                Span<float> bulkDest = dest.Slice(0, bulkLength);
+                BulkConvertByteToNormalizedFloat(bulkSource, bulkDest);
+
+                // Hand the unprocessed tail back to the caller.
+                source = source.Slice(bulkLength);
+                dest = dest.Slice(bulkLength);
+            }
+
+            /// <summary>
+            /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source'
+            /// into 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded. Out-of-range input is NOT clamped by this method.
+            /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8.
+            /// Based on:
+            /// <see>
+            ///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
+            /// </see>
+            /// </summary>
+            internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
+
+                DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
+
+                if (source.Length == 0)
+                {
+                    return;
+                }
+
+                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
+                int n = source.Length / 8; // number of 8-element groups to process
+
+                Vector<float> magick = new Vector<float>(32768.0f); // bias added in the float -> integer trick sketched in the loop comment below
+                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
+
+                // need to copy to a temporary struct, because
+                // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
+                // does not work. TODO: This might be a CoreClr bug, need to ask/report
+                var temp = default(Octet.OfUInt32);
+                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp); // view of 'temp' as a float vector
+
+                for (int i = 0; i < n; i++)
+                {
+                    // union { float f; uint32_t i; } u;
+                    // u.f = 32768.0f + x * (255.0f / 256.0f);
+                    // return (uint8_t)u.i;
+                    Vector<float> x = Unsafe.Add(ref srcBase, i);
+                    x = (x * scale) + magick;
+                    tempRef = x; // stores the biased floats into 'temp' so their raw bits can be read back as uint-s
+
+                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
+                    d.LoadFrom(ref temp); // narrows each uint of 'temp' to a byte (see Octet.OfByte.LoadFrom)
+                }
+            }
+
+            /// <summary>
+            /// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
+            /// Works only with `dest.Length` divisible by 8.
+            /// Implementation adapted from:
+            /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
+            /// http://stackoverflow.com/a/536278
+            /// </summary>
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
+
+                DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(source), "dest.Length should be divisable by 8!");
+
+                var bVec = new Vector<float>(256.0f / 255.0f); // rescale factor applied after the bias is subtracted
+                var magicFloat = new Vector<float>(32768.0f);
+                var magicInt = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
+                var mask = new Vector<uint>(255); // keeps the low 8 bits of each lane
+
+                ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
+
+                ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet); // same memory as 'dest', viewed as float vectors
+
+                int n = dest.Length / 8; // number of 8-element groups to process
+
+                for (int i = 0; i < n; i++) // pass 1: widen every source byte into a uint-sized slot of 'dest'
+                {
+                    ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
+                    ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
+                    d.LoadFrom(ref s);
+                }
+
+                for (int i = 0; i < n; i++) // pass 2: turn the uint slots into normalized floats, in place
+                {
+                    ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
+
+                    var vi = Vector.AsVectorUInt32(df);
+                    vi &= mask;
+                    vi |= magicInt; // OR the byte value into the bit pattern of 32768.0f
+
+                    var vf = Vector.AsVectorSingle(vi);
+                    vf = (vf - magicFloat) * bVec; // remove the bias again, then rescale by 256/255
+
+                    df = vf;
+                }
+            }
+
+            /// <summary>
+            /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many leading elements as possible
+            /// (the longest prefix whose length is a multiple of 8), then slices both spans down so only the unconverted remainder is left.
+            /// Leaves both spans untouched when <see cref="IsAvailable"/> is false.
+            /// </summary>
+            /// <param name="source">The normalized float values; on return it refers to the elements that were not converted.</param>
+            /// <param name="dest">The destination bytes, same length as <paramref name="source"/>; on return it refers to the slots that were not written.</param>
+            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+                if (IsAvailable)
+                {
+                    // The bulk converter below consumes groups of 8 floats and only needs the length to be divisible by 8.
+                    // Aligning to Vector<byte>.Count instead would hand a needlessly long remainder back to the caller.
+                    int remainder = source.Length % 8;
+                    int alignedCount = source.Length - remainder;
+
+                    if (alignedCount > 0)
+                    {
+                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
+
+                        source = source.Slice(alignedCount);
+                        dest = dest.Slice(alignedCount);
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion. Works only with `source.Length` divisible by 8.
+            /// </summary>
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+            {
+                GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
+
+                DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");
+
+                if (source.Length == 0)
+                {
+                    return;
+                }
+
+                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
+                int n = source.Length / 8; // number of 8-element groups to process
+
+                Vector<float> magick = new Vector<float>(32768.0f); // bias added in the float -> integer trick sketched in the loop comment below
+                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
+
+                // need to copy to a temporary struct, because
+                // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
+                // does not work. TODO: This might be a CoreClr bug, need to ask/report
+                var temp = default(Octet.OfUInt32);
+                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp); // view of 'temp' as a float vector
+
+                for (int i = 0; i < n; i++)
+                {
+                    // union { float f; uint32_t i; } u;
+                    // u.f = 32768.0f + x * (255.0f / 256.0f);
+                    // return (uint8_t)u.i;
+                    Vector<float> x = Unsafe.Add(ref srcBase, i);
+                    x = Vector.Max(x, Vector<float>.Zero); // clamp from below at 0
+                    x = Vector.Min(x, Vector<float>.One); // clamp from above at 1
+
+                    x = (x * scale) + magick;
+                    tempRef = x; // stores the biased floats into 'temp' so their raw bits can be read back as uint-s
+
+                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
+                    d.LoadFrom(ref temp); // narrows each uint of 'temp' to a byte (see Octet.OfByte.LoadFrom)
+                }
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@ -1,8 +1,10 @@
 using System;
+using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;

+// ReSharper disable MemberHidesStaticFromOuterClass
 namespace SixLabors.ImageSharp
 {
    internal static partial class SimdUtils
@ -18,22 +20,47 @@ namespace SixLabors.ImageSharp
        {
            public static bool IsAvailable { get; } =
 #if NETCOREAPP2_1
-// TODO: Also available in .NET 4.7.2, we need to add a build target!
-                true;
+                // TODO: Also available in .NET 4.7.2, we need to add a build target!
+                Vector.IsHardwareAccelerated;
 #else
                false;
 #endif

            /// <summary>
-            /// A variant of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new .NET runtime.
+            /// Runs <see cref="BulkConvertByteToNormalizedFloat"/> on as many elements as possible, slicing both spans down so that only the remainder is kept.
+            /// </summary>
+            [Conditional("NETCOREAPP2_1")]
+            internal static void BulkConvertByteToNormalizedFloatReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<float> dest)
+            {
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+                if (!IsAvailable)
+                {
+                    return;
+                }
+
+                // Only whole Vector<byte>-sized blocks can be handed to the bulk converter.
+                int blockSize = Vector<byte>.Count;
+                int bulkLength = (source.Length / blockSize) * blockSize;
+
+                if (bulkLength == 0)
+                {
+                    return;
+                }
+
+                ReadOnlySpan<byte> bulkSource = source.Slice(0, bulkLength);
+                Span<float> bulkDest = dest.Slice(0, bulkLength);
+                BulkConvertByteToNormalizedFloat(bulkSource, bulkDest);
+
+                // Whatever did not fit into a whole block is left for the caller.
+                source = source.Slice(bulkLength);
+                dest = dest.Slice(bulkLength);
+            }
+
+            /// <summary>
+            /// A variant of <see cref="BasicIntrinsics256.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
            /// </summary>
            // ReSharper disable once MemberHidesStaticFromOuterClass
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
-                Guard.IsTrue(
+                DebugGuard.IsTrue(
                    dest.Length % Vector<byte>.Count == 0,
                    nameof(source),
-                    "dest.Length should be divisable by Vector<byte>.Count!");
+                    "dest.Length should be divisible by Vector<byte>.Count!");

                int n = dest.Length / Vector<byte>.Count;

@ -63,34 +90,52 @@ namespace SixLabors.ImageSharp
                }
            }

-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
+            /// <summary>
+            /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> on as many elements as possible, slicing both spans down so that only the remainder is kept.
+            /// </summary>
+            [Conditional("NETCOREAPP2_1")]
+            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<byte> dest)
            {
-                Vector<int> vi = Vector.AsVectorInt32(u);
-                Vector<float> v = Vector.ConvertToSingle(vi);
-                v *= scale;
-                return v;
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
+
+                if (IsAvailable)
+                {
+                    int remainder = source.Length % Vector<byte>.Count;
+                    int alignedCount = source.Length - remainder;
+
+                    if (alignedCount > 0)
+                    {
+                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
+
+                        source = source.Slice(alignedCount);
+                        dest = dest.Slice(alignedCount);
+                    }
+                }
            }

            /// <summary>
-            /// A variant of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
+            /// A variant of <see cref="BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
            /// </summary>
            /// <remarks>
            /// It does NOT worth yet to utilize this method (2018 Oct).
            /// See benchmark results for the "PackFromVector4_Rgba32" benchmark!
            /// TODO: Check again later!
            /// </remarks>
-            // ReSharper disable once MemberHidesStaticFromOuterClass
-            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
+                ReadOnlySpan<float> source,
+                Span<byte> dest)
            {
-                Guard.IsTrue(
+                DebugGuard.IsTrue(
                    dest.Length % Vector<byte>.Count == 0,
                    nameof(dest),
-                    "dest.Length should be divisable by Vector<byte>.Count!");
+                    "dest.Length should be divisible by Vector<byte>.Count!");

                int n = dest.Length / Vector<byte>.Count;

-                ref Vector<float> sourceBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector<float> sourceBase =
+                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
                ref Vector<byte> destBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(dest));

                for (int i = 0; i < n; i++)
@ -126,6 +171,15 @@ namespace SixLabors.ImageSharp
                Vector<int> vi = Vector.ConvertToInt32(vf);
                return Vector.AsVectorUInt32(vi);
            }
+
+            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale) // converts every lane of 'u' to float and multiplies it by 'scale'
+            {
+                Vector<int> vi = Vector.AsVectorInt32(u); // reinterpret the lanes as signed ints; the bits are not changed
+                Vector<float> v = Vector.ConvertToSingle(vi); // numeric int -> float conversion
+                v *= scale;
+                return v;
+            }
        }
    }
-}
+}
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@ -6,6 +6,9 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;

+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Tuples;
+
 namespace SixLabors.ImageSharp
 {
    /// <summary>
@ -16,7 +19,8 @@ namespace SixLabors.ImageSharp
        /// <summary>
        /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte.
        /// </summary>
-        public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
+        public static bool IsAvx2CompatibleArchitecture { get; } =
+            Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;

        internal static void GuardAvx2(string operation)
        {
@ -57,236 +61,61 @@ namespace SixLabors.ImageSharp
        }

        /// <summary>
-        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/> values.
-        /// The values are scaled up into [0-255] and rounded.
-        /// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
-        /// Based on:
-        /// <see>
-        ///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
-        /// </see>
-        /// </summary>
-        internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
-        {
-            GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
-
-            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
-
-            if (source.Length == 0)
-            {
-                return;
-            }
-
-            ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
-            ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
-            int n = source.Length / 8;
-
-            Vector<float> magick = new Vector<float>(32768.0f);
-            Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
-
-            // need to copy to a temporary struct, because
-            // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
-            // does not work. TODO: This might be a CoreClr bug, need to ask/report
-            var temp = default(Octet.OfUInt32);
-            ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
-
-            for (int i = 0; i < n; i++)
-            {
-                // union { float f; uint32_t i; } u;
-                // u.f = 32768.0f + x * (255.0f / 256.0f);
-                // return (uint8_t)u.i;
-                Vector<float> x = Unsafe.Add(ref srcBase, i);
-                x = (x * scale) + magick;
-                tempRef = x;
-
-                ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
-                d.LoadFrom(ref temp);
-            }
-        }
-
-        /// <summary>
-        /// Converts `dest.Length` bytes to <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1]
-        /// The implementation is SIMD optimized and works only with `dest.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
-        /// Implementation adapted from:
-        /// <see>
-        ///     <cref>http://stackoverflow.com/a/5362789</cref>
-        /// </see>
+        /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
+        /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
+        /// but there are no restrictions on the span's length.
        /// </summary>
        internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
        {
-            GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
-
-            DebugGuard.IsTrue((dest.Length % Vector<float>.Count) == 0, nameof(source), "dest.Length should be divisable by Vector<float>.Count!");
-
-            var bVec = new Vector<float>(256.0f / 255.0f);
-            var magicFloat = new Vector<float>(32768.0f);
-            var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
-            var mask = new Vector<uint>(255);
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

-            ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
-            ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
-
-            ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
-
-            int n = dest.Length / 8;
-
-            for (int i = 0; i < n; i++)
-            {
-                ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
-                ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
-                d.LoadFrom(ref s);
-            }
+            ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
+            BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);

-            for (int i = 0; i < n; i++)
+            // Deal with the remainder:
+            int count = source.Length;
+            if (count > 0)
            {
-                ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
-
-                var vi = Vector.AsVectorUInt32(df);
-                vi &= mask;
-                vi |= magicInt;
-
-                var vf = Vector.AsVectorSingle(vi);
-                vf = (vf - magicFloat) * bVec;
-
-                df = vf;
+                // TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
+                ref byte sBase = ref MemoryMarshal.GetReference(source);
+                ref float dBase = ref MemoryMarshal.GetReference(dest);
+                for (int i = 0; i < count; i++)
+                {
+                    Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
+                }
            }
        }

        /// <summary>
-        /// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
+        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
+        /// The values are scaled up into [0-255] and rounded, overflows are clamped.
+        /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
+        /// but there are no restrictions on the span's length.
        /// </summary>
        internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
        {
-            GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
-
-            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
-
-            if (source.Length == 0)
-            {
-                return;
-            }
-
-            ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
-            ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
-            int n = source.Length / 8;
-
-            Vector<float> magick = new Vector<float>(32768.0f);
-            Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
-
-            // need to copy to a temporary struct, because
-            // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
-            // does not work. TODO: This might be a CoreClr bug, need to ask/report
-            var temp = default(Octet.OfUInt32);
-            ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
-
-            for (int i = 0; i < n; i++)
-            {
-                // union { float f; uint32_t i; } u;
-                // u.f = 32768.0f + x * (255.0f / 256.0f);
-                // return (uint8_t)u.i;
-                Vector<float> x = Unsafe.Add(ref srcBase, i);
-                x = Vector.Max(x, Vector<float>.Zero);
-                x = Vector.Min(x, Vector<float>.One);
-
-                x = (x * scale) + magick;
-                tempRef = x;
-
-                ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
-                d.LoadFrom(ref temp);
-            }
-        }
-
-        // TODO: Replace these with T4-d library level tuples!
-        internal static class Octet
-        {
-            [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
-            public struct OfUInt32
-            {
-                [FieldOffset(0 * sizeof(uint))]
-                public uint V0;
-
-                [FieldOffset(1 * sizeof(uint))]
-                public uint V1;
-
-                [FieldOffset(2 * sizeof(uint))]
-                public uint V2;
-
-                [FieldOffset(3 * sizeof(uint))]
-                public uint V3;
-
-                [FieldOffset(4 * sizeof(uint))]
-                public uint V4;
-
-                [FieldOffset(5 * sizeof(uint))]
-                public uint V5;
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

-                [FieldOffset(6 * sizeof(uint))]
-                public uint V6;
+            ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
+            BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);

-                [FieldOffset(7 * sizeof(uint))]
-                public uint V7;
-
-                public override string ToString()
-                {
-                    return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
-                }
-
-                [MethodImpl(InliningOptions.ShortMethod)]
-                public void LoadFrom(ref OfByte src)
-                {
-                    this.V0 = src.V0;
-                    this.V1 = src.V1;
-                    this.V2 = src.V2;
-                    this.V3 = src.V3;
-                    this.V4 = src.V4;
-                    this.V5 = src.V5;
-                    this.V6 = src.V6;
-                    this.V7 = src.V7;
-                }
-            }
-
-            [StructLayout(LayoutKind.Explicit, Size = 8)]
-            public struct OfByte
+            // Deal with the remainder:
+            int count = source.Length;
+            if (count > 0)
            {
-                [FieldOffset(0)]
-                public byte V0;
-
-                [FieldOffset(1)]
-                public byte V1;
-
-                [FieldOffset(2)]
-                public byte V2;
-
-                [FieldOffset(3)]
-                public byte V3;
-
-                [FieldOffset(4)]
-                public byte V4;
-
-                [FieldOffset(5)]
-                public byte V5;
-
-                [FieldOffset(6)]
-                public byte V6;
-
-                [FieldOffset(7)]
-                public byte V7;
-
-                public override string ToString()
-                {
-                    return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
-                }
+                ref float sBase = ref MemoryMarshal.GetReference(source);
+                ref byte dBase = ref MemoryMarshal.GetReference(dest);

-                [MethodImpl(InliningOptions.ShortMethod)]
-                public void LoadFrom(ref OfUInt32 src)
+                for (int i = 0; i < count; i++)
                {
-                    this.V0 = (byte)src.V0;
-                    this.V1 = (byte)src.V1;
-                    this.V2 = (byte)src.V2;
-                    this.V3 = (byte)src.V3;
-                    this.V4 = (byte)src.V4;
-                    this.V5 = (byte)src.V5;
-                    this.V6 = (byte)src.V6;
-                    this.V7 = (byte)src.V7;
+                    // TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
+                    float f = Unsafe.Add(ref sBase, i);
+                    f *= 255f;
+                    f += 0.5f;
+                    f = MathF.Max(0, f);
+                    f = MathF.Min(255f, f);
+
+                    Unsafe.Add(ref dBase, i) = (byte)f;
                }
            }
        }
--- a/src/ImageSharp/Common/Tuples/Octet.cs
+++ b/src/ImageSharp/Common/Tuples/Octet.cs
@ -0,0 +1,100 @@
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp.Tuples
+{
+    internal static class Octet // groups tuple-like structs that hold exactly 8 values each
+    {
+        [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
+        public struct OfUInt32 // 8 consecutive uint fields, V0 at offset 0 through V7 at offset 28
+        {
+            [FieldOffset(0 * sizeof(uint))]
+            public uint V0;
+
+            [FieldOffset(1 * sizeof(uint))]
+            public uint V1;
+
+            [FieldOffset(2 * sizeof(uint))]
+            public uint V2;
+
+            [FieldOffset(3 * sizeof(uint))]
+            public uint V3;
+
+            [FieldOffset(4 * sizeof(uint))]
+            public uint V4;
+
+            [FieldOffset(5 * sizeof(uint))]
+            public uint V5;
+
+            [FieldOffset(6 * sizeof(uint))]
+            public uint V6;
+
+            [FieldOffset(7 * sizeof(uint))]
+            public uint V7;
+
+            public override string ToString() // formats the 8 values as "[V0,V1,...,V7]"
+            {
+                return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
+            }
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void LoadFrom(ref OfByte src) // widens each byte of 'src' into the uint field with the same index
+            {
+                this.V0 = src.V0;
+                this.V1 = src.V1;
+                this.V2 = src.V2;
+                this.V3 = src.V3;
+                this.V4 = src.V4;
+                this.V5 = src.V5;
+                this.V6 = src.V6;
+                this.V7 = src.V7;
+            }
+        }
+
+        [StructLayout(LayoutKind.Explicit, Size = 8)]
+        public struct OfByte // 8 consecutive byte fields, V0 at offset 0 through V7 at offset 7
+        {
+            [FieldOffset(0)]
+            public byte V0;
+
+            [FieldOffset(1)]
+            public byte V1;
+
+            [FieldOffset(2)]
+            public byte V2;
+
+            [FieldOffset(3)]
+            public byte V3;
+
+            [FieldOffset(4)]
+            public byte V4;
+
+            [FieldOffset(5)]
+            public byte V5;
+
+            [FieldOffset(6)]
+            public byte V6;
+
+            [FieldOffset(7)]
+            public byte V7;
+
+            public override string ToString() // formats the 8 values as "[V0,V1,...,V7]"
+            {
+                return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
+            }
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public void LoadFrom(ref OfUInt32 src) // narrows each uint of 'src' with a (byte) cast; in the default unchecked context this keeps the low 8 bits
+            {
+                this.V0 = (byte)src.V0;
+                this.V1 = (byte)src.V1;
+                this.V2 = (byte)src.V2;
+                this.V3 = (byte)src.V3;
+                this.V4 = (byte)src.V4;
+                this.V5 = (byte)src.V5;
+                this.V6 = (byte)src.V6;
+                this.V7 = (byte)src.V7;
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs
+++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs
@ -2,7 +2,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;

-namespace SixLabors.ImageSharp.Common.Tuples
+namespace SixLabors.ImageSharp.Tuples
 {
    /// <summary>
    /// Its faster to process multiple Vector4-s together, so let's pair them!
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
@ -6,7 +6,7 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;

-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
 {
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
@ -6,7 +6,7 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;

-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;

 // ReSharper disable ImpureMethodCallOnReadonlyValueField
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
@ -6,8 +6,8 @@ using System.Collections.Generic;
 using System.Linq;
 using System.Numerics;

-using SixLabors.ImageSharp.Common.Tuples;
 using SixLabors.ImageSharp.Memory;
+using SixLabors.ImageSharp.Tuples;
 using SixLabors.Memory;

 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
--- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
+++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
                else
                {
-                    ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count);
+                    ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count);
                }
            }

@ -58,7 +58,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
                else
                {
-                    ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count);
+                    ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count);
                }
            }

@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
            }

-            private static void ConvertToVector4UsingStandardIntrinsics(
+            private static void ConvertToVector4UsingBasicIntrinsics(
                ReadOnlySpan<Rgba32> sourceColors,
                Span<Vector4> destinationVectors,
                int count)
@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                    ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors);
                    Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));

-                    SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
+                    SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
                }

                if (remainder > 0)
@ -155,7 +155,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
            }

-            private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
+            private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
            {
                int remainder = count % 2;
                int alignedCount = count - remainder;
@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                    ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
                    Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);

-                    SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
+                    SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
                }

                if (remainder > 0)
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@ -30,8 +30,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
        [Params(
            //64, 
            //256,
-            //512,
-            2048
+            512
+            //1024
            )]
        public int Count { get; set; }

@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
            SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
        }

-        //[Benchmark]
+        [Benchmark]
        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
        {
            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
@ -159,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
            }
        }

-        //[Benchmark]
+        [Benchmark]
        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
        {
            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
--- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
@ -5,6 +5,7 @@ using BenchmarkDotNet.Attributes;

 namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
 {
+    [Config(typeof(Config.ShortClr))]
    public class UInt32ToSingle
    {
        private float[] data;
@ -66,8 +67,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
                Unsafe.Add(ref bf, i) = v;
            }
        }
-
-        // This code is not correct at all, it's just here as reference
+        
        [Benchmark]
        public void StandardSimdFromInt()
        {
@ -86,5 +86,28 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
                Unsafe.Add(ref bf, i) = v;
            }
        }
+
+
+        /// <summary>
+        /// Benchmark variant that walks the <c>data</c> buffer as a sequence of
+        /// <see cref="Vector{T}"/> of <see cref="float"/> through a reinterpreted reference.
+        /// For each vector it reinterprets the bits as Int32, converts the integers to Single,
+        /// scales by 1/255 and stores the result back into the same slot.
+        /// </summary>
+        [Benchmark]
+        public void StandardSimdFromInt_RefCast()
+        {
+            // Number of whole vectors that fit into Count scalars.
+            int n = Count / Vector<float>.Count;
+
+            // View the float buffer as Vector<float> elements without copying.
+            ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
+
+            var scale = new Vector<float>(1f / 255f);
+
+            for (int i = 0; i < n; i++)
+            {
+                ref Vector<float> fRef = ref Unsafe.Add(ref bf, i);
+
+                // Bitwise reinterpretation (no numeric conversion), followed by int -> float conversion.
+                Vector<int> du = Vector.AsVectorInt32(fRef);
+                Vector<float> v = Vector.ConvertToSingle(du);
+                v *= scale;
+
+                // Write back in place through the same reference.
+                fRef = v;
+            }
+        }
    }
 }
--- a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
@ -3,8 +3,11 @@ using System.Runtime.CompilerServices;

 using BenchmarkDotNet.Attributes;

+using SixLabors.ImageSharp.Tuples;
+
 namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
 {
+    [Config(typeof(Config.ShortClr))]
    public class WidenBytesToUInt32
    {
        private byte[] source;
@ -25,8 +28,8 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
        {
            const int N = Count / 8;

-            ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As<byte, SimdUtils.Octet.OfByte>(ref this.source[0]);
-            ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As<uint, SimdUtils.Octet.OfUInt32>(ref this.dest[0]);
+            ref Octet.OfByte sBase = ref Unsafe.As<byte, Octet.OfByte>(ref this.source[0]);
+            ref Octet.OfUInt32 dBase = ref Unsafe.As<uint, Octet.OfUInt32>(ref this.dest[0]);

            for (int i = 0; i < N; i++)
            {
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        {
            float[] data = new float[Vector<float>.Count];

-            var rnd = new Random();
+            var rnd = new Random(seed);

            for (int i = 0; i < Vector<float>.Count; i++)
            {
@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        [InlineData(1, 8)]
        [InlineData(2, 16)]
        [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
+        public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Tests.Common

            byte[] dest = new byte[count];

-            SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest);
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest);

            byte[] expected = orig.Select(f => (byte)(f)).ToArray();

@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        [InlineData(1, 8)]
        [InlineData(2, 16)]
        [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
+        public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -153,87 +153,113 @@ namespace SixLabors.ImageSharp.Tests.Common

            byte[] dest = new byte[count];

-            SimdUtils.BulkConvertNormalizedFloatToByte(source, dest);
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest);

            byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray();

            Assert.Equal(expected, dest);
        }

+        public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
+
+        public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
+
+        /// <summary>
+        /// Array lengths that are not restricted to a multiple of 8 or 32: small sizes, values
+        /// adjacent to powers of two, and every length from 512 through 520. Each value is listed once.
+        /// </summary>
+        public static readonly TheoryData<int> ArbitraryArraySizes =
+            new TheoryData<int>
+                {
+                    0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
+                };

        [Theory]
-        [InlineData(1, 0)]
-        [InlineData(2, 32)]
-        [InlineData(3, 128)]
-        public void BulkConvertByteToNormalizedFloat(int seed, int count)
+        [MemberData(nameof(ArraySizesDivisibleBy8))]
+        public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
        {
            if (this.SkipOnNonAvx2())
            {
                return;
            }

-            byte[] source = new Random(seed).GenerateRandomByteArray(count);
-            float[] result = new float[count];
-            float[] expected = source.Select(b => (float)b / 255f).ToArray();
-
-            SimdUtils.BulkConvertByteToNormalizedFloat(source, result);
-
-            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
+            TestImpl_BulkConvertByteToNormalizedFloat(
+                count,
+                (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
        }
        
        [Theory]
-        [InlineData(1, 0)]
-        [InlineData(2, 32)]
-        [InlineData(3, 128)]
-        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
+        [MemberData(nameof(ArraySizesDivisibleBy32))]
+        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count)
+        {
+            TestImpl_BulkConvertByteToNormalizedFloat(
+                count,
+                (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
+        }
+
+        /// <summary>
+        /// Runs the shared byte-to-normalized-float check against
+        /// <c>SimdUtils.BulkConvertByteToNormalizedFloat</c> for every length in
+        /// <see cref="ArbitraryArraySizes"/>.
+        /// </summary>
+        /// <param name="count">The number of elements to convert.</param>
+        [Theory]
+        [MemberData(nameof(ArbitraryArraySizes))]
+        public void BulkConvertByteToNormalizedFloat(int count)
+        {
+            // Adapt the Span-based API to the Memory-based delegate the shared test body expects.
+            Action<Memory<byte>, Memory<float>> convert =
+                (s, d) => SimdUtils.BulkConvertByteToNormalizedFloat(s.Span, d.Span);
+
+            TestImpl_BulkConvertByteToNormalizedFloat(count, convert);
+        }
+
+        private static void TestImpl_BulkConvertByteToNormalizedFloat(
+            int count,
+            Action<Memory<byte>, Memory<float>> convert)
        {
-            byte[] source = new Random(seed).GenerateRandomByteArray(count);
+            byte[] source = new Random(count).GenerateRandomByteArray(count);
            float[] result = new float[count];
            float[] expected = source.Select(b => (float)b / 255f).ToArray();

-            
-            SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);
+            convert(source, result);

            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
        }

-
-        public static readonly TheoryData<int> BulkConvertNormalizedFloatToByteClampOverflows_Data =
-            new TheoryData<int>
-                {
-                    0, 64, 1024
-                };
-
        [Theory]
-        [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
-        public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        [MemberData(nameof(ArraySizesDivisibleBy8))]
+        public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
        {
            if (this.SkipOnNonAvx2())
            {
                return;
            }

-            float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
-            byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
-            byte[] actual = new byte[count];
-
-            SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
-
-            Assert.Equal(expected, actual);
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
+                (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
+                );
        }

        [Theory]
-        [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
+        [MemberData(nameof(ArraySizesDivisibleBy32))]
        public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        {
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
+                (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
+            );
+        }
+
+        /// <summary>
+        /// Runs the shared normalized-float-to-byte (clamping) check against
+        /// <c>SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows</c> for every length in
+        /// <see cref="ArbitraryArraySizes"/>.
+        /// </summary>
+        /// <param name="count">The number of elements to convert.</param>
+        [Theory]
+        [MemberData(nameof(ArbitraryArraySizes))]
+        public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        {
+            // Adapt the Span-based API to the Memory-based delegate the shared test body expects.
+            Action<Memory<float>, Memory<byte>> convert =
+                (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span);
+
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, convert);
+        }
+
+        private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
+            int count,
+            Action<Memory<float>, Memory<byte>> convert)
        {
            float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
            byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
            byte[] actual = new byte[count];

-            SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
+            convert(source, actual);

            Assert.Equal(expected, actual);
        }
+
        private static byte NormalizedFloatToByte(float f) => (byte)Math.Min(255f, Math.Max(0f, f * 255f + 0.5f));

        [Theory]