Bulk conversion of arbitrary-sized Span-s of scalars

8 years ago · 81c57a812d
13 changed files with 537 additions and 290 deletions
--- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@ -0,0 +1,212 @@
 // Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.
 using System;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.Tuples;
 // ReSharper disable MemberHidesStaticFromOuterClass
 namespace SixLabors.ImageSharp
 {
    internal static partial class SimdUtils
    {
        /// <summary>
        /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
        /// </summary>
        public static class BasicIntrinsics256
        {
            /// <summary>
            /// Gets the value of <see cref="IsAvx2CompatibleArchitecture"/> captured when this class is initialized.
            /// The *Reduce methods of this class leave their spans untouched when this is false.
            /// </summary>
            public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
            /// <summary>
            /// Converts the longest prefix of <paramref name="source"/> whose length is a multiple of 8 with
            /// <see cref="BulkConvertByteToNormalizedFloat"/>, then slices both spans down so that only the
            /// unconverted remainder is left. Does nothing when <see cref="IsAvailable"/> is false.
            /// </summary>
            /// <param name="source">The input bytes; replaced by the unconverted tail.</param>
            /// <param name="dest">The output floats; replaced by the tail that has not been written.</param>
            internal static void BulkConvertByteToNormalizedFloatReduce(
                ref ReadOnlySpan<byte> source,
                ref Span<float> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                if (!IsAvailable)
                {
                    return;
                }
                // Length of the prefix that can be processed 8 elements at a time.
                int tail = source.Length % 8;
                int bulkLength = source.Length - tail;
                if (bulkLength <= 0)
                {
                    return;
                }
                ReadOnlySpan<byte> bulkSource = source.Slice(0, bulkLength);
                Span<float> bulkDest = dest.Slice(0, bulkLength);
                BulkConvertByteToNormalizedFloat(bulkSource, bulkDest);
                // Hand the remaining elements back to the caller.
                source = source.Slice(bulkLength);
                dest = dest.Slice(bulkLength);
            }
            /// <summary>
            /// Converts 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source'
            /// into the 'dest' buffer of <see cref="byte"/>. The values are scaled up into [0-255] and rounded.
            /// The implementation is SIMD optimized and works only with `source.Length` divisible by 8.
            /// No clamping is applied here; <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> is the clamping variant.
            /// Based on:
            /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
            /// </summary>
            /// <param name="source">The normalized input values.</param>
            /// <param name="dest">The output buffer; must hold at least 'source.Length' bytes.</param>
            internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
            {
                GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
                DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisible by Vector<float>.Count!");
                // The loop below writes through unchecked references, so a too-short 'dest' would be overrun silently.
                DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest should be at least as long as source!");
                if (source.Length == 0)
                {
                    return;
                }
                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
                int n = source.Length / 8;
                // Floats in [32768, 65536) are spaced 1/256 apart, so after adding 32768 to x * 255 / 256
                // the lowest byte of the bit pattern holds x * 255 rounded to an integer.
                Vector<float> magick = new Vector<float>(32768.0f);
                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
                // need to copy to a temporary struct, because
                // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
                // does not work. TODO: This might be a CoreClr bug, need to ask/report
                var temp = default(Octet.OfUInt32);
                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
                for (int i = 0; i < n; i++)
                {
                    // union { float f; uint32_t i; } u;
                    // u.f = 32768.0f + x * (255.0f / 256.0f);
                    // return (uint8_t)u.i;
                    Vector<float> x = Unsafe.Add(ref srcBase, i);
                    x = (x * scale) + magick;
                    // Writing through tempRef stores the float bits into 'temp', which is then read as 8 uint-s.
                    tempRef = x;
                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
                    d.LoadFrom(ref temp);
                }
            }
            /// <summary>
            /// SIMD optimized implementation for <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>.
            /// Works only with `dest.Length` divisible by 8.
            /// Implementation adapted from:
            /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions
            /// http://stackoverflow.com/a/536278
            /// </summary>
            /// <param name="source">The input bytes; must hold at least 'dest.Length' elements.</param>
            /// <param name="dest">The output buffer receiving the values normalized into [0..1].</param>
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
                GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
                DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(dest), "dest.Length should be divisible by 8!");
                // 'source' is read through unchecked references, one byte per destination element.
                DebugGuard.IsTrue(source.Length >= dest.Length, nameof(source), "source should be at least as long as dest!");
                var bVec = new Vector<float>(256.0f / 255.0f);
                var magicFloat = new Vector<float>(32768.0f);
                var magicInt = new Vector<uint>(1191182336); // reinterpreted value of 32768.0f
                var mask = new Vector<uint>(255);
                ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
                ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
                ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
                int n = dest.Length / 8;
                // First pass: widen every source byte into the 32-bit slot of the matching destination element.
                for (int i = 0; i < n; i++)
                {
                    ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
                    ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
                    d.LoadFrom(ref s);
                }
                // Second pass: turn the widened integers into normalized floats in place.
                for (int i = 0; i < n; i++)
                {
                    ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
                    var vi = Vector.AsVectorUInt32(df);
                    vi &= mask;
                    // OR-ing the bit pattern of 32768f makes the integer b read back as the float 32768 + b / 256.
                    vi |= magicInt;
                    var vf = Vector.AsVectorSingle(vi);
                    // (b / 256) * (256 / 255) == b / 255
                    vf = (vf - magicFloat) * bVec;
                    df = vf;
                }
            }
            /// <summary>
            /// Converts as many elements as possible with <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/>,
            /// slicing both spans down (keeping the remainder). Does nothing when <see cref="IsAvailable"/> is false.
            /// </summary>
            /// <param name="source">The normalized input values; replaced by the unconverted tail.</param>
            /// <param name="dest">The output bytes; replaced by the tail that has not been written.</param>
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                if (IsAvailable)
                {
                    // The bulk method only requires a length divisible by 8, so split at multiples of 8
                    // (like BulkConvertByteToNormalizedFloatReduce does) and leave at most 7 elements behind.
                    int remainder = source.Length % 8;
                    int alignedCount = source.Length - remainder;
                    if (alignedCount > 0)
                    {
                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
                        source = source.Slice(alignedCount);
                        dest = dest.Slice(alignedCount);
                    }
                }
            }
            /// <summary>
            /// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
            /// Works only with `source.Length` divisible by 8.
            /// </summary>
            /// <param name="source">The input values; each one is clamped into [0..1] before scaling.</param>
            /// <param name="dest">The output buffer; must hold at least 'source.Length' bytes.</param>
            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
            {
                GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
                DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");
                // The loop below writes through unchecked references, so a too-short 'dest' would be overrun silently.
                DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest should be at least as long as source!");
                if (source.Length == 0)
                {
                    return;
                }
                ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
                ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
                int n = source.Length / 8;
                // Floats in [32768, 65536) are spaced 1/256 apart, so after adding 32768 to x * 255 / 256
                // the lowest byte of the bit pattern holds x * 255 rounded to an integer.
                Vector<float> magick = new Vector<float>(32768.0f);
                Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
                // need to copy to a temporary struct, because
                // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
                // does not work. TODO: This might be a CoreClr bug, need to ask/report
                var temp = default(Octet.OfUInt32);
                ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
                for (int i = 0; i < n; i++)
                {
                    // union { float f; uint32_t i; } u;
                    // u.f = 32768.0f + x * (255.0f / 256.0f);
                    // return (uint8_t)u.i;
                    Vector<float> x = Unsafe.Add(ref srcBase, i);
                    // Clamp into [0..1] first so the scaled value always fits into a byte.
                    x = Vector.Max(x, Vector<float>.Zero);
                    x = Vector.Min(x, Vector<float>.One);
                    x = (x * scale) + magick;
                    // Writing through tempRef stores the float bits into 'temp', which is then read as 8 uint-s.
                    tempRef = x;
                    ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
                    d.LoadFrom(ref temp);
                }
            }
        }
    }
 }
--- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@ -1,8 +1,10 @@
 using System;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 // ReSharper disable MemberHidesStaticFromOuterClass
 namespace SixLabors.ImageSharp
 {
    internal static partial class SimdUtils
@ -18,22 +20,47 @@ namespace SixLabors.ImageSharp
        {
            public static bool IsAvailable { get; } =
 #if NETCOREAPP2_1
-// TODO: Also available in .NET 4.7.2, we need to add a build target!
+                // TODO: Also available in .NET 4.7.2, we need to add a build target!
-                true;
+                Vector.IsHardwareAccelerated;
 #else
                false;
 #endif
            /// <summary>
-            /// A variant of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/>, which is faster on new .NET runtime.
+            /// <see cref="BulkConvertByteToNormalizedFloat"/> as much elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            [Conditional("NETCOREAPP2_1")]
            internal static void BulkConvertByteToNormalizedFloatReduce(
                ref ReadOnlySpan<byte> source,
                ref Span<float> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                // Without the extended intrinsics both spans are left exactly as they are.
                if (!IsAvailable)
                {
                    return;
                }
                // Longest prefix made of whole Vector<byte>-sized groups.
                int bulkLength = source.Length - (source.Length % Vector<byte>.Count);
                if (bulkLength <= 0)
                {
                    return;
                }
                BulkConvertByteToNormalizedFloat(source.Slice(0, bulkLength), dest.Slice(0, bulkLength));
                // Hand the unconverted tail back to the caller.
                source = source.Slice(bulkLength);
                dest = dest.Slice(bulkLength);
            }
            /// <summary>
            /// A variant of <see cref="BasicIntrinsics256.BulkConvertByteToNormalizedFloat"/>, which is faster on new RyuJIT runtime.
            /// </summary>
            // ReSharper disable once MemberHidesStaticFromOuterClass
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
-                Guard.IsTrue(
+                DebugGuard.IsTrue(
                    dest.Length % Vector<byte>.Count == 0,
                    nameof(source),
-                    "dest.Length should be divisable by Vector<byte>.Count!");
+                    "dest.Length should be divisible by Vector<byte>.Count!");
                int n = dest.Length / Vector<byte>.Count;
@ -63,34 +90,52 @@ namespace SixLabors.ImageSharp
                }
            }
-            [MethodImpl(MethodImplOptions.AggressiveInlining)]
+            /// <summary>
-            private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
+            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as much elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            [Conditional("NETCOREAPP2_1")]
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
-                Vector<int> vi = Vector.AsVectorInt32(u);
+                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
-                Vector<float> v = Vector.ConvertToSingle(vi);
+
-                v *= scale;
+                if (IsAvailable)
-                return v;
+                {
                    int remainder = source.Length % Vector<byte>.Count;
                    int alignedCount = source.Length - remainder;
                    if (alignedCount > 0)
                    {
                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
                        source = source.Slice(alignedCount);
                        dest = dest.Slice(alignedCount);
                    }
                }
            }
            /// <summary>
-            /// A variant of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
+            /// A variant of <see cref="BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
            /// </summary>
            /// <remarks>
            /// It is NOT yet worth utilizing this method (2018 Oct).
            /// See benchmark results for the "PackFromVector4_Rgba32" benchmark!
            /// TODO: Check again later!
            /// </remarks>
-            // ReSharper disable once MemberHidesStaticFromOuterClass
+            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
-            internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
+                ReadOnlySpan<float> source,
                Span<byte> dest)
            {
-                Guard.IsTrue(
+                DebugGuard.IsTrue(
                    dest.Length % Vector<byte>.Count == 0,
                    nameof(dest),
-                    "dest.Length should be divisable by Vector<byte>.Count!");
+                    "dest.Length should be divisible by Vector<byte>.Count!");
                int n = dest.Length / Vector<byte>.Count;
-                ref Vector<float> sourceBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
+                ref Vector<float> sourceBase =
                    ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
                ref Vector<byte> destBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(dest));
                for (int i = 0; i < n; i++)
@ -126,6 +171,15 @@ namespace SixLabors.ImageSharp
                Vector<int> vi = Vector.ConvertToInt32(vf);
                return Vector.AsVectorUInt32(vi);
            }
            /// <summary>
            /// Reinterprets the lanes of <paramref name="u"/> as signed integers, converts them to
            /// <see cref="float"/> and multiplies the result by <paramref name="scale"/>.
            /// </summary>
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
            private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
            {
                Vector<float> asSingle = Vector.ConvertToSingle(Vector.AsVectorInt32(u));
                return asSingle * scale;
            }
        }
    }
-}
+}
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@ -6,6 +6,9 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.ImageSharp.PixelFormats;
 using SixLabors.ImageSharp.Tuples;
 namespace SixLabors.ImageSharp
 {
    /// <summary>
@ -16,7 +19,8 @@ namespace SixLabors.ImageSharp
        /// <summary>
        /// Gets a value indicating whether the code is being executed on an AVX2 CPU where both float and integer registers are 256 bits wide.
        /// </summary>
-        public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
+        public static bool IsAvx2CompatibleArchitecture { get; } =
            Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;
        internal static void GuardAvx2(string operation)
        {
@ -57,236 +61,61 @@ namespace SixLabors.ImageSharp
        }
        /// <summary>
-        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/> values.
+        /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
-        /// The values are scaled up into [0-255] and rounded.
+        /// <paramref name="source"/> should be the of the same size as <paramref name="dest"/>,
-        /// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
+        /// but there are no restrictions on the span's length.
        /// Based on:
        /// <see>
        ///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
        /// </see>
        /// </summary>
        internal static void BulkConvertNormalizedFloatToByte(ReadOnlySpan<float> source, Span<byte> dest)
        {
            GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
            if (source.Length == 0)
            {
                return;
            }
            // View the input as whole Vector<float>-s and the output as groups of 8 bytes.
            ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
            ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
            // Number of 8-element groups to convert.
            int n = source.Length / 8;
            // Floats in [32768, 65536) are spaced 1/256 apart, so after adding 32768 to x * 255 / 256
            // the lowest byte of the bit pattern holds x * 255 rounded to an integer.
            Vector<float> magick = new Vector<float>(32768.0f);
            Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
            // need to copy to a temporary struct, because
            // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
            // does not work. TODO: This might be a CoreClr bug, need to ask/report
            var temp = default(Octet.OfUInt32);
            // tempRef aliases 'temp': a vector stored through it is read back as 8 uint-s.
            ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
            for (int i = 0; i < n; i++)
            {
                // union { float f; uint32_t i; } u;
                // u.f = 32768.0f + x * (255.0f / 256.0f);
                // return (uint8_t)u.i;
                Vector<float> x = Unsafe.Add(ref srcBase, i);
                x = (x * scale) + magick;
                tempRef = x;
                ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
                // Octet.OfByte.LoadFrom(ref Octet.OfUInt32) is defined outside this method.
                // NOTE(review): presumably it keeps only the low byte of each uint - confirm.
                d.LoadFrom(ref temp);
            }
        }
        /// <summary>
        /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
        /// The implementation is SIMD optimized and works only with `dest.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
        /// Implementation adapted from:
        /// <see>
        ///     <cref>http://stackoverflow.com/a/5362789</cref>
        /// </see>
        /// </summary>
        internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
        {
-            GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
            DebugGuard.IsTrue((dest.Length % Vector<float>.Count) == 0, nameof(source), "dest.Length should be divisable by Vector<float>.Count!");
            var bVec = new Vector<float>(256.0f / 255.0f);
            var magicFloat = new Vector<float>(32768.0f);
            var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
            var mask = new Vector<uint>(255);
-            ref Octet.OfByte sourceBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(source));
+            ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
-            ref Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As<float, Octet.OfUInt32>(ref MemoryMarshal.GetReference(dest));
+            BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
            ref Vector<float> destBaseAsFloat = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref destBaseAsWideOctet);
            int n = dest.Length / 8;
            for (int i = 0; i < n; i++)
            {
                ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i);
                ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i);
                d.LoadFrom(ref s);
            }
-            for (int i = 0; i < n; i++)
+            // Deal with the remainder:
            int count = source.Length;
            if (count > 0)
            {
-                ref Vector<float> df = ref Unsafe.Add(ref destBaseAsFloat, i);
+                // TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
-
+                ref byte sBase = ref MemoryMarshal.GetReference(source);
-                var vi = Vector.AsVectorUInt32(df);
+                ref float dBase = ref MemoryMarshal.GetReference(dest);
-                vi &= mask;
+                for (int i = 0; i < count; i++)
-                vi |= magicInt;
+                {
-
+                    Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
-                var vf = Vector.AsVectorSingle(vi);
+                }
                vf = (vf - magicFloat) * bVec;
                df = vf;
            }
        }
        /// <summary>
-        /// Same as <see cref="BulkConvertNormalizedFloatToByte"/> but clamps overflown values before conversion.
+        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
        /// The values are scaled up into [0-255] and rounded, overflows are clamped.
        /// <paramref name="source"/> should be the of the same size as <paramref name="dest"/>,
        /// but there are no restrictions on the span's length.
        /// </summary>
        internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
        {
-            GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
+            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
            if (source.Length == 0)
            {
                return;
            }
            ref Vector<float> srcBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(source));
            ref Octet.OfByte destBase = ref Unsafe.As<byte, Octet.OfByte>(ref MemoryMarshal.GetReference(dest));
            int n = source.Length / 8;
            Vector<float> magick = new Vector<float>(32768.0f);
            Vector<float> scale = new Vector<float>(255f) / new Vector<float>(256f);
            // need to copy to a temporary struct, because
            // SimdUtils.Octet.OfUInt32 temp = Unsafe.As<Vector<float>, SimdUtils.Octet.OfUInt32>(ref x)
            // does not work. TODO: This might be a CoreClr bug, need to ask/report
            var temp = default(Octet.OfUInt32);
            ref Vector<float> tempRef = ref Unsafe.As<Octet.OfUInt32, Vector<float>>(ref temp);
            for (int i = 0; i < n; i++)
            {
                // union { float f; uint32_t i; } u;
                // u.f = 32768.0f + x * (255.0f / 256.0f);
                // return (uint8_t)u.i;
                Vector<float> x = Unsafe.Add(ref srcBase, i);
                x = Vector.Max(x, Vector<float>.Zero);
                x = Vector.Min(x, Vector<float>.One);
                x = (x * scale) + magick;
                tempRef = x;
                ref Octet.OfByte d = ref Unsafe.Add(ref destBase, i);
                d.LoadFrom(ref temp);
            }
        }
        // TODO: Replace these with T4-d library level tuples!
        internal static class Octet
        {
            [StructLayout(LayoutKind.Explicit, Size = 8 * sizeof(uint))]
            public struct OfUInt32
            {
                [FieldOffset(0 * sizeof(uint))]
                public uint V0;
                [FieldOffset(1 * sizeof(uint))]
                public uint V1;
                [FieldOffset(2 * sizeof(uint))]
                public uint V2;
                [FieldOffset(3 * sizeof(uint))]
                public uint V3;
                [FieldOffset(4 * sizeof(uint))]
                public uint V4;
                [FieldOffset(5 * sizeof(uint))]
                public uint V5;
-                [FieldOffset(6 * sizeof(uint))]
+            ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
-                public uint V6;
+            BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
-                [FieldOffset(7 * sizeof(uint))]
+            // Deal with the remainder:
-                public uint V7;
+            int count = source.Length;
-
+            if (count > 0)
                public override string ToString()
                {
                    return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
                }
                [MethodImpl(InliningOptions.ShortMethod)]
                public void LoadFrom(ref OfByte src)
                {
                    this.V0 = src.V0;
                    this.V1 = src.V1;
                    this.V2 = src.V2;
                    this.V3 = src.V3;
                    this.V4 = src.V4;
                    this.V5 = src.V5;
                    this.V6 = src.V6;
                    this.V7 = src.V7;
                }
            }
            [StructLayout(LayoutKind.Explicit, Size = 8)]
            public struct OfByte
            {
-                [FieldOffset(0)]
+                ref float sBase = ref MemoryMarshal.GetReference(source);
-                public byte V0;
+                ref byte dBase = ref MemoryMarshal.GetReference(dest);
                [FieldOffset(1)]
                public byte V1;
                [FieldOffset(2)]
                public byte V2;
                [FieldOffset(3)]
                public byte V3;
                [FieldOffset(4)]
                public byte V4;
                [FieldOffset(5)]
                public byte V5;
                [FieldOffset(6)]
                public byte V6;
                [FieldOffset(7)]
                public byte V7;
                public override string ToString()
                {
                    return $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
                }
-                [MethodImpl(InliningOptions.ShortMethod)]
+                for (int i = 0; i < count; i++)
                public void LoadFrom(ref OfUInt32 src)
                {
-                    this.V0 = (byte)src.V0;
+                    // TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
-                    this.V1 = (byte)src.V1;
+                    float f = Unsafe.Add(ref sBase, i);
-                    this.V2 = (byte)src.V2;
+                    f *= 255f;
-                    this.V3 = (byte)src.V3;
+                    f += 0.5f;
-                    this.V4 = (byte)src.V4;
+                    f = MathF.Max(0, f);
-                    this.V5 = (byte)src.V5;
+                    f = MathF.Min(255f, f);
-                    this.V6 = (byte)src.V6;
+
-                    this.V7 = (byte)src.V7;
+                    Unsafe.Add(ref dBase, i) = (byte)f;
                }
            }
        }
--- a/src/ImageSharp/Common/Tuples/Octet.cs
+++ b/src/ImageSharp/Common/Tuples/Octet.cs
@ -0,0 +1,100 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 namespace SixLabors.ImageSharp.Tuples
 {
    /// <summary>
    /// Explicitly laid out tuples of eight values: one of <see cref="uint"/>-s, one of <see cref="byte"/>-s.
    /// </summary>
    internal static class Octet
    {
        /// <summary>
        /// Eight consecutive <see cref="uint"/> values, 32 bytes in total.
        /// </summary>
        [StructLayout(LayoutKind.Explicit, Size = 32)]
        public struct OfUInt32
        {
            [FieldOffset(0)]
            public uint V0;
            [FieldOffset(4)]
            public uint V1;
            [FieldOffset(8)]
            public uint V2;
            [FieldOffset(12)]
            public uint V3;
            [FieldOffset(16)]
            public uint V4;
            [FieldOffset(20)]
            public uint V5;
            [FieldOffset(24)]
            public uint V6;
            [FieldOffset(28)]
            public uint V7;
            /// <inheritdoc/>
            public override string ToString() =>
                $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
            /// <summary>
            /// Widens the eight bytes of <paramref name="src"/> into the eight fields of this instance.
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            public void LoadFrom(ref OfByte src)
            {
                // Each byte is implicitly widened to uint, lane by lane.
                this.V0 = src.V0;
                this.V1 = src.V1;
                this.V2 = src.V2;
                this.V3 = src.V3;
                this.V4 = src.V4;
                this.V5 = src.V5;
                this.V6 = src.V6;
                this.V7 = src.V7;
            }
        }
        /// <summary>
        /// Eight consecutive <see cref="byte"/> values, 8 bytes in total.
        /// </summary>
        [StructLayout(LayoutKind.Explicit, Size = 8)]
        public struct OfByte
        {
            [FieldOffset(0)]
            public byte V0;
            [FieldOffset(1)]
            public byte V1;
            [FieldOffset(2)]
            public byte V2;
            [FieldOffset(3)]
            public byte V3;
            [FieldOffset(4)]
            public byte V4;
            [FieldOffset(5)]
            public byte V5;
            [FieldOffset(6)]
            public byte V6;
            [FieldOffset(7)]
            public byte V7;
            /// <inheritdoc/>
            public override string ToString() =>
                $"[{this.V0},{this.V1},{this.V2},{this.V3},{this.V4},{this.V5},{this.V6},{this.V7}]";
            /// <summary>
            /// Narrows the eight fields of <paramref name="src"/> into this instance,
            /// keeping only the lowest 8 bits of every value.
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            public void LoadFrom(ref OfUInt32 src)
            {
                // The explicit casts drop the upper 24 bits of each lane.
                this.V0 = (byte)src.V0;
                this.V1 = (byte)src.V1;
                this.V2 = (byte)src.V2;
                this.V3 = (byte)src.V3;
                this.V4 = (byte)src.V4;
                this.V5 = (byte)src.V5;
                this.V6 = (byte)src.V6;
                this.V7 = (byte)src.V7;
            }
        }
    }
 }
--- a/src/ImageSharp/Common/Tuples/Vector4Pair.cs
+++ b/src/ImageSharp/Common/Tuples/Vector4Pair.cs
@ -2,7 +2,7 @@
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-namespace SixLabors.ImageSharp.Common.Tuples
+namespace SixLabors.ImageSharp.Tuples
 {
    /// <summary>
    /// It's faster to process multiple Vector4-s together, so let's pair them!
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimd.cs
@ -6,7 +6,7 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
 {
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.FromYCbCrSimdAvx2.cs
@ -6,7 +6,7 @@ using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
-using SixLabors.ImageSharp.Common.Tuples;
+using SixLabors.ImageSharp.Tuples;
 // ReSharper disable ImpureMethodCallOnReadonlyValueField
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
--- a/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
+++ b/src/ImageSharp/Formats/Jpeg/Components/Decoder/ColorConverters/JpegColorConverter.cs
@ -6,8 +6,8 @@ using System.Collections.Generic;
 using System.Linq;
 using System.Numerics;
 using SixLabors.ImageSharp.Common.Tuples;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.Tuples;
 using SixLabors.Memory;
 namespace SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters
--- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
+++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
@ -37,7 +37,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
                else
                {
-                    ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count);
+                    ConvertToVector4UsingBasicIntrinsics(sourceColors, destinationVectors, count);
                }
            }
@ -58,7 +58,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
                else
                {
-                    ConvertFromVector4StandardIntrinsics(sourceVectors, destinationColors, count);
+                    ConvertFromVector4BasicIntrinsics(sourceVectors, destinationColors, count);
                }
            }
@ -112,7 +112,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
            }
-            private static void ConvertToVector4UsingStandardIntrinsics(
+            private static void ConvertToVector4UsingBasicIntrinsics(
                ReadOnlySpan<Rgba32> sourceColors,
                Span<Vector4> destinationVectors,
                int count)
@ -125,7 +125,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                    ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors);
                    Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
-                    SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
+                    SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(rawSrc, rawDest);
                }
                if (remainder > 0)
@ -155,7 +155,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                }
            }
-            private static void ConvertFromVector4StandardIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
+            private static void ConvertFromVector4BasicIntrinsics(ReadOnlySpan<Vector4> sourceVectors, Span<Rgba32> destinationColors, int count)
            {
                int remainder = count % 2;
                int alignedCount = count - remainder;
@ -165,7 +165,7 @@ namespace SixLabors.ImageSharp.PixelFormats
                    ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
                    Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
-                    SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
+                    SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
                }
                if (remainder > 0)
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@ -30,8 +30,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
        [Params(
            //64, 
            //256,
-            //512,
+            512
-            2048
+            //1024
            )]
        public int Count { get; set; }
@ -117,7 +117,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
            SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
        }
-        //[Benchmark]
+        [Benchmark]
        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops()
        {
            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
@ -159,7 +159,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
            }
        }
-        //[Benchmark]
+        [Benchmark]
        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop()
        {
            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
--- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs
@ -5,6 +5,7 @@ using BenchmarkDotNet.Attributes;
 namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
 {
    [Config(typeof(Config.ShortClr))]
    public class UInt32ToSingle
    {
        private float[] data;
@ -66,8 +67,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
                Unsafe.Add(ref bf, i) = v;
            }
        }
-
+        
        // This code is not correct at all, it's just here as reference
        [Benchmark]
        public void StandardSimdFromInt()
        {
@ -86,5 +86,28 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
                Unsafe.Add(ref bf, i) = v;
            }
        }
        /// <summary>
        /// Benchmark variant that works in place on <c>data</c>: every <see cref="Vector{T}"/>-sized
        /// slot is reinterpreted as <c>Vector&lt;int&gt;</c>, converted to single precision, multiplied
        /// by 1/255 and written back through the same reference.
        /// </summary>
        [Benchmark]
        public void StandardSimdFromInt_RefCast()
        {
            // Only whole vectors are processed; any trailing elements are left untouched.
            int n = Count / Vector<float>.Count;

            ref Vector<float> bf = ref Unsafe.As<float, Vector<float>>(ref this.data[0]);
            var scale = new Vector<float>(1f / 255f);

            for (int i = 0; i < n; i++)
            {
                ref Vector<float> fRef = ref Unsafe.Add(ref bf, i);

                // Bit-reinterpret the stored floats as ints, then do the numeric int -> float conversion.
                Vector<int> du = Vector.AsVectorInt32(fRef);
                Vector<float> v = Vector.ConvertToSingle(du);
                v *= scale;

                fRef = v;
            }
        }
    }
 }
--- a/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
+++ b/tests/ImageSharp.Benchmarks/General/Vectorization/WidenBytesToUInt32.cs
@ -3,8 +3,11 @@ using System.Runtime.CompilerServices;
 using BenchmarkDotNet.Attributes;
 using SixLabors.ImageSharp.Tuples;
 namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
 {
    [Config(typeof(Config.ShortClr))]
    public class WidenBytesToUInt32
    {
        private byte[] source;
@ -25,8 +28,8 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization
        {
            const int N = Count / 8;
-            ref SimdUtils.Octet.OfByte sBase = ref Unsafe.As<byte, SimdUtils.Octet.OfByte>(ref this.source[0]);
+            ref Octet.OfByte sBase = ref Unsafe.As<byte, Octet.OfByte>(ref this.source[0]);
-            ref SimdUtils.Octet.OfUInt32 dBase = ref Unsafe.As<uint, SimdUtils.Octet.OfUInt32>(ref this.dest[0]);
+            ref Octet.OfUInt32 dBase = ref Unsafe.As<uint, Octet.OfUInt32>(ref this.dest[0]);
            for (int i = 0; i < N; i++)
            {
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@ -62,7 +62,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        {
            float[] data = new float[Vector<float>.Count];
-            var rnd = new Random();
+            var rnd = new Random(seed);
            for (int i = 0; i < Vector<float>.Count; i++)
            {
@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        [InlineData(1, 8)]
        [InlineData(2, 16)]
        [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
+        public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -130,7 +130,7 @@ namespace SixLabors.ImageSharp.Tests.Common
            byte[] dest = new byte[count];
-            SimdUtils.BulkConvertNormalizedFloatToByte(normalized, dest);
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(normalized, dest);
            byte[] expected = orig.Select(f => (byte)(f)).ToArray();
@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        [InlineData(1, 8)]
        [InlineData(2, 16)]
        [InlineData(3, 128)]
-        public void BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
+        public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -153,87 +153,113 @@ namespace SixLabors.ImageSharp.Tests.Common
            byte[] dest = new byte[count];
-            SimdUtils.BulkConvertNormalizedFloatToByte(source, dest);
+            SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByte(source, dest);
            byte[] expected = source.Select(f => (byte)Math.Round(f * 255f)).ToArray();
            Assert.Equal(expected, dest);
        }
        // Element counts that are multiples of 8, including the empty case.
        public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };

        // Element counts that are multiples of 32, including the empty case.
        public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };

        // Element counts with no particular divisibility, clustered around small sizes and around 512.
        // Each count is listed exactly once so that no theory case is generated twice.
        public static readonly TheoryData<int> ArbitraryArraySizes =
            new TheoryData<int>
                {
                    0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
                };
        [Theory]
-        [InlineData(1, 0)]
+        [MemberData(nameof(ArraySizesDivisibleBy8))]
-        [InlineData(2, 32)]
+        public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
        [InlineData(3, 128)]
        public void BulkConvertByteToNormalizedFloat(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
                return;
            }
-            byte[] source = new Random(seed).GenerateRandomByteArray(count);
+            TestImpl_BulkConvertByteToNormalizedFloat(
-            float[] result = new float[count];
+                count,
-            float[] expected = source.Select(b => (float)b / 255f).ToArray();
+                (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
            SimdUtils.BulkConvertByteToNormalizedFloat(source, result);
            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
        }
        [Theory]
-        [InlineData(1, 0)]
+        [MemberData(nameof(ArraySizesDivisibleBy32))]
-        [InlineData(2, 32)]
+        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int count)
-        [InlineData(3, 128)]
+        {
-        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
+            TestImpl_BulkConvertByteToNormalizedFloat(
                count,
                (s, d) => SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(s.Span, d.Span));
        }
        /// <summary>
        /// Runs the shared byte-to-normalized-float check against the top-level
        /// <c>SimdUtils.BulkConvertByteToNormalizedFloat</c> entry point for each size in
        /// <see cref="ArbitraryArraySizes"/>.
        /// </summary>
        [Theory]
        [MemberData(nameof(ArbitraryArraySizes))]
        public void BulkConvertByteToNormalizedFloat(int count)
        {
            Action<Memory<byte>, Memory<float>> convert =
                (src, dst) => SimdUtils.BulkConvertByteToNormalizedFloat(src.Span, dst.Span);

            TestImpl_BulkConvertByteToNormalizedFloat(count, convert);
        }
        private static void TestImpl_BulkConvertByteToNormalizedFloat(
            int count,
            Action<Memory<byte>, Memory<float>> convert)
        {
-            byte[] source = new Random(seed).GenerateRandomByteArray(count);
+            byte[] source = new Random(count).GenerateRandomByteArray(count);
            float[] result = new float[count];
            float[] expected = source.Select(b => (float)b / 255f).ToArray();
-            
+            convert(source, result);
            SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);
            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
        }
        // Element counts 0, 64 and 1024 exposed as xUnit theory data.
        // NOTE(review): the [MemberData] references to this set that are visible nearby are being
        // replaced by other data sets - confirm whether this field is still referenced anywhere.
        public static readonly TheoryData<int> BulkConvertNormalizedFloatToByteClampOverflows_Data =
            new TheoryData<int>
                {
                    0, 64, 1024
                };
        [Theory]
-        [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
+        [MemberData(nameof(ArraySizesDivisibleBy8))]
-        public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
        {
            if (this.SkipOnNonAvx2())
            {
                return;
            }
-            float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
+            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
-            byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
+                (s, d) => SimdUtils.BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
-            byte[] actual = new byte[count];
+                );
            SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
            Assert.Equal(expected, actual);
        }
        [Theory]
-        [MemberData(nameof(BulkConvertNormalizedFloatToByteClampOverflows_Data))]
+        [MemberData(nameof(ArraySizesDivisibleBy32))]
        public void ExtendedIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
        {
            // Route the shared float-to-byte check through the ExtendedIntrinsics implementation.
            Action<Memory<float>, Memory<byte>> convert =
                (src, dst) => SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(src.Span, dst.Span);

            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, convert);
        }
        /// <summary>
        /// Runs the shared float-to-byte check against the top-level
        /// <c>SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows</c> entry point for each size in
        /// <see cref="ArbitraryArraySizes"/>.
        /// </summary>
        [Theory]
        [MemberData(nameof(ArbitraryArraySizes))]
        public void BulkConvertNormalizedFloatToByteClampOverflows(int count)
        {
            Action<Memory<float>, Memory<byte>> convert =
                (src, dst) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(src.Span, dst.Span);

            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count, convert);
        }
        private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
            int count,
            Action<Memory<float>, Memory<byte>> convert)
        {
            float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
            byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
            byte[] actual = new byte[count];
-            SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(source, actual);
+            convert(source, actual);
            Assert.Equal(expected, actual);
        }
        // Scalar reference conversion: scale by 255, add 0.5, clamp to [0, 255], then truncate to byte.
        private static byte NormalizedFloatToByte(float f)
        {
            float scaled = f * 255f + 0.5f;
            return (byte)Math.Min(255f, Math.Max(0f, scaled));
        }
        [Theory]