minimize ceremonial overhead

in BulkConvertByteToNormalizedFloat() and BulkConvertNormalizedFloatToByteClampOverflows()
8 years ago · 9b0ee6fb2a
15 changed files with 406 additions and 272 deletions
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@ -39,22 +39,31 @@ namespace SixLabors.ImageSharp
            return (a / GreatestCommonDivisor(a, b)) * b;
        }
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        /// <summary>
-        public static int Modulo4(int a) => a & 3;
+        /// Calculates <paramref name="x"/> % 4
        /// </summary>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static int Modulo4(int x) => x & 3;
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        /// <summary>
-        public static int Modulo8(int a) => a & 7;
+        /// Calculates <paramref name="x"/> % 8
        /// </summary>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static int Modulo8(int x) => x & 7;
        /// <summary>
-        /// Fast (mod m) calculator,
+        /// Fast (x mod m) calculator, with the restriction that
-        /// where <paramref name="m"/> should be a power of 2.
+        /// <paramref name="m"/> should be a power of 2.
        /// </summary>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
-        public static int ModuloP2(int a, int m)
+        public static int ModuloP2(int x, int m)
        {
-            return a & (m - 1);
+            return x & (m - 1);
        }
        /// <summary>
        /// Bounds <paramref name="x"/> from below by <paramref name="min"/> and then
        /// from above by <paramref name="max"/>.
        /// </summary>
        /// <param name="x">The value to restrict.</param>
        /// <param name="min">The lower bound, applied first.</param>
        /// <param name="max">The upper bound, applied last.</param>
        /// <returns>The result of <c>Math.Min(max, Math.Max(min, x))</c>.</returns>
        [MethodImpl(InliningOptions.ShortMethod)]
        public static float Clamp(float x, float min, float max)
        {
            // Apply the lower bound first, then the upper bound, exactly in that order.
            float lowerBounded = Math.Max(min, x);
            return Math.Min(max, lowerBounded);
        }
        /// <summary>
        /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation.
        /// </summary>
@ -62,7 +71,7 @@ namespace SixLabors.ImageSharp
        /// A number that is greater than <see cref="int.MinValue"/>, but less than or equal to <see cref="int.MaxValue"/>
        /// </param>
        /// <returns>The <see cref="int"/></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static int FastAbs(int x)
        {
            int y = x >> 31;
@ -74,7 +83,7 @@ namespace SixLabors.ImageSharp
        /// </summary>
        /// <param name="x">A single-precision floating-point number</param>
        /// <returns>The number <paramref name="x" /> raised to the power of 2.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static float Pow2(float x) => x * x;
        /// <summary>
@ -82,7 +91,7 @@ namespace SixLabors.ImageSharp
        /// </summary>
        /// <param name="x">A single-precision floating-point number</param>
        /// <returns>The number <paramref name="x" /> raised to the power of 3.</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static float Pow3(float x) => x * x * x;
        /// <summary>
@ -93,7 +102,7 @@ namespace SixLabors.ImageSharp
        /// <returns>
        /// The <see cref="int"/>
        /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static int GetBitsNeededForColorDepth(int colors) => Math.Max(1, (int)Math.Ceiling(Math.Log(colors, 2)));
        /// <summary>
@ -101,7 +110,7 @@ namespace SixLabors.ImageSharp
        /// </summary>
        /// <param name="bitDepth">The bit depth.</param>
        /// <returns>The <see cref="int"/></returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static int GetColorCountForBitDepth(int bitDepth) => 1 << bitDepth;
        /// <summary>
@ -110,7 +119,7 @@ namespace SixLabors.ImageSharp
        /// <param name="x">The x provided to G(x).</param>
        /// <param name="sigma">The spread of the blur.</param>
        /// <returns>The Gaussian G(x)</returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static float Gaussian(float x, float sigma)
        {
            const float Numerator = 1.0f;
@ -133,7 +142,7 @@ namespace SixLabors.ImageSharp
        /// <returns>
        /// The sine cardinal of <paramref name="f" />.
        /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static float SinC(float f)
        {
            if (MathF.Abs(f) > Constants.Epsilon)
@ -156,7 +165,7 @@ namespace SixLabors.ImageSharp
        /// <returns>
        /// The <see cref="float"/>.
        /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static float GetBcValue(float x, float b, float c)
        {
            if (x < 0F)
@ -192,7 +201,7 @@ namespace SixLabors.ImageSharp
        /// <returns>
        /// The bounding <see cref="Rectangle"/>.
        /// </returns>
-        [MethodImpl(MethodImplOptions.AggressiveInlining)]
+        [MethodImpl(InliningOptions.ShortMethod)]
        public static Rectangle GetBoundingRectangle(Point topLeft, Point bottomRight) => new Rectangle(topLeft.X, topLeft.Y, bottomRight.X - topLeft.X, bottomRight.Y - topLeft.Y);
        /// <summary>
--- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@ -21,28 +21,58 @@ namespace SixLabors.ImageSharp
            public static bool IsAvailable { get; } = IsAvx2CompatibleArchitecture;
            /// <summary>
-            /// <see cref="BulkConvertByteToNormalizedFloat"/> as much elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            internal static void BulkConvertByteToNormalizedFloatReduce(
                ref ReadOnlySpan<byte> source,
                ref Span<float> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
-                if (IsAvailable)
+                if (!IsAvailable)
                {
-                    int remainder = source.Length % 8;
+                    return;
-                    int alignedCount = source.Length - remainder;
+                }
-
+
-                    if (alignedCount > 0)
+                int remainder = ImageMaths.Modulo8(source.Length);
-                    {
+                int adjustedCount = source.Length - remainder;
-                        BulkConvertByteToNormalizedFloat(
+
-                            source.Slice(0, alignedCount),
+                if (adjustedCount > 0)
-                            dest.Slice(0, alignedCount));
+                {
-
+                    BulkConvertByteToNormalizedFloat(
-                        source = source.Slice(alignedCount);
+                        source.Slice(0, adjustedCount),
-                        dest = dest.Slice(alignedCount);
+                        dest.Slice(0, adjustedCount));
-                    }
+
                    source = source.Slice(adjustedCount);
                    dest = dest.Slice(adjustedCount);
                }
            }
            /// <summary>
            /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> over the longest leading part of
            /// the input whose length is a multiple of 8, then slices both spans so that only the
            /// unprocessed remainder is left in them.
            /// Leaves both spans untouched when <see cref="IsAvailable"/> is false.
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

                if (!IsAvailable)
                {
                    return;
                }

                int length = source.Length;

                // Drop the trailing (length mod 8) elements; they are left for the caller.
                int convertibleCount = length - ImageMaths.Modulo8(length);

                if (convertibleCount <= 0)
                {
                    return;
                }

                ReadOnlySpan<float> head = source.Slice(0, convertibleCount);
                Span<byte> headDest = dest.Slice(0, convertibleCount);
                BulkConvertNormalizedFloatToByteClampOverflows(head, headDest);

                // Advance both spans past the part that has just been converted.
                source = source.Slice(convertibleCount);
                dest = dest.Slice(convertibleCount);
            }
@ -57,7 +87,7 @@ namespace SixLabors.ImageSharp
            {
                GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
-                DebugGuard.IsTrue((dest.Length % 8) == 0, nameof(source), "dest.Length should be divisable by 8!");
+                DebugGuard.IsTrue(ImageMaths.Modulo8(dest.Length) == 0, nameof(source), "dest.Length should be divisable by 8!");
                var bVec = new Vector<float>(256.0f / 255.0f);
                var magicFloat = new Vector<float>(32768.0f);
@ -93,30 +123,6 @@ namespace SixLabors.ImageSharp
                }
            }
            /// <summary>
            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as much elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                if (IsAvailable)
                {
                    int remainder = source.Length % Vector<byte>.Count;
                    int alignedCount = source.Length - remainder;
                    if (alignedCount > 0)
                    {
                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
                        source = source.Slice(alignedCount);
                        dest = dest.Slice(alignedCount);
                    }
                }
            }
            /// <summary>
            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> which is faster on older runtimes.
            /// </summary>
@ -124,7 +130,7 @@ namespace SixLabors.ImageSharp
            {
                GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));
-                DebugGuard.IsTrue((source.Length % 8) == 0, nameof(source), "source.Length should be divisible by 8!");
+                DebugGuard.IsTrue(ImageMaths.Modulo8(source.Length) == 0, nameof(source), "source.Length should be divisible by 8!");
                if (source.Length == 0)
                {
@ -174,7 +180,10 @@ namespace SixLabors.ImageSharp
            {
                GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
-                DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");
+                DebugGuard.IsTrue(
                    ImageMaths.Modulo8(source.Length) == 0,
                    nameof(source),
                    "source.Length should be divisible by 8!");
                if (source.Length == 0)
                {
--- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@ -28,27 +28,58 @@ namespace SixLabors.ImageSharp
 #endif
            /// <summary>
-            /// <see cref="BulkConvertByteToNormalizedFloat"/> as much elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
            /// </summary>
-            [Conditional("NETCOREAPP2_1")]
+            [MethodImpl(InliningOptions.ShortMethod)]
            internal static void BulkConvertByteToNormalizedFloatReduce(
                ref ReadOnlySpan<byte> source,
                ref Span<float> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
-                if (IsAvailable)
+                if (!IsAvailable)
                {
-                    int remainder = source.Length % Vector<byte>.Count;
+                    return;
-                    int alignedCount = source.Length - remainder;
+                }
                int remainder = ImageMaths.ModuloP2(source.Length, Vector<byte>.Count);
                int adjustedCount = source.Length - remainder;
-                    if (alignedCount > 0)
+                if (adjustedCount > 0)
-                    {
+                {
-                        BulkConvertByteToNormalizedFloat(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
+                    BulkConvertByteToNormalizedFloat(source.Slice(0, adjustedCount), dest.Slice(0, adjustedCount));
-                        source = source.Slice(alignedCount);
+                    source = source.Slice(adjustedCount);
-                        dest = dest.Slice(alignedCount);
+                    dest = dest.Slice(adjustedCount);
-                    }
+                }
            }
            /// <summary>
            /// Runs <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> over the longest leading part of
            /// the input whose length is a multiple of <see cref="Vector{T}.Count"/> for <see cref="byte"/>,
            /// then slices both spans so that only the unprocessed remainder is left in them.
            /// Leaves both spans untouched when <see cref="IsAvailable"/> is false.
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");

                if (!IsAvailable)
                {
                    return;
                }

                int length = source.Length;

                // Drop the trailing (length mod Vector<byte>.Count) elements; they are left for the caller.
                int convertibleCount = length - ImageMaths.ModuloP2(length, Vector<byte>.Count);

                if (convertibleCount <= 0)
                {
                    return;
                }

                ReadOnlySpan<float> head = source.Slice(0, convertibleCount);
                Span<byte> headDest = dest.Slice(0, convertibleCount);
                BulkConvertNormalizedFloatToByteClampOverflows(head, headDest);

                // Advance both spans past the part that has just been converted.
                source = source.Slice(convertibleCount);
                dest = dest.Slice(convertibleCount);
            }
@ -58,7 +89,7 @@ namespace SixLabors.ImageSharp
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
                DebugGuard.IsTrue(
-                    dest.Length % Vector<byte>.Count == 0,
+                    ImageMaths.ModuloP2(dest.Length, Vector<byte>.Count) == 0,
                    nameof(source),
                    "dest.Length should be divisible by Vector<byte>.Count!");
@ -67,8 +98,6 @@ namespace SixLabors.ImageSharp
                ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
                ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));
                var scale = new Vector<float>(1f / 255f);
                for (int i = 0; i < n; i++)
                {
                    Vector<byte> b = Unsafe.Add(ref sourceBase, i);
@ -77,10 +106,10 @@ namespace SixLabors.ImageSharp
                    Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
                    Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
-                    Vector<float> f0 = ConvertToSingle(w0, scale);
+                    Vector<float> f0 = ConvertToSingle(w0);
-                    Vector<float> f1 = ConvertToSingle(w1, scale);
+                    Vector<float> f1 = ConvertToSingle(w1);
-                    Vector<float> f2 = ConvertToSingle(w2, scale);
+                    Vector<float> f2 = ConvertToSingle(w2);
-                    Vector<float> f3 = ConvertToSingle(w3, scale);
+                    Vector<float> f3 = ConvertToSingle(w3);
                    ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
                    d = f0;
@ -90,31 +119,6 @@ namespace SixLabors.ImageSharp
                }
            }
            /// <summary>
            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as much elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            [Conditional("NETCOREAPP2_1")]
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                if (IsAvailable)
                {
                    int remainder = source.Length % Vector<byte>.Count;
                    int alignedCount = source.Length - remainder;
                    if (alignedCount > 0)
                    {
                        BulkConvertNormalizedFloatToByteClampOverflows(source.Slice(0, alignedCount), dest.Slice(0, alignedCount));
                        source = source.Slice(alignedCount);
                        dest = dest.Slice(alignedCount);
                    }
                }
            }
            /// <summary>
            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/>, which is faster on new .NET runtime.
            /// </summary>
@ -123,7 +127,7 @@ namespace SixLabors.ImageSharp
                Span<byte> dest)
            {
                DebugGuard.IsTrue(
-                    dest.Length % Vector<byte>.Count == 0,
+                    ImageMaths.ModuloP2(dest.Length, Vector<byte>.Count) == 0,
                    nameof(dest),
                    "dest.Length should be divisible by Vector<byte>.Count!");
@ -168,11 +172,11 @@ namespace SixLabors.ImageSharp
            }
            [MethodImpl(MethodImplOptions.AggressiveInlining)]
-            private static Vector<float> ConvertToSingle(Vector<uint> u, Vector<float> scale)
+            private static Vector<float> ConvertToSingle(Vector<uint> u)
            {
                Vector<int> vi = Vector.AsVectorInt32(u);
                Vector<float> v = Vector.ConvertToSingle(vi);
-                v *= scale;
+                v *= new Vector<float>(1f / 255f);
                return v;
            }
        }
--- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
@ -1,71 +1,81 @@
-using System;
+// Copyright (c) Six Labors and contributors.
 // Licensed under the Apache License, Version 2.0.
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 // ReSharper disable MemberHidesStaticFromOuterClass
 namespace SixLabors.ImageSharp
 {
    internal static partial class SimdUtils
    {
        /// <summary>
        /// Fallback implementation based on <see cref="Vector4"/> (128bit).
-        /// For <see cref="Vector4"/>, efficient software fallback implementations are present
+        /// For <see cref="Vector4"/>, efficient software fallback implementations are present,
-        /// + maybe even mono can emit intrinsics for that type :P
+        /// and we hope that even mono's JIT is able to emit SIMD instructions for that type :P
        /// </summary>
        public static class FallbackIntrinsics128
        {
            /// <summary>
-            /// <see cref="BulkConvertByteToNormalizedFloat"/> as much elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="BulkConvertByteToNormalizedFloat"/> as many elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            internal static void BulkConvertByteToNormalizedFloatReduce(
                ref ReadOnlySpan<byte> source,
                ref Span<float> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
-                int remainder = source.Length % 4;
+                int remainder = ImageMaths.Modulo4(source.Length);
-                int alignedCount = source.Length - remainder;
+                int adjustedCount = source.Length - remainder;
-                if (alignedCount > 0)
+                if (adjustedCount > 0)
                {
                    BulkConvertByteToNormalizedFloat(
-                        source.Slice(0, alignedCount),
+                        source.Slice(0, adjustedCount),
-                        dest.Slice(0, alignedCount));
+                        dest.Slice(0, adjustedCount));
-                    source = source.Slice(alignedCount);
+                    source = source.Slice(adjustedCount);
-                    dest = dest.Slice(alignedCount);
+                    dest = dest.Slice(adjustedCount);
                }
            }
            /// <summary>
-            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as much elements as possible, slicing them down (keeping the remainder).
+            /// <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/> as many elements as possible, slicing them down (keeping the remainder).
            /// </summary>
            [MethodImpl(InliningOptions.ShortMethod)]
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
-                int remainder = source.Length % 4;
+                int remainder = ImageMaths.Modulo4(source.Length);
-                int alignedCount = source.Length - remainder;
+                int adjustedCount = source.Length - remainder;
-                if (alignedCount > 0)
+                if (adjustedCount > 0)
                {
                    BulkConvertNormalizedFloatToByteClampOverflows(
-                        source.Slice(0, alignedCount),
+                        source.Slice(0, adjustedCount),
-                        dest.Slice(0, alignedCount));
+                        dest.Slice(0, adjustedCount));
-                    source = source.Slice(alignedCount);
+                    source = source.Slice(adjustedCount);
-                    dest = dest.Slice(alignedCount);
+                    dest = dest.Slice(adjustedCount);
                }
            }
            /// <summary>
            /// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
            /// </summary>
            [MethodImpl(InliningOptions.ColdPath)]
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
-                DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!");
+                DebugGuard.IsTrue(
                    ImageMaths.Modulo4(dest.Length) == 0,
                    nameof(dest),
                    "dest.Length should be divisible by 4!");
                int count = dest.Length / 4;
                if (count == 0)
@ -94,11 +104,15 @@ namespace SixLabors.ImageSharp
            /// <summary>
            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
            /// </summary>
            [MethodImpl(InliningOptions.ColdPath)]
            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
                ReadOnlySpan<float> source,
                Span<byte> dest)
            {
-                DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!");
+                DebugGuard.IsTrue(
                    ImageMaths.Modulo4(source.Length) == 0,
                    nameof(source),
                    "source.Length should be divisible by 4!");
                int count = source.Length / 4;
                if (count == 0)
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@ -2,6 +2,7 @@
 // Licensed under the Apache License, Version 2.0.
 using System;
 using System.Diagnostics;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
@ -61,25 +62,22 @@ namespace SixLabors.ImageSharp
        /// </summary>
        /// <param name="source">The source span of bytes</param>
        /// <param name="dest">The destination span of floats</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
        {
            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
 #if NETCOREAPP2_1
            ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
 #else
            BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
 #endif
            FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
            // Deal with the remainder:
-            int count = source.Length;
+            if (source.Length > 0)
            if (count > 0)
            {
-                // TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
+                ConverByteToNormalizedFloatRemainder(source, dest);
                ref byte sBase = ref MemoryMarshal.GetReference(source);
                ref float dBase = ref MemoryMarshal.GetReference(dest);
                for (int i = 0; i < count; i++)
                {
                    Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, i) / 255f;
                }
            }
        }
@ -91,35 +89,71 @@ namespace SixLabors.ImageSharp
        /// </summary>
        /// <param name="source">The source span of floats</param>
        /// <param name="dest">The destination span of bytes</param>
        [MethodImpl(InliningOptions.ShortMethod)]
        internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
        {
            DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
 #if NETCOREAPP2_1
            ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
 #else
            BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
 #endif
            FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
            // Deal with the remainder:
-            int count = source.Length;
+            if (source.Length > 0)
            if (count > 0)
            {
-                ref float sBase = ref MemoryMarshal.GetReference(source);
+                ConvertNormalizedFloatToByteRemainder(source, dest);
                ref byte dBase = ref MemoryMarshal.GetReference(dest);
                for (int i = 0; i < count; i++)
                {
                    // TODO: Do we need to optimize anything on this? (There are at most 7 remainders)
                    float f = Unsafe.Add(ref sBase, i);
                    f *= 255f;
                    f += 0.5f;
                    f = MathF.Max(0, f);
                    f = MathF.Min(255f, f);
                    Unsafe.Add(ref dBase, i) = (byte)f;
                }
            }
        }
        /// <summary>
        /// Scalar conversion of a short tail of bytes to floats by dividing each value by 255.
        /// Only source lengths of 1, 2 or 3 are handled; for any other length nothing is written.
        /// </summary>
        /// <param name="source">The remaining source bytes.</param>
        /// <param name="dest">The destination floats; written at the same indices as the source.</param>
        [MethodImpl(InliningOptions.ColdPath)]
        private static void ConverByteToNormalizedFloatRemainder(ReadOnlySpan<byte> source, Span<float> dest)
        {
            ref byte src = ref MemoryMarshal.GetReference(source);
            ref float dst = ref MemoryMarshal.GetReference(dest);

            int remaining = source.Length;

            // Straight-line code instead of a loop: elements are written from the
            // highest index down to index 0.
            if (remaining < 1 || remaining > 3)
            {
                return;
            }

            if (remaining == 3)
            {
                Unsafe.Add(ref dst, 2) = Unsafe.Add(ref src, 2) / 255f;
            }

            if (remaining >= 2)
            {
                Unsafe.Add(ref dst, 1) = Unsafe.Add(ref src, 1) / 255f;
            }

            dst = src / 255f;
        }
        /// <summary>
        /// Scalar conversion of a short tail of floats to bytes, passing each value through
        /// <see cref="ConvertToByte"/>.
        /// Only source lengths of 1, 2 or 3 are handled; for any other length nothing is written.
        /// </summary>
        /// <param name="source">The remaining source floats.</param>
        /// <param name="dest">The destination bytes; written at the same indices as the source.</param>
        [MethodImpl(InliningOptions.ColdPath)]
        private static void ConvertNormalizedFloatToByteRemainder(ReadOnlySpan<float> source, Span<byte> dest)
        {
            ref float src = ref MemoryMarshal.GetReference(source);
            ref byte dst = ref MemoryMarshal.GetReference(dest);

            int remaining = source.Length;

            // Straight-line code instead of a loop: elements are written from the
            // highest index down to index 0.
            if (remaining < 1 || remaining > 3)
            {
                return;
            }

            if (remaining == 3)
            {
                Unsafe.Add(ref dst, 2) = ConvertToByte(Unsafe.Add(ref src, 2));
            }

            if (remaining >= 2)
            {
                Unsafe.Add(ref dst, 1) = ConvertToByte(Unsafe.Add(ref src, 1));
            }

            dst = ConvertToByte(src);
        }
        /// <summary>
        /// Scales <paramref name="f"/> by 255, adds 0.5, clamps the result with
        /// <see cref="ImageMaths.Clamp"/> using the bounds 0 and 255, and casts it to <see cref="byte"/>.
        /// </summary>
        /// <param name="f">The float value to convert.</param>
        /// <returns>The converted <see cref="byte"/>.</returns>
        [MethodImpl(InliningOptions.ShortMethod)]
        private static byte ConvertToByte(float f)
        {
            // The +0.5 offset makes the truncating cast below round to nearest.
            float scaled = (f * 255f) + 0.5f;
            return (byte)ImageMaths.Clamp(scaled, 0, 255f);
        }
        [Conditional("DEBUG")]
        private static void GuardAvx2(string operation)
        {
            if (!IsAvx2CompatibleArchitecture)
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
@ -99,30 +99,30 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
        }
        // RESULTS (2018 October):
-        //                                                             Method | Runtime | Count |         Mean |        Error |     StdDev | Scaled | ScaledSD |  Gen 0 | Allocated |
+        //                       Method | Runtime | Count |         Mean |        Error |      StdDev | Scaled | ScaledSD |  Gen 0 | Allocated |
-        // ------------------------------------------------------------------ |-------- |------ |-------------:|-------------:|-----------:|-------:|---------:|-------:|----------:|
+        // ---------------------------- |-------- |------ |-------------:|-------------:|------------:|-------:|---------:|-------:|----------:|
-        //                                                          BasicBulk |     Clr |    64 |    581.62 ns |    33.625 ns |  1.8999 ns |   2.27 |     0.02 |      - |       0 B |
+        //        FallbackIntrinsics128 |     Clr |    64 |    340.38 ns |    22.319 ns |   1.2611 ns |   1.41 |     0.01 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows |     Clr |    64 |    256.66 ns |    45.153 ns |  2.5512 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |    64 |    240.79 ns |    11.421 ns |   0.6453 ns |   1.00 |     0.00 |      - |       0 B |
-        //   ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows |     Clr |    64 |    201.92 ns |    30.161 ns |  1.7042 ns |   0.79 |     0.01 |      - |       0 B |
+        //            ExtendedIntrinsic |     Clr |    64 |    199.09 ns |   124.239 ns |   7.0198 ns |   0.83 |     0.02 |      - |       0 B |
-        //                                               PixelOperations_Base |     Clr |    64 |    665.01 ns |    13.032 ns |  0.7363 ns |   2.59 |     0.02 | 0.0067 |      24 B |
+        //         PixelOperations_Base |     Clr |    64 |    647.99 ns |    24.003 ns |   1.3562 ns |   2.69 |     0.01 | 0.0067 |      24 B |
-        //                                        PixelOperations_Specialized |     Clr |    64 |    295.14 ns |    26.335 ns |  1.4880 ns |   1.15 |     0.01 |      - |       0 B |
+        //  PixelOperations_Specialized |     Clr |    64 |    259.79 ns |    13.391 ns |   0.7566 ns |   1.08 |     0.00 |      - |       0 B | <--- ceremonial overhead has been minimized!
-        //                                                                    |         |       |              |              |            |        |          |        |           |
+        //                              |         |       |              |              |             |        |          |        |           |
-        //                                                          BasicBulk |    Core |    64 |    513.22 ns |    91.110 ns |  5.1479 ns |   3.19 |     0.03 |      - |       0 B |
+        //        FallbackIntrinsics128 |    Core |    64 |    234.64 ns |    12.320 ns |   0.6961 ns |   1.58 |     0.00 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows |    Core |    64 |    160.76 ns |     2.760 ns |  0.1559 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |    64 |    148.87 ns |     2.794 ns |   0.1579 ns |   1.00 |     0.00 |      - |       0 B |
-        //   ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows |    Core |    64 |     95.98 ns |    10.077 ns |  0.5694 ns |   0.60 |     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |    Core |    64 |     94.06 ns |    10.015 ns |   0.5659 ns |   0.63 |     0.00 |      - |       0 B |
-        //                                               PixelOperations_Base |    Core |    64 |    591.74 ns |    49.856 ns |  2.8170 ns |   3.68 |     0.01 | 0.0067 |      24 B |
+        //         PixelOperations_Base |    Core |    64 |    573.52 ns |    31.865 ns |   1.8004 ns |   3.85 |     0.01 | 0.0067 |      24 B |
-        //                                        PixelOperations_Specialized |    Core |    64 |    149.11 ns |     4.485 ns |  0.2534 ns |   0.93 |     0.00 |      - |       0 B |
+        //  PixelOperations_Specialized |    Core |    64 |    117.21 ns |    13.264 ns |   0.7494 ns |   0.79 |     0.00 |      - |       0 B |
-        //                                                                    |         |       |              |              |            |        |          |        |           |
+        //                              |         |       |              |              |             |        |          |        |           |
-        //                                                          BasicBulk |     Clr |  2048 | 15,345.85 ns | 1,213.551 ns | 68.5679 ns |   3.90 |     0.01 |      - |       0 B |
+        //        FallbackIntrinsics128 |     Clr |  2048 |  6,735.93 ns | 2,139.340 ns | 120.8767 ns |   1.71 |     0.03 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows |     Clr |  2048 |  3,939.49 ns |    71.101 ns |  4.0173 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |  2048 |  3,929.29 ns |   334.027 ns |  18.8731 ns |   1.00 |     0.00 |      - |       0 B |
-        //   ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows |     Clr |  2048 |  2,272.61 ns |   110.671 ns |  6.2531 ns |   0.58 |     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |     Clr |  2048 |  2,226.01 ns |   130.525 ns |   7.3749 ns |!! 0.57 |     0.00 |      - |       0 B | <--- ExtendedIntrinsics rock!
-        //                                               PixelOperations_Base |     Clr |  2048 | 17,422.47 ns |   811.733 ns | 45.8644 ns |   4.42 |     0.01 |      - |      24 B |
+        //         PixelOperations_Base |     Clr |  2048 | 16,760.84 ns |   367.800 ns |  20.7814 ns |   4.27 |     0.02 |      - |      24 B | <--- Extra copies using "Vector4 TPixel.ToVector4()"
-        //                                        PixelOperations_Specialized |     Clr |  2048 |  3,984.26 ns |   110.352 ns |  6.2351 ns |   1.01 |     0.00 |      - |       0 B |
+        //  PixelOperations_Specialized |     Clr |  2048 |  3,986.03 ns |   237.238 ns |  13.4044 ns |   1.01 |     0.00 |      - |       0 B | <--- can't yet detect whether ExtendedIntrinsics are available :(
-        //                                                                    |         |       |              |              |            |        |          |        |           |
+        //                              |         |       |              |              |             |        |          |        |           |
-        //                                                          BasicBulk |    Core |  2048 | 14,950.43 ns |   699.309 ns | 39.5123 ns |   3.76 |     0.02 |      - |       0 B |
+        //        FallbackIntrinsics128 |    Core |  2048 |  6,644.65 ns | 2,677.090 ns | 151.2605 ns |   1.69 |     0.05 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows |    Core |  2048 |  3,978.28 ns |   481.105 ns | 27.1833 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |  2048 |  3,923.70 ns | 1,971.760 ns | 111.4081 ns |   1.00 |     0.00 |      - |       0 B |
-        //   ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows |    Core |  2048 |  2,169.54 ns |    75.606 ns |  4.2719 ns | !!0.55!|     0.00 |      - |       0 B |
+        //            ExtendedIntrinsic |    Core |  2048 |  2,092.32 ns |   375.657 ns |  21.2253 ns |!! 0.53 |     0.01 |      - |       0 B | <--- ExtendedIntrinsics rock!
-        //                                               PixelOperations_Base |    Core |  2048 | 18,403.62 ns | 1,494.056 ns | 84.4169 ns |   4.63 |     0.03 |      - |      24 B |
+        //         PixelOperations_Base |    Core |  2048 | 16,875.73 ns | 1,271.957 ns |  71.8679 ns |   4.30 |     0.10 |      - |      24 B |
-        //                                        PixelOperations_Specialized |    Core |  2048 |  2,227.60 ns |   486.761 ns | 27.5029 ns | !!0.56!|     0.01 |      - |       0 B |
+        //  PixelOperations_Specialized |    Core |  2048 |  2,129.92 ns |   262.888 ns |  14.8537 ns |!! 0.54 |     0.01 |      - |       0 B | <--- ExtendedIntrinsics rock!
    }
 }
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@ -191,30 +191,30 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
        // RESULTS (2018 October):
        //
-        //                                               Method | Runtime | Count |        Mean |        Error |     StdDev | Scaled | ScaledSD |  Gen 0 | Allocated |
+        //                       Method | Runtime | Count |        Mean |        Error |      StdDev | Scaled | ScaledSD |  Gen 0 | Allocated |
-        // ---------------------------------------------------- |-------- |------ |------------:|-------------:|-----------:|-------:|---------:|-------:|----------:|
+        // ---------------------------- |-------- |------ |------------:|-------------:|------------:|-------:|---------:|-------:|----------:|
-        //                                            BasicBulk |     Clr |    64 |   267.40 ns |    30.711 ns |  1.7352 ns |   1.07 |     0.01 |      - |       0 B |
+        //        FallbackIntrinsics128 |     Clr |    64 |   287.62 ns |     6.026 ns |   0.3405 ns |   1.19 |     0.00 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertByteToNormalizedFloat |     Clr |    64 |   249.97 ns |    33.838 ns |  1.9119 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |    64 |   240.83 ns |    10.585 ns |   0.5981 ns |   1.00 |     0.00 |      - |       0 B |
-        //  ExtendedIntrinsics_BulkConvertByteToNormalizedFloat |     Clr |    64 |   176.97 ns |     5.221 ns |  0.2950 ns |   0.71 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |     Clr |    64 |   168.28 ns |    11.478 ns |   0.6485 ns |   0.70 |     0.00 |      - |       0 B |
-        //                                 PixelOperations_Base |     Clr |    64 |   349.70 ns |   104.331 ns |  5.8949 ns |   1.40 |     0.02 | 0.0072 |      24 B |
+        //         PixelOperations_Base |     Clr |    64 |   334.08 ns |    38.048 ns |   2.1498 ns |   1.39 |     0.01 | 0.0072 |      24 B |
-        //                          PixelOperations_Specialized |     Clr |    64 |   288.31 ns |    26.833 ns |  1.5161 ns |   1.15 |     0.01 |      - |       0 B |
+        //  PixelOperations_Specialized |     Clr |    64 |   255.41 ns |    10.939 ns |   0.6181 ns |   1.06 |     0.00 |      - |       0 B | <--- ceremonial overhead has been minimized!
-        //                                                      |         |       |             |              |            |        |          |        |           |
+        //                              |         |       |             |              |             |        |          |        |           |
-        //                                            BasicBulk |    Core |    64 |   185.36 ns |    30.051 ns |  1.6979 ns |   1.26 |     0.01 |      - |       0 B |
+        //        FallbackIntrinsics128 |    Core |    64 |   183.29 ns |     8.931 ns |   0.5046 ns |   1.32 |     0.00 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertByteToNormalizedFloat |    Core |    64 |   146.84 ns |    12.674 ns |  0.7161 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |    64 |   139.18 ns |     7.633 ns |   0.4313 ns |   1.00 |     0.00 |      - |       0 B |
-        //  ExtendedIntrinsics_BulkConvertByteToNormalizedFloat |    Core |    64 |    67.31 ns |     2.542 ns |  0.1436 ns |   0.46 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |    Core |    64 |    66.29 ns |    16.366 ns |   0.9247 ns |   0.48 |     0.01 |      - |       0 B |
-        //                                 PixelOperations_Base |    Core |    64 |   272.03 ns |    94.419 ns |  5.3348 ns |   1.85 |     0.03 | 0.0072 |      24 B |
+        //         PixelOperations_Base |    Core |    64 |   257.75 ns |    16.959 ns |   0.9582 ns |   1.85 |     0.01 | 0.0072 |      24 B |
-        //                          PixelOperations_Specialized |    Core |    64 |   121.91 ns |    31.477 ns |  1.7785 ns |   0.83 |     0.01 |      - |       0 B |
+        //  PixelOperations_Specialized |    Core |    64 |    90.14 ns |     9.955 ns |   0.5625 ns |   0.65 |     0.00 |      - |       0 B |
-        //                                                      |         |       |             |              |            |        |          |        |           |
+        //                              |         |       |             |              |             |        |          |        |           |
-        //                                            BasicBulk |     Clr |  2048 | 5,133.04 ns |   284.052 ns | 16.0494 ns |   1.21 |     0.01 |      - |       0 B |
+        //        FallbackIntrinsics128 |     Clr |  2048 | 5,011.84 ns |   347.991 ns |  19.6621 ns |   1.22 |     0.01 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertByteToNormalizedFloat |     Clr |  2048 | 4,248.58 ns | 1,095.887 ns | 61.9196 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |     Clr |  2048 | 4,119.35 ns |   720.153 ns |  40.6900 ns |   1.00 |     0.00 |      - |       0 B |
-        //  ExtendedIntrinsics_BulkConvertByteToNormalizedFloat |     Clr |  2048 | 1,214.02 ns |   184.349 ns | 10.4160 ns |   0.29 |     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |     Clr |  2048 | 1,195.29 ns |   164.389 ns |   9.2883 ns |!! 0.29 |     0.00 |      - |       0 B | <--- ExtendedIntrinsics rock!
-        //                                 PixelOperations_Base |     Clr |  2048 | 7,096.04 ns |   362.350 ns | 20.4734 ns |   1.67 |     0.02 |      - |      24 B |
+        //         PixelOperations_Base |     Clr |  2048 | 6,820.58 ns |   823.433 ns |  46.5255 ns |   1.66 |     0.02 |      - |      24 B |
-        //                          PixelOperations_Specialized |     Clr |  2048 | 4,314.19 ns |   204.964 ns | 11.5809 ns |   1.02 |     0.01 |      - |       0 B |
+        //  PixelOperations_Specialized |     Clr |  2048 | 4,203.53 ns |   176.714 ns |   9.9847 ns |   1.02 |     0.01 |      - |       0 B | <--- can't yet detect whether ExtendedIntrinsics are available :(
-        //                                                      |         |       |             |              |            |        |          |        |           |
+        //                              |         |       |             |              |             |        |          |        |           |
-        //                                            BasicBulk |    Core |  2048 | 5,038.38 ns |   223.282 ns | 12.6158 ns |   1.20 |     0.01 |      - |       0 B |
+        //        FallbackIntrinsics128 |    Core |  2048 | 5,017.89 ns | 4,021.533 ns | 227.2241 ns |   1.24 |     0.05 |      - |       0 B |
-        //  BasicIntrinsics256_BulkConvertByteToNormalizedFloat |    Core |  2048 | 4,199.17 ns |   897.985 ns | 50.7378 ns |   1.00 |     0.00 |      - |       0 B |
+        //           BasicIntrinsics256 |    Core |  2048 | 4,046.51 ns | 1,150.390 ns |  64.9992 ns |   1.00 |     0.00 |      - |       0 B |
-        //  ExtendedIntrinsics_BulkConvertByteToNormalizedFloat |    Core |  2048 | 1,113.86 ns |    64.799 ns |  3.6613 ns | !!0.27!|     0.00 |      - |       0 B |
+        //           ExtendedIntrinsics |    Core |  2048 | 1,130.59 ns |   832.588 ns |  47.0427 ns |!! 0.28 |     0.01 |      - |       0 B | <--- ExtendedIntrinsics rock!
-        //                                 PixelOperations_Base |    Core |  2048 | 7,015.00 ns |   920.083 ns | 51.9864 ns |   1.67 |     0.02 |      - |      24 B |
+        //         PixelOperations_Base |    Core |  2048 | 6,752.68 ns |   272.820 ns |  15.4148 ns |   1.67 |     0.02 |      - |      24 B |
-        //                          PixelOperations_Specialized |    Core |  2048 | 1,176.59 ns |   256.955 ns | 14.5184 ns | !!0.28!|     0.00 |      - |       0 B |
+        //  PixelOperations_Specialized |    Core |  2048 | 1,126.13 ns |    79.192 ns |   4.4745 ns |!! 0.28 |     0.00 |      - |       0 B | <--- ExtendedIntrinsics rock!
    }
 }
--- a/tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Abs.cs
@ -1,9 +1,9 @@
-namespace SixLabors.ImageSharp.Benchmarks.General
+using System;
 {
    using System;
-    using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes;
 namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
 {
    public class Abs
    {
        [Params(-1, 1)]
--- a/tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Clamp.cs
@ -3,13 +3,13 @@
 // Licensed under the Apache License, Version 2.0.
 // </copyright>
-namespace SixLabors.ImageSharp.Benchmarks.General
+using System;
-{
+using System.Runtime.CompilerServices;
    using System;
    using System.Runtime.CompilerServices;
-    using BenchmarkDotNet.Attributes;
+using BenchmarkDotNet.Attributes;
 namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
 {
    public class Clamp
    {
        [Params(-1, 0, 255, 256)]
--- a/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoConstant.cs
@ -0,0 +1,23 @@
 using BenchmarkDotNet.Attributes;
 using BenchmarkDotNet.Attributes.Jobs;
 namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
 {
    /// <summary>
    /// Benchmark comparing the <c>%</c> operator with a constant divisor of 8
    /// against <see cref="ImageMaths.Modulo8"/>.
    /// </summary>
    [LongRunJob]
    public class ModuloPowerOfTwoConstant
    {
        // Kept in an instance field so the operand is read at run time rather than written as a literal.
        private readonly int value = 42;
        /// <summary>
        /// Baseline: remainder computed with the built-in operator.
        /// </summary>
        [Benchmark(Baseline = true)]
        public int Standard() => this.value % 8;
        /// <summary>
        /// Candidate: remainder computed by <see cref="ImageMaths.Modulo8"/>.
        /// </summary>
        [Benchmark]
        public int Bitwise() => ImageMaths.Modulo8(this.value);
    }
 }
--- a/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/ModuloPowerOfTwoVariable.cs
@ -0,0 +1,32 @@
 using BenchmarkDotNet.Attributes;
 using BenchmarkDotNet.Attributes.Jobs;
 namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
 {
    /// <summary>
    /// Benchmark comparing the <c>%</c> operator with a non-literal power-of-two divisor
    /// against <see cref="ImageMaths.ModuloP2"/>.
    /// </summary>
    [LongRunJob]
    public class ModuloPowerOfTwoVariable
    {
        // Both operands are instance fields so neither appears as a literal in the benchmarked expressions.
        private readonly int value = 42;
        private readonly int m = 32;
        /// <summary>
        /// Baseline: remainder computed with the built-in operator.
        /// </summary>
        [Benchmark(Baseline = true)]
        public int Standard() => this.value % this.m;
        /// <summary>
        /// Candidate: remainder computed by <see cref="ImageMaths.ModuloP2"/>.
        /// </summary>
        [Benchmark]
        public int Bitwise() => ImageMaths.ModuloP2(this.value, this.m);
        // RESULTS:
        //
        //    Method |      Mean |     Error |    StdDev |    Median | Scaled | ScaledSD |
        // --------- |----------:|----------:|----------:|----------:|-------:|---------:|
        //  Standard | 1.2465 ns | 0.0093 ns | 0.0455 ns | 1.2423 ns |   1.00 |     0.00 |
        //   Bitwise | 0.0265 ns | 0.0103 ns | 0.0515 ns | 0.0000 ns |   0.02 |     0.04 |
    }
 }
--- a/tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs
+++ b/tests/ImageSharp.Benchmarks/General/BasicMath/Pow.cs
@ -1,7 +1,8 @@
 using System;
 using BenchmarkDotNet.Attributes;
-namespace SixLabors.ImageSharp.Benchmarks.General
+namespace SixLabors.ImageSharp.Benchmarks.General.BasicMath
 {
    public class Pow
    {
--- a/tests/ImageSharp.Benchmarks/General/Modulus.cs
+++ b/tests/ImageSharp.Benchmarks/General/Modulus.cs
@ -1,19 +0,0 @@
 namespace SixLabors.ImageSharp.Benchmarks.General
 {
    using BenchmarkDotNet.Attributes;
    /// <summary>
    /// Benchmark pairing <c>255 % 256</c> with the equivalent bit mask <c>255 &amp; 255</c>.
    /// </summary>
    public class Modulus
    {
        // NOTE(review): both bodies consist solely of literals, so they are constant expressions;
        // presumably neither operation is actually executed at run time - confirm before relying on the numbers.
        /// <summary>
        /// Baseline: remainder via the <c>%</c> operator.
        /// </summary>
        [Benchmark(Baseline = true, Description = "Standard Modulus using %")]
        public int StandardModulus() => 255 % 256;
        /// <summary>
        /// Candidate: remainder via the <c>&amp;</c> operator.
        /// </summary>
        [Benchmark(Description = "Bitwise Modulus using &")]
        public int BitwiseModulus() => 255 & 255;
    }
 }
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@ -264,13 +264,26 @@ namespace SixLabors.ImageSharp.Tests.Common
            TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(count,
                (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span)
            );
            // for small values, let's stress test the implementation a bit:
            if (count > 0 && count < 10)
            {
                for (int i = 0; i < 20; i++)
                {
                    TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
                        count,
                        (s, d) => SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(s.Span, d.Span),
                        i + 42);
                }
            }
        }
        private static void TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
            int count,
-            Action<Memory<float>, Memory<byte>> convert)
+            Action<Memory<float>, Memory<byte>> convert, int seed = -1)
        {
-            float[] source = new Random(count).GenerateRandomFloatArray(count, -0.1f, 1.2f);
+            seed = seed > 0 ? seed : count;
            float[] source = new Random(seed).GenerateRandomFloatArray(count, -0.2f, 1.2f);
            byte[] expected = source.Select(NormalizedFloatToByte).ToArray();
            byte[] actual = new byte[count];
--- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
+++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
@ -10,56 +10,70 @@ namespace SixLabors.ImageSharp.Tests.Helpers
    public class ImageMathsTests
    {
        [Theory]
-        [InlineData(0, 0)]
+        [InlineData(0)]
-        [InlineData(1, 1)]
+        [InlineData(1)]
-        [InlineData(2, 2)]
+        [InlineData(2)]
-        [InlineData(3, 3)]
+        [InlineData(3)]
-        [InlineData(4, 0)]
+        [InlineData(4)]
-        [InlineData(100, 0)]
+        [InlineData(100)]
-        [InlineData(123, 3)]
+        [InlineData(123)]
-        [InlineData(53436353, 1)]
+        [InlineData(53436353)]
-        public void Modulo4(int a, int expected)
+        public void Modulo4(int x)
        {
-            int actual = ImageMaths.Modulo4(a);
+            int actual = ImageMaths.Modulo4(x);
-            Assert.Equal(expected, actual);
+            Assert.Equal(x % 4, actual);
        }
        [Theory]
-        [InlineData(0, 0)]
+        [InlineData(0)]
-        [InlineData(1, 1)]
+        [InlineData(1)]
        [InlineData(2)]
        [InlineData(6)]
        [InlineData(7)]
        [InlineData(8)]
        [InlineData(100)]
        [InlineData(123)]
        [InlineData(53436353)]
        [InlineData(975)]
        public void Modulo8(int x)
        {
            int actual = ImageMaths.Modulo8(x);
            Assert.Equal(x % 8, actual);
        }
        [Theory]
        [InlineData(0, 2)]
        [InlineData(1, 2)]
        [InlineData(2, 2)]
-        [InlineData(6, 6)]
+        [InlineData(0, 4)]
-        [InlineData(7, 7)]
+        [InlineData(3, 4)]
-        [InlineData(8, 0)]
+        [InlineData(5, 4)]
-        [InlineData(100, 4)]
+        [InlineData(5, 8)]
-        [InlineData(123, 3)]
+        [InlineData(8, 8)]
-        [InlineData(53436353, 1)]
+        [InlineData(8, 16)]
-        [InlineData(975, 7)]
+        [InlineData(15, 16)]
-        public void Modulo8(int a, int expected)
+        [InlineData(17, 16)]
        [InlineData(17, 32)]
        [InlineData(31, 32)]
        [InlineData(32, 32)]
        [InlineData(33, 32)]
        public void Modulo2P(int x, int m)
        {
-            int actual = ImageMaths.Modulo8(a);
+            int actual = ImageMaths.ModuloP2(x, m);
-            Assert.Equal(expected, actual);
+            Assert.Equal(x % m, actual);
        }
        [Theory]
-        [InlineData(0, 2, 0)]
+        [InlineData(0, 0, 0, 0)]
-        [InlineData(1, 2, 1)]
+        [InlineData(0.5f, 0, 1, 0.5f)]
-        [InlineData(2, 2, 0)]
+        [InlineData(-0.5f, -0.1f, 10, -0.1f)]
-        [InlineData(0, 4, 0)]
+        [InlineData(-0.05f, -0.1f, 10, -0.05f)]
-        [InlineData(3, 4, 3)]
+        [InlineData(9.9f, -0.1f, 10, 9.9f)]
-        [InlineData(5, 4, 1)]
+        [InlineData(10f, -0.1f, 10, 10f)]
-        [InlineData(5, 8, 5)]
+        [InlineData(10.1f, -0.1f, 10, 10f)]
-        [InlineData(8, 8, 0)]
+        public void Clamp(float x, float min, float max, float expected)
        [InlineData(8, 16, 8)]
        [InlineData(15, 16, 15)]
        [InlineData(17, 16, 1)]
        [InlineData(17, 32, 17)]
        [InlineData(31, 32, 31)]
        [InlineData(32, 32, 0)]
        [InlineData(33, 32, 1)]
        public void Modulo2P(int a, int m, int expected)
        {
-            int actual = ImageMaths.ModuloP2(a, m);
+            float actual = ImageMaths.Clamp(x, min, max);
            Assert.Equal(expected, actual);
        }