FallbackIntrinsics128 + ImageMaths.Modulo* implementations

8 years ago · 8793880447
9 changed files with 256 additions and 48 deletions
--- a/src/ImageSharp/Common/Helpers/ImageMaths.cs
+++ b/src/ImageSharp/Common/Helpers/ImageMaths.cs
@ -39,6 +39,22 @@ namespace SixLabors.ImageSharp
            return (a / GreatestCommonDivisor(a, b)) * b;
        }
        /// <summary>
        /// Fast (mod 4) calculator: keeps only the two lowest bits of <paramref name="a"/>.
        /// Because this is a bit mask, the result is never negative, even for negative input.
        /// </summary>
        /// <param name="a">The value to reduce.</param>
        /// <returns><paramref name="a"/> masked with 3, i.e. a value in [0..3].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int Modulo4(int a)
        {
            const int LowBitsMask = 3;
            return a & LowBitsMask;
        }
        /// <summary>
        /// Fast (mod 8) calculator: keeps only the three lowest bits of <paramref name="a"/>.
        /// Because this is a bit mask, the result is never negative, even for negative input.
        /// </summary>
        /// <param name="a">The value to reduce.</param>
        /// <returns><paramref name="a"/> masked with 7, i.e. a value in [0..7].</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int Modulo8(int a)
        {
            const int LowBitsMask = 7;
            return a & LowBitsMask;
        }
        /// <summary>
        /// Fast (mod m) calculator for a modulus <paramref name="m"/> that is a power of 2.
        /// The remainder is obtained by masking <paramref name="a"/> with <c>m - 1</c>;
        /// for any other <paramref name="m"/> the result is not a valid remainder.
        /// </summary>
        /// <param name="a">The value to reduce.</param>
        /// <param name="m">The modulus; should be a power of 2.</param>
        /// <returns><paramref name="a"/> masked with <c>m - 1</c>.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        public static int ModuloP2(int a, int m) => a & (m - 1);
        /// <summary>
        /// Returns the absolute value of a 32-bit signed integer. Uses bit shifting to speed up the operation.
        /// </summary>
--- a/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.BasicIntrinsics256.cs
@ -14,7 +14,7 @@ namespace SixLabors.ImageSharp
    internal static partial class SimdUtils
    {
        /// <summary>
-        /// 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*)
+        /// Implementation with 256bit / AVX2 intrinsics NOT depending on newer API-s (Vector.Widen etc.)
        /// </summary>
        public static class BasicIntrinsics256
        {
--- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs
@ -10,8 +10,9 @@ namespace SixLabors.ImageSharp
    internal static partial class SimdUtils
    {
        /// <summary>
-        /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
+        /// Implementation methods based on newer <see cref="Vector{T}"/> API-s (Vector.Widen, Vector.Narrow, Vector.ConvertTo*).
-        /// PR:
+        /// Only accelerated on RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
        /// See:
        /// https://github.com/dotnet/coreclr/pull/10662
        /// API Proposal:
        /// https://github.com/dotnet/corefx/issues/15957
--- a/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.FallbackIntrinsics128.cs
@ -0,0 +1,143 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 namespace SixLabors.ImageSharp
 {
    internal static partial class SimdUtils
    {
        /// <summary>
        /// Fallback implementation based on <see cref="Vector4"/> (128bit).
        /// For <see cref="Vector4"/>, efficient software fallback implementations are present,
        /// and Mono may even be able to emit intrinsics for that type.
        /// </summary>
        public static class FallbackIntrinsics128
        {
            /// <summary>
            /// Converts as many elements as possible with <see cref="BulkConvertByteToNormalizedFloat"/>
            /// (the longest prefix whose length is a multiple of 4), then slices both spans down
            /// so that only the unconverted remainder is left in them.
            /// </summary>
            /// <param name="source">The input bytes; replaced by the unconverted remainder.</param>
            /// <param name="dest">The output floats; replaced by the unconverted remainder.</param>
            internal static void BulkConvertByteToNormalizedFloatReduce(
                ref ReadOnlySpan<byte> source,
                ref Span<float> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                // Round the length down to the nearest multiple of 4.
                int convertible = source.Length - (source.Length % 4);
                if (convertible <= 0)
                {
                    // Fewer than 4 elements: nothing to convert, both spans stay untouched.
                    return;
                }
                ReadOnlySpan<byte> head = source.Slice(0, convertible);
                Span<float> target = dest.Slice(0, convertible);
                BulkConvertByteToNormalizedFloat(head, target);
                // Leave only the tail that was not converted for the caller.
                source = source.Slice(convertible);
                dest = dest.Slice(convertible);
            }
            /// <summary>
            /// Converts as many elements as possible with <see cref="BulkConvertNormalizedFloatToByteClampOverflows"/>
            /// (the longest prefix whose length is a multiple of 4), then slices both spans down
            /// so that only the unconverted remainder is left in them.
            /// </summary>
            /// <param name="source">The input floats; replaced by the unconverted remainder.</param>
            /// <param name="dest">The output bytes; replaced by the unconverted remainder.</param>
            internal static void BulkConvertNormalizedFloatToByteClampOverflowsReduce(
                ref ReadOnlySpan<float> source,
                ref Span<byte> dest)
            {
                DebugGuard.IsTrue(source.Length == dest.Length, nameof(source), "Input spans must be of same size!");
                // Round the length down to the nearest multiple of 4.
                int convertible = source.Length - (source.Length % 4);
                if (convertible <= 0)
                {
                    // Fewer than 4 elements: nothing to convert, both spans stay untouched.
                    return;
                }
                ReadOnlySpan<float> head = source.Slice(0, convertible);
                Span<byte> target = dest.Slice(0, convertible);
                BulkConvertNormalizedFloatToByteClampOverflows(head, target);
                // Leave only the tail that was not converted for the caller.
                source = source.Slice(convertible);
                dest = dest.Slice(convertible);
            }
            /// <summary>
            /// Implementation of <see cref="SimdUtils.BulkConvertByteToNormalizedFloat"/> using <see cref="Vector4"/>.
            /// Converts <c>dest.Length</c> bytes from <paramref name="source"/> into floats scaled by 1/255.
            /// </summary>
            /// <param name="source">The input bytes; must contain at least <c>dest.Length</c> elements.</param>
            /// <param name="dest">The output floats; the length must be divisible by 4.</param>
            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
            {
                DebugGuard.IsTrue((dest.Length % 4) == 0, nameof(dest), "dest.Length should be divisible by 4!");
                // The loop below reads 'source' through unchecked references (Unsafe.Add) with a count
                // derived from 'dest' only, so a shorter source would be read past its end.
                DebugGuard.IsTrue(source.Length >= dest.Length, nameof(source), "source should be at least as long as dest!");
                int count = dest.Length / 4;
                if (count == 0)
                {
                    return;
                }
                // Reinterpret both buffers as groups of 4 elements: 4 bytes in, one Vector4 out.
                ref ByteVector4 sBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(source));
                ref Vector4 dBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(dest));
                const float Scale = 1f / 255f;
                Vector4 d = default;
                for (int i = 0; i < count; i++)
                {
                    ref ByteVector4 s = ref Unsafe.Add(ref sBase, i);
                    // Widen each byte to float component-wise, then scale all four at once.
                    d.X = s.X;
                    d.Y = s.Y;
                    d.Z = s.Z;
                    d.W = s.W;
                    d *= Scale;
                    Unsafe.Add(ref dBase, i) = d;
                }
            }
            /// <summary>
            /// Implementation of <see cref="SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows"/> using <see cref="Vector4"/>.
            /// Each value is multiplied by 255, offset by 0.5, clamped into [0..255] and truncated to a byte.
            /// </summary>
            /// <param name="source">The input floats; the length must be divisible by 4.</param>
            /// <param name="dest">The output bytes; must have room for at least <c>source.Length</c> elements.</param>
            internal static void BulkConvertNormalizedFloatToByteClampOverflows(
                ReadOnlySpan<float> source,
                Span<byte> dest)
            {
                DebugGuard.IsTrue((source.Length % 4) == 0, nameof(source), "source.Length should be divisible by 4!");
                // The loop below writes 'dest' through unchecked references (Unsafe.Add) with a count
                // derived from 'source' only, so a shorter dest would be written past its end.
                DebugGuard.IsTrue(dest.Length >= source.Length, nameof(dest), "dest should be at least as long as source!");
                int count = source.Length / 4;
                if (count == 0)
                {
                    return;
                }
                // Reinterpret both buffers as groups of 4 elements: one Vector4 in, 4 bytes out.
                ref Vector4 sBase = ref Unsafe.As<float, Vector4>(ref MemoryMarshal.GetReference(source));
                ref ByteVector4 dBase = ref Unsafe.As<byte, ByteVector4>(ref MemoryMarshal.GetReference(dest));
                var half = new Vector4(0.5f);
                var maxBytes = new Vector4(255f);
                for (int i = 0; i < count; i++)
                {
                    Vector4 s = Unsafe.Add(ref sBase, i);
                    s *= maxBytes;
                    // Adding 0.5 before the truncating cast rounds to the nearest integer.
                    s += half;
                    // Max/Min are used instead of Vector4.Clamp() because it is unclear
                    // whether Clamp() is properly implemented with intrinsics.
                    s = Vector4.Max(Vector4.Zero, s);
                    s = Vector4.Min(maxBytes, s);
                    ref ByteVector4 d = ref Unsafe.Add(ref dBase, i);
                    d.X = (byte)s.X;
                    d.Y = (byte)s.Y;
                    d.Z = (byte)s.Z;
                    d.W = (byte)s.W;
                }
            }
            /// <summary>
            /// Four consecutive bytes. Byte span memory is reinterpreted as this type so that
            /// each group of 4 bytes can be converted to or from one <see cref="Vector4"/>.
            /// </summary>
            [StructLayout(LayoutKind.Sequential)]
            private struct ByteVector4
            {
                // Declared in X, Y, Z, W order to mirror the components of Vector4;
                // the sequential layout keeps the fields in this declaration order.
                public byte X;
                public byte Y;
                public byte Z;
                public byte W;
            }
        }
    }
 }
--- a/src/ImageSharp/Common/Helpers/SimdUtils.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs
@ -55,7 +55,7 @@ namespace SixLabors.ImageSharp
        }
        /// <summary>
-        /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
+        /// Converts all input <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
        /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
        /// but there are no restrictions on the span's length.
        /// </summary>
@ -67,6 +67,7 @@ namespace SixLabors.ImageSharp
            ExtendedIntrinsics.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
            BasicIntrinsics256.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
            FallbackIntrinsics128.BulkConvertByteToNormalizedFloatReduce(ref source, ref dest);
            // Deal with the remainder:
            int count = source.Length;
@ -83,7 +84,7 @@ namespace SixLabors.ImageSharp
        }
        /// <summary>
-        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
+        /// Convert all <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/>.
        /// The values are scaled up into [0-255] and rounded, overflows are clamped.
        /// <paramref name="source"/> should be of the same size as <paramref name="dest"/>,
        /// but there are no restrictions on the span's length.
@ -96,6 +97,7 @@ namespace SixLabors.ImageSharp
            ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
            BasicIntrinsics256.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
            FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflowsReduce(ref source, ref dest);
            // Deal with the remainder:
            int count = source.Length;
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
@ -72,30 +72,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
    public class PackFromVector4_Rgba32 : PackFromVector4<Rgba32>
    {
        [Benchmark]
-        public void BasicBulk()
+        public void FallbackIntrinsics128()
        {
-            ref Vector4 sBase = ref this.source.GetSpan()[0];
+            Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
-            ref Rgba32 dBase = ref this.destination.GetSpan()[0];
+            Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
            Vector4 maxBytes = new Vector4(255);
            Vector4 half = new Vector4(0.5f);
-            for (int i = 0; i < this.Count; i++)
+            SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats);
            {
                Vector4 v = Unsafe.Add(ref sBase, i);
                v *= maxBytes;
                v += half;
                v = Vector4.Clamp(v, Vector4.Zero, maxBytes);
                ref Rgba32 d = ref Unsafe.Add(ref dBase, i);
                d.R = (byte)v.X;
                d.G = (byte)v.Y;
                d.B = (byte)v.Z;
                d.A = (byte)v.W;
            }
        }
        [Benchmark(Baseline = true)]
-        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows()
+        public void BasicIntrinsics256()
        {
            Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
            Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
@ -104,7 +90,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
        }
        [Benchmark]
-        public void ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows()
+        public void ExtendedIntrinsic()
        {
            Span<float> sBytes = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
            Span<byte> dFloats = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs
@ -79,29 +79,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
    public class ToVector4_Rgba32 : ToVector4<Rgba32>
    {
        [Benchmark]
-        public void BasicBulk()
+        public void FallbackIntrinsics128()
        {
-            ref Rgba32 sBase = ref this.source.GetSpan()[0];
+            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
-            ref Vector4 dBase = ref this.destination.GetSpan()[0];
+            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
            Vector4 scale = new Vector4(1f / 255f);
            Vector4 v = default;
-            for (int i = 0; i < this.Count; i++)
+            SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(sBytes, dFloats);
            {
                ref Rgba32 s = ref Unsafe.Add(ref sBase, i);
                v.X = s.R;
                v.Y = s.G;
                v.Z = s.B;
                v.W = s.A;
                v *= scale;
                Unsafe.Add(ref dBase, i) = v;
            }
        }
        [Benchmark(Baseline = true)]
-        public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat()
+        public void BasicIntrinsics256()
        {
            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
@ -110,7 +97,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
        }
        [Benchmark]
-        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat()
+        public void ExtendedIntrinsics()
        {
            Span<byte> sBytes = MemoryMarshal.Cast<Rgba32, byte>(this.source.GetSpan());
            Span<float> dFloats = MemoryMarshal.Cast<Vector4, float>(this.destination.GetSpan());
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@ -118,7 +118,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        [InlineData(1, 8)]
        [InlineData(2, 16)]
        [InlineData(3, 128)]
-        public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
+        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithRoundedData(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -142,7 +142,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        [InlineData(1, 8)]
        [InlineData(2, 16)]
        [InlineData(3, 128)]
-        public void BasicIntrinsics_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
+        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByte_WithNonRoundedData(int seed, int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -161,6 +161,7 @@ namespace SixLabors.ImageSharp.Tests.Common
        }
        public static readonly TheoryData<int> ArraySizesDivisibleBy8 = new TheoryData<int> { 0, 8, 16, 1024 };
        public static readonly TheoryData<int> ArraySizesDivisibleBy4 = new TheoryData<int> { 0, 4, 8, 28, 1020 };
        public static readonly TheoryData<int> ArraySizesDivisibleBy32 = new TheoryData<int> { 0, 32, 512 };
@ -170,9 +171,18 @@ namespace SixLabors.ImageSharp.Tests.Common
                    0, 1, 2, 3, 4, 7, 8, 9, 15, 16, 17, 63, 64, 255, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 520,
                };
        /// <summary>
        /// Runs the shared byte-to-normalized-float test against the <see cref="Vector4"/> based fallback
        /// for every size listed in <see cref="ArraySizesDivisibleBy4"/>.
        /// </summary>
        [Theory]
        [MemberData(nameof(ArraySizesDivisibleBy4))]
        public void FallbackIntrinsics128_BulkConvertByteToNormalizedFloat(int count)
            => TestImpl_BulkConvertByteToNormalizedFloat(
                count,
                (src, dst) => SimdUtils.FallbackIntrinsics128.BulkConvertByteToNormalizedFloat(src.Span, dst.Span));
        [Theory]
        [MemberData(nameof(ArraySizesDivisibleBy8))]
-        public void BasicIntrinsics_BulkConvertByteToNormalizedFloat(int count)
+        public void BasicIntrinsics256_BulkConvertByteToNormalizedFloat(int count)
        {
            if (this.SkipOnNonAvx2())
            {
@ -215,9 +225,18 @@ namespace SixLabors.ImageSharp.Tests.Common
            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
        }
        /// <summary>
        /// Runs the shared normalized-float-to-byte test against the <see cref="Vector4"/> based fallback
        /// for every size listed in <see cref="ArraySizesDivisibleBy4"/>.
        /// </summary>
        [Theory]
        [MemberData(nameof(ArraySizesDivisibleBy4))]
        public void FallbackIntrinsics128_BulkConvertNormalizedFloatToByteClampOverflows(int count)
            => TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
                count,
                (src, dst) => SimdUtils.FallbackIntrinsics128.BulkConvertNormalizedFloatToByteClampOverflows(src.Span, dst.Span));
        [Theory]
        [MemberData(nameof(ArraySizesDivisibleBy8))]
-        public void BasicIntrinsics_BulkConvertNormalizedFloatToByteClampOverflows(int count)
+        public void BasicIntrinsics256_BulkConvertNormalizedFloatToByteClampOverflows(int count)
        {
            if (this.SkipOnNonAvx2())
            {
--- a/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
+++ b/tests/ImageSharp.Tests/Helpers/ImageMathsTests.cs
@ -9,6 +9,60 @@ namespace SixLabors.ImageSharp.Tests.Helpers
    public class ImageMathsTests
    {
        /// <summary>
        /// Checks <see cref="ImageMaths.Modulo4"/> against known remainders of non-negative inputs.
        /// </summary>
        [Theory]
        [InlineData(0, 0)]
        [InlineData(1, 1)]
        [InlineData(2, 2)]
        [InlineData(3, 3)]
        [InlineData(4, 0)]
        [InlineData(100, 0)]
        [InlineData(123, 3)]
        [InlineData(53436353, 1)]
        public void Modulo4(int a, int expected)
            => Assert.Equal(expected, ImageMaths.Modulo4(a));
        /// <summary>
        /// Checks <see cref="ImageMaths.Modulo8"/> against known remainders of non-negative inputs.
        /// </summary>
        [Theory]
        [InlineData(0, 0)]
        [InlineData(1, 1)]
        [InlineData(2, 2)]
        [InlineData(6, 6)]
        [InlineData(7, 7)]
        [InlineData(8, 0)]
        [InlineData(100, 4)]
        [InlineData(123, 3)]
        [InlineData(53436353, 1)]
        [InlineData(975, 7)]
        public void Modulo8(int a, int expected)
            => Assert.Equal(expected, ImageMaths.Modulo8(a));
        /// <summary>
        /// Checks <see cref="ImageMaths.ModuloP2"/> against known remainders for power-of-2 moduli.
        /// </summary>
        [Theory]
        [InlineData(0, 2, 0)]
        [InlineData(1, 2, 1)]
        [InlineData(2, 2, 0)]
        [InlineData(0, 4, 0)]
        [InlineData(3, 4, 3)]
        [InlineData(5, 4, 1)]
        [InlineData(5, 8, 5)]
        [InlineData(8, 8, 0)]
        [InlineData(8, 16, 8)]
        [InlineData(15, 16, 15)]
        [InlineData(17, 16, 1)]
        [InlineData(17, 32, 17)]
        [InlineData(31, 32, 31)]
        [InlineData(32, 32, 0)]
        [InlineData(33, 32, 1)]
        public void Modulo2P(int a, int m, int expected)
            => Assert.Equal(expected, ImageMaths.ModuloP2(a, m));
        [Fact]
        public void FasAbsResultMatchesMath()
        {