uniformize conversion code

7 years ago · f72fcbdc0f
6 changed files with 103 additions and 176 deletions
--- a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs
+++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs
@ -0,0 +1,83 @
+using System;
+using System.Numerics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+)
+        /// PR:
+        /// https://github.com/dotnet/coreclr/pull/10662
+        /// API Proposal:
+        /// https://github.com/dotnet/corefx/issues/15957
+        /// </summary>
+        public static class ExtendedIntrinsics
+        {
+            /// <summary>
+            /// Gets a value that is true only when this assembly was compiled
+            /// with the NETCOREAPP2_1 symbol defined, and false otherwise.
+            /// </summary>
+            public static bool IsAvailable { get; } =
+#if NETCOREAPP2_1
+// TODO: Add a build target for .NET 4.7.2
+                true;
+#else
+                false;
+#endif
+
+            // ReSharper disable once MemberHidesStaticFromOuterClass
+            /// <summary>
+            /// Converts every <see cref="byte"/> of 'source' into a <see cref="float"/> scaled by 1/255
+            /// and writes the results to 'dest' in the same order.
+            /// </summary>
+            /// <param name="source">The bytes to convert. The length must be divisible by the byte vector size.</param>
+            /// <param name="dest">The buffer receiving the floats. Must hold at least 'source.Length' values.</param>
+            internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
+            {
+                Guard.IsTrue(
+                    source.Length % Vector<byte>.Count == 0,
+                    nameof(source),
+                    "source.Length should be divisible by Vector<byte>.Count!");
+
+                // Stores below go through raw references without bounds checks,
+                // so a too short 'dest' has to be rejected up front.
+                Guard.IsTrue(
+                    dest.Length >= source.Length,
+                    nameof(dest),
+                    "dest.Length should be at least source.Length!");
+
+                int n = source.Length / Vector<byte>.Count;
+
+                ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
+                ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));
+
+                var scale = new Vector<float>(1f / 255f);
+
+                for (int i = 0; i < n; i++)
+                {
+                    Vector<byte> b = Unsafe.Add(ref sourceBase, i);
+
+                    // byte -> ushort -> uint: one byte vector expands into four uint vectors.
+                    Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
+                    Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
+                    Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
+
+                    Vector<float> f0 = Vector.ConvertToSingle(w0) * scale;
+                    Vector<float> f1 = Vector.ConvertToSingle(w1) * scale;
+                    Vector<float> f2 = Vector.ConvertToSingle(w2) * scale;
+                    Vector<float> f3 = Vector.ConvertToSingle(w3) * scale;
+
+                    // Each source vector fills four consecutive float vectors in 'dest'.
+                    ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
+                    d = f0;
+                    Unsafe.Add(ref d, 1) = f1;
+                    Unsafe.Add(ref d, 2) = f2;
+                    Unsafe.Add(ref d, 3) = f3;
+                }
+            }
+        }
+    }
+}
--- a/src/ImageSharp/Common/Extensions/SimdUtils.cs
+++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs
@ -14,12 +14,12 @@ namespace SixLabors.ImageSharp
    /// <summary>
    /// Various extension and utility methods for <see cref="Vector4"/> and <see cref="Vector{T}"/> utilizing SIMD capabilities
    /// </summary>
-    internal static class SimdUtils
+    internal static partial class SimdUtils
    {
        /// <summary>
        /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 bit.
        /// </summary>
-        public static bool IsAvx2CompatibleArchitecture => Vector<float>.Count == 8 && Vector<int>.Count == 8;
+        public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector<float>.Count == 8 && Vector<int>.Count == 8;

        internal static void GuardAvx2(string operation)
        {
@ -61,7 +61,8 @@ namespace SixLabors.ImageSharp

        /// <summary>
        /// Convert 'source.Length' <see cref="float"/> values normalized into [0..1] from 'source' into 'dest' buffer of <see cref="byte"/> values.
-        /// The values gonna be scaled up into [0-255] and rounded.
+        /// The values are scaled up into [0-255] and rounded.
+        /// The implementation is SIMD optimized and works only with `source.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
        /// Based on:
        /// <see>
        ///     <cref>http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions</cref>
@ -106,46 +107,13 @@ namespace SixLabors.ImageSharp
        }

        /// <summary>
-        /// Fast <see cref="byte"/> -> <see cref="float"/> conversion for RyuJIT runtimes having dotnet/coreclr#10662 merged.
+        /// Converts `dest.Length` <see cref="byte"/>-s to <see cref="float"/>-s normalized into [0..1].
+        /// The implementation is SIMD optimized and works only with `dest.Length` divisible by <see cref="Vector{UInt32}.Count"/>.
+        /// Implementation adapted from:
        /// <see>
-        ///     <cref>https://github.com/dotnet/coreclr/pull/10662</cref>
+        ///     <cref>http://stackoverflow.com/a/5362789</cref>
        /// </see>
        /// </summary>
-        internal static void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(ReadOnlySpan<byte> source, Span<float> dest)
-        {
-            Guard.IsTrue(
-                source.Length % Vector<byte>.Count == 0,
-                nameof(source),
-                "dest.Length should be divisable by Vector<byte>.Count!");
-
-            int n = source.Length / Vector<byte>.Count;
-
-            ref Vector<byte> sourceBase = ref Unsafe.As<byte, Vector<byte>>(ref MemoryMarshal.GetReference(source));
-            ref Vector<float> destBase = ref Unsafe.As<float, Vector<float>>(ref MemoryMarshal.GetReference(dest));
-
-            var scale = new Vector<float>(1f / 255f);
-
-            for (int i = 0; i < n; i++)
-            {
-                Vector<byte> b = Unsafe.Add(ref sourceBase, i);
-
-                Vector.Widen(b, out Vector<ushort> s0, out Vector<ushort> s1);
-                Vector.Widen(s0, out Vector<uint> w0, out Vector<uint> w1);
-                Vector.Widen(s1, out Vector<uint> w2, out Vector<uint> w3);
-
-                Vector<float> f0 = Vector.ConvertToSingle(w0) * scale;
-                Vector<float> f1 = Vector.ConvertToSingle(w1) * scale;
-                Vector<float> f2 = Vector.ConvertToSingle(w2) * scale;
-                Vector<float> f3 = Vector.ConvertToSingle(w3) * scale;
-
-                ref Vector<float> d = ref Unsafe.Add(ref destBase, i * 4);
-                d = f0;
-                Unsafe.Add(ref d, 1) = f1;
-                Unsafe.Add(ref d, 2) = f2;
-                Unsafe.Add(ref d, 3) = f3;
-            }
-        }
-
        internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan<byte> source, Span<float> dest)
        {
            GuardAvx2(nameof(BulkConvertByteToNormalizedFloat));
@ -188,7 +156,7 @@ namespace SixLabors.ImageSharp
        /// </summary>
        internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan<float> source, Span<byte> dest)
        {
-            GuardAvx2(nameof(BulkConvertNormalizedFloatToByte));
+            GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows));

            DebugGuard.IsTrue((source.Length % Vector<float>.Count) == 0, nameof(source), "source.Length should be divisable by Vector<float>.Count!");

--- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
+++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs
@ -3,7 +3,6 @@

 using System;
 using System.Numerics;
-using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using SixLabors.Memory;

@ -19,99 +18,37 @@ namespace SixLabors.ImageSharp.PixelFormats
        /// </summary>
        internal partial class PixelOperations : PixelOperations<Rgba32>
        {
-            /// <summary>
-            /// SIMD optimized bulk implementation of <see cref="IPixel.PackFromVector4(Vector4)"/>
-            /// that works only with `count` divisible by <see cref="Vector{UInt32}.Count"/>.
-            /// </summary>
-            /// <param name="sourceColors">The <see cref="Span{T}"/> to the source colors.</param>
-            /// <param name="destVectors">The <see cref="Span{T}"/> to the dstination vectors.</param>
-            /// <param name="count">The number of pixels to convert.</param>
-            /// <remarks>
-            /// Implementation adapted from:
-            /// <see>
-            ///     <cref>http://stackoverflow.com/a/5362789</cref>
-            /// </see>
-            /// TODO: We can replace this implementation in the future using new Vector API-s:
-            /// <see>
-            ///     <cref>https://github.com/dotnet/corefx/issues/15957</cref>
-            /// </see>
-            /// </remarks>
-            internal static void ToVector4SimdAligned(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destVectors, int count)
-            {
-                if (!Vector.IsHardwareAccelerated)
-                {
-                    throw new InvalidOperationException(
-                        "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!");
-                }
-
-                DebugGuard.IsTrue(
-                    count % Vector<uint>.Count == 0,
-                    nameof(count),
-                    "Argument 'count' should divisible by Vector<uint>.Count!");
-
-                var bVec = new Vector<float>(256.0f / 255.0f);
-                var magicFloat = new Vector<float>(32768.0f);
-                var magicInt = new Vector<uint>(1191182336); // reinterpreded value of 32768.0f
-                var mask = new Vector<uint>(255);
-
-                int unpackedRawCount = count * 4;
-
-                ref uint sourceBase = ref Unsafe.As<Rgba32, uint>(ref MemoryMarshal.GetReference(sourceColors));
-                ref WideRgba destBaseAsWide = ref Unsafe.As<Vector4, WideRgba>(ref MemoryMarshal.GetReference(destVectors));
-                ref Vector<uint> destBaseAsUInt = ref Unsafe.As<WideRgba, Vector<uint>>(ref destBaseAsWide);
-                ref Vector<float> destBaseAsFloat = ref Unsafe.As<WideRgba, Vector<float>>(ref destBaseAsWide);
-
-                for (int i = 0; i < count; i++)
-                {
-                    uint sVal = Unsafe.Add(ref sourceBase, i);
-                    ref WideRgba dst = ref Unsafe.Add(ref destBaseAsWide, i);
-
-                    // This call is the bottleneck now:
-                    dst.Load(sVal);
-                }
-
-                int numOfVectors = unpackedRawCount / Vector<uint>.Count;
-
-                for (int i = 0; i < numOfVectors; i++)
-                {
-                    Vector<uint> vi = Unsafe.Add(ref destBaseAsUInt, i);
-
-                    vi &= mask;
-                    vi |= magicInt;
-
-                    var vf = Vector.AsVectorSingle(vi);
-                    vf = (vf - magicFloat) * bVec;
-
-                    Unsafe.Add(ref destBaseAsFloat, i) = vf;
-                }
-            }
-
            /// <inheritdoc />
            internal override void ToVector4(ReadOnlySpan<Rgba32> sourceColors, Span<Vector4> destinationVectors, int count)
            {
                Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors));
                Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors));

-                if (count < 256 || !Vector.IsHardwareAccelerated)
+                if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
                {
                    // Not worth bothering with SIMD:
                    base.ToVector4(sourceColors, destinationVectors, count);
                    return;
                }

-                int remainder = count % Vector<uint>.Count;
+                int remainder = count % 2;
                int alignedCount = count - remainder;

                if (alignedCount > 0)
                {
-                    ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount);
+                    ReadOnlySpan<byte> rawSrc = MemoryMarshal.Cast<Rgba32, byte>(sourceColors);
+                    Span<float> rawDest = MemoryMarshal.Cast<Vector4, float>(destinationVectors.Slice(0, alignedCount));
+
+                    SimdUtils.BulkConvertByteToNormalizedFloat(
+                        rawSrc,
+                        rawDest);
                }

                if (remainder > 0)
                {
-                    sourceColors = sourceColors.Slice(alignedCount);
-                    destinationVectors = destinationVectors.Slice(alignedCount);
-                    base.ToVector4(sourceColors, destinationVectors, remainder);
+                    // actually: remainder == 1
+                    int lastIdx = count - 1;
+                    destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4();
                }
            }

@ -120,7 +57,7 @@ namespace SixLabors.ImageSharp.PixelFormats
            {
                GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count);

-                if (!SimdUtils.IsAvx2CompatibleArchitecture)
+                if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture)
                {
                    base.PackFromVector4(sourceVectors, destinationColors, count);
                    return;
@ -131,10 +68,10 @@ namespace SixLabors.ImageSharp.PixelFormats

                if (alignedCount > 0)
                {
-                    ReadOnlySpan<float> flatSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
-                    Span<byte> flatDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);
+                    ReadOnlySpan<float> rawSrc = MemoryMarshal.Cast<Vector4, float>(sourceVectors.Slice(0, alignedCount));
+                    Span<byte> rawDest = MemoryMarshal.Cast<Rgba32, byte>(destinationColors);

-                    SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest);
+                    SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest);
                }

                if (remainder > 0)
@ -172,30 +109,6 @@ namespace SixLabors.ImageSharp.PixelFormats

                sourcePixels.Slice(0, count).CopyTo(dest);
            }
-
-            /// <summary>
-            /// Value type to store <see cref="Rgba32"/>-s widened into multiple <see cref="uint"/>-s.
-            /// </summary>
-            [StructLayout(LayoutKind.Sequential)]
-            private struct WideRgba
-            {
-                private uint r;
-
-                private uint g;
-
-                private uint b;
-
-                private uint a;
-
-                [MethodImpl(MethodImplOptions.AggressiveInlining)]
-                public void Load(uint p)
-                {
-                    this.r = p;
-                    this.g = p >> GreenShift;
-                    this.b = p >> BlueShift;
-                    this.a = p >> AlphaShift;
-                }
-            }
        }
    }
 }
--- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs
@ -23,7 +23,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk

        private IMemoryOwner<TPixel> destination;

-        [Params(16, 128, 512)]
+        [Params(
+            //64,
+            2048)]
        public int Count { get; set; }

        [GlobalSetup]
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@ -205,12 +205,12 @@ namespace SixLabors.ImageSharp.Tests.Common

            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
        }
-
+        
        [Theory]
        [InlineData(1, 0)]
        [InlineData(2, 32)]
        [InlineData(3, 128)]
-        public void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(int seed, int count)
+        public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count)
        {
            if (!Vector.IsHardwareAccelerated)
            {
@ -221,7 +221,7 @@ namespace SixLabors.ImageSharp.Tests.Common
            float[] result = new float[count];
            float[] expected = source.Select(b => (float)b / 255f).ToArray();

-            SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(source, result);
+            SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result);

            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));
        }
--- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
+++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs
@ -17,43 +17,26 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
    {
        public class Rgba32 : PixelOperationsTests<ImageSharp.PixelFormats.Rgba32>
        {
+            public const string SkipProfilingBenchmarks =
+#if true
+                "Profiling benchmark - enable manually!";
+#else
+                null;
+#endif
+
            public Rgba32(ITestOutputHelper output)
                : base(output)
            {
            }

-            // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
-            public static new TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
-
            [Fact]
            public void IsSpecialImplementation()
            {
                Assert.IsType<ImageSharp.PixelFormats.Rgba32.PixelOperations>(PixelOperations<ImageSharp.PixelFormats.Rgba32>.Instance);
            }

-            [Fact]
-            public void ToVector4SimdAligned()
-            {
-                if (!Vector.IsHardwareAccelerated)
-                {
-                    return;
-                }
-
-                ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64);
-                Vector4[] expected = CreateExpectedVector4Data(source);
-
-                TestOperation(
-                    source,
-                    expected,
-                    (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64)
-                );
-            }
-
-
-            // [Fact] // Profiling benchmark - enable manually!
-#pragma warning disable xUnit1013 // Public method should be marked as test
+            [Fact(Skip = SkipProfilingBenchmarks)]
            public void Benchmark_ToVector4()
-#pragma warning restore xUnit1013 // Public method should be marked as test
            {
                int times = 200000;
                int count = 1024;
@ -73,13 +56,10 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats

        public class Argb32 : PixelOperationsTests<ImageSharp.PixelFormats.Argb32>
        {
-            // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class:
            public Argb32(ITestOutputHelper output)
                : base(output)
            {
            }
-
-            public static new TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
        }

        [Theory]
@ -110,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats
        {
        }

-        public static TheoryData<int> ArraySizesData => new TheoryData<int> { 7, 16, 1111 };
+        public static TheoryData<int> ArraySizesData => new TheoryData<int> { 0, 1, 2, 7, 16, 1111 };

        private static PixelOperations<TPixel> Operations => PixelOperations<TPixel>.Instance;