diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs new file mode 100644 index 000000000..ec52b90ef --- /dev/null +++ b/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs @@ -0,0 +1,64 @@ +using System; +using System.Numerics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// + /// Methods accelerated only in RyuJIT having dotnet/coreclr#10662 merged (.NET Core 2.1+ .NET 4.7.2+) + /// PR: + /// https://github.com/dotnet/coreclr/pull/10662 + /// API Proposal: + /// https://github.com/dotnet/corefx/issues/15957 + /// + public static class ExtendedIntrinsics + { + public static bool IsAvailable { get; } = +#if NETCOREAPP2_1 +// TODO: Add a build target for .NET 4.7.2 + true; +#else + false; +#endif + + // ReSharper disable once MemberHidesStaticFromOuterClass + internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) + { + Guard.IsTrue( + source.Length % Vector.Count == 0, + nameof(source), + "dest.Length should be divisable by Vector.Count!"); + + int n = source.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + Vector f0 = Vector.ConvertToSingle(w0) * scale; + Vector f1 = Vector.ConvertToSingle(w1) * scale; + Vector f2 = Vector.ConvertToSingle(w2) * scale; + Vector f3 = Vector.ConvertToSingle(w3) * scale; + + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + } + } +} diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Extensions/SimdUtils.cs index 481e0726d..3630ede32 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Extensions/SimdUtils.cs @@ -14,12 +14,12 @@ namespace SixLabors.ImageSharp /// /// Various extension and utility methods for and utilizing SIMD capabilities /// - internal static class SimdUtils + internal static partial class SimdUtils { /// /// Gets a value indicating whether the code is being executed on AVX2 CPU where both float and integer registers are of size 256 byte. /// - public static bool IsAvx2CompatibleArchitecture => Vector.Count == 8 && Vector.Count == 8; + public static bool IsAvx2CompatibleArchitecture { get; } = Vector.IsHardwareAccelerated && Vector.Count == 8 && Vector.Count == 8; internal static void GuardAvx2(string operation) { @@ -61,7 +61,8 @@ namespace SixLabors.ImageSharp /// /// Convert 'source.Length' values normalized into [0..1] from 'source' into 'dest' buffer of values. - /// The values gonna be scaled up into [0-255] and rounded. + /// The values are scaled up into [0-255] and rounded. + /// The implementation is SIMD optimized and works only with `source.Length` divisible by . /// Based on: /// /// http://lolengine.net/blog/2011/3/20/understanding-fast-float-integer-conversions @@ -106,46 +107,13 @@ namespace SixLabors.ImageSharp } /// - /// Fast -> conversion for RyuJIT runtimes having dotnet/coreclr#10662 merged. + /// Converts `dest.Length` bytes to -s to -s normalized into [0..1] + /// The implementation is SIMD optimized and works only with `dest.Length` divisible by . + /// Implementation adapted from: /// - /// https://github.com/dotnet/coreclr/pull/10662 + /// http://stackoverflow.com/a/5362789 /// /// - internal static void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(ReadOnlySpan source, Span dest) - { - Guard.IsTrue( - source.Length % Vector.Count == 0, - nameof(source), - "dest.Length should be divisable by Vector.Count!"); - - int n = source.Length / Vector.Count; - - ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - - var scale = new Vector(1f / 255f); - - for (int i = 0; i < n; i++) - { - Vector b = Unsafe.Add(ref sourceBase, i); - - Vector.Widen(b, out Vector s0, out Vector s1); - Vector.Widen(s0, out Vector w0, out Vector w1); - Vector.Widen(s1, out Vector w2, out Vector w3); - - Vector f0 = Vector.ConvertToSingle(w0) * scale; - Vector f1 = Vector.ConvertToSingle(w1) * scale; - Vector f2 = Vector.ConvertToSingle(w2) * scale; - Vector f3 = Vector.ConvertToSingle(w3) * scale; - - ref Vector d = ref Unsafe.Add(ref destBase, i * 4); - d = f0; - Unsafe.Add(ref d, 1) = f1; - Unsafe.Add(ref d, 2) = f2; - Unsafe.Add(ref d, 3) = f3; - } - } - internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { GuardAvx2(nameof(BulkConvertByteToNormalizedFloat)); @@ -188,7 +156,7 @@ namespace SixLabors.ImageSharp /// internal static void BulkConvertNormalizedFloatToByteClampOverflows(ReadOnlySpan source, Span dest) { - GuardAvx2(nameof(BulkConvertNormalizedFloatToByte)); + GuardAvx2(nameof(BulkConvertNormalizedFloatToByteClampOverflows)); DebugGuard.IsTrue((source.Length % Vector.Count) == 0, nameof(source), "source.Length should be divisable by Vector.Count!"); diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 76e119ba4..6745079da 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -3,7 +3,6 @@ using System; using System.Numerics; -using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using SixLabors.Memory; @@ -19,99 +18,37 @@ namespace SixLabors.ImageSharp.PixelFormats /// internal partial class PixelOperations : PixelOperations { - /// - /// SIMD optimized bulk implementation of - /// that works only with `count` divisible by . - /// - /// The to the source colors. - /// The to the dstination vectors. - /// The number of pixels to convert. - /// - /// Implementation adapted from: - /// - /// http://stackoverflow.com/a/5362789 - /// - /// TODO: We can replace this implementation in the future using new Vector API-s: - /// - /// https://github.com/dotnet/corefx/issues/15957 - /// - /// - internal static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count) - { - if (!Vector.IsHardwareAccelerated) - { - throw new InvalidOperationException( - "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); - } - - DebugGuard.IsTrue( - count % Vector.Count == 0, - nameof(count), - "Argument 'count' should divisible by Vector.Count!"); - - var bVec = new Vector(256.0f / 255.0f); - var magicFloat = new Vector(32768.0f); - var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f - var mask = new Vector(255); - - int unpackedRawCount = count * 4; - - ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors)); - ref WideRgba destBaseAsWide = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); - ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsWide); - ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWide); - - for (int i = 0; i < count; i++) - { - uint sVal = Unsafe.Add(ref sourceBase, i); - ref WideRgba dst = ref Unsafe.Add(ref destBaseAsWide, i); - - // This call is the bottleneck now: - dst.Load(sVal); - } - - int numOfVectors = unpackedRawCount / Vector.Count; - - for (int i = 0; i < numOfVectors; i++) - { - Vector vi = Unsafe.Add(ref destBaseAsUInt, i); - - vi &= mask; - vi |= magicInt; - - var vf = Vector.AsVectorSingle(vi); - vf = (vf - magicFloat) * bVec; - - Unsafe.Add(ref destBaseAsFloat, i) = vf; - } - } - /// internal override void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { Guard.MustBeSizedAtLeast(sourceColors, count, nameof(sourceColors)); Guard.MustBeSizedAtLeast(destinationVectors, count, nameof(destinationVectors)); - if (count < 256 || !Vector.IsHardwareAccelerated) + if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { // Doesn't worth to bother with SIMD: base.ToVector4(sourceColors, destinationVectors, count); return; } - int remainder = count % Vector.Count; + int remainder = count % 2; int alignedCount = count - remainder; if (alignedCount > 0) { - ToVector4SimdAligned(sourceColors, destinationVectors, alignedCount); + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); + Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); + + SimdUtils.BulkConvertByteToNormalizedFloat( + rawSrc, + rawDest); } if (remainder > 0) { - sourceColors = sourceColors.Slice(alignedCount); - destinationVectors = destinationVectors.Slice(alignedCount); - base.ToVector4(sourceColors, destinationVectors, remainder); + // actually: remainder == 1 + int lastIdx = count - 1; + destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); } } @@ -120,7 +57,7 @@ namespace SixLabors.ImageSharp.PixelFormats { GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - if (!SimdUtils.IsAvx2CompatibleArchitecture) + if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { base.PackFromVector4(sourceVectors, destinationColors, count); return; @@ -131,10 +68,10 @@ namespace SixLabors.ImageSharp.PixelFormats if (alignedCount > 0) { - ReadOnlySpan flatSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); - Span flatDest = MemoryMarshal.Cast(destinationColors); + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceVectors.Slice(0, alignedCount)); + Span rawDest = MemoryMarshal.Cast(destinationColors); - SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(flatSrc, flatDest); + SimdUtils.BulkConvertNormalizedFloatToByteClampOverflows(rawSrc, rawDest); } if (remainder > 0) @@ -172,30 +109,6 @@ namespace SixLabors.ImageSharp.PixelFormats sourcePixels.Slice(0, count).CopyTo(dest); } - - /// - /// Value type to store -s widened into multiple -s. - /// - [StructLayout(LayoutKind.Sequential)] - private struct WideRgba - { - private uint r; - - private uint g; - - private uint b; - - private uint a; - - [MethodImpl(MethodImplOptions.AggressiveInlining)] - public void Load(uint p) - { - this.r = p; - this.g = p >> GreenShift; - this.b = p >> BlueShift; - this.a = p >> AlphaShift; - } - } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index a5fa59ba0..bdae7d065 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -23,7 +23,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk private IMemoryOwner destination; - [Params(16, 128, 512)] + [Params( + //64, + 2048)] public int Count { get; set; } [GlobalSetup] diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 4e39af70f..0488dd5e1 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -205,12 +205,12 @@ namespace SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } - + [Theory] [InlineData(1, 0)] [InlineData(2, 32)] [InlineData(3, 128)] - public void BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(int seed, int count) + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count) { if (!Vector.IsHardwareAccelerated) { @@ -221,7 +221,7 @@ namespace SixLabors.ImageSharp.Tests.Common float[] result = new float[count]; float[] expected = source.Select(b => (float)b / 255f).ToArray(); - SimdUtils.BulkConvertByteToNormalizedFloatWithExtendedIntrinsics(source, result); + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f)); } diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index 4d7ec71e7..535952e05 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -17,43 +17,26 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { public class Rgba32 : PixelOperationsTests { + public const string SkipProfilingBenchmarks = +#if true + "Profiling benchmark - enable manually!"; +#else + null; +#endif + public Rgba32(ITestOutputHelper output) : base(output) { } - // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: - public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; - [Fact] public void IsSpecialImplementation() { Assert.IsType(PixelOperations.Instance); } - [Fact] - public void ToVector4SimdAligned() - { - if (!Vector.IsHardwareAccelerated) - { - return; - } - - ImageSharp.PixelFormats.Rgba32[] source = CreatePixelTestData(64); - Vector4[] expected = CreateExpectedVector4Data(source); - - TestOperation( - source, - expected, - (s, d) => ImageSharp.PixelFormats.Rgba32.PixelOperations.ToVector4SimdAligned(s, d.GetSpan(), 64) - ); - } - - - // [Fact] // Profiling benchmark - enable manually! -#pragma warning disable xUnit1013 // Public method should be marked as test + [Fact(Skip = SkipProfilingBenchmarks)] public void Benchmark_ToVector4() -#pragma warning restore xUnit1013 // Public method should be marked as test { int times = 200000; int count = 1024; @@ -73,13 +56,10 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats public class Argb32 : PixelOperationsTests { - // For 4.6 test runner MemberData does not work without redeclaring the public field in the derived test class: public Argb32(ITestOutputHelper output) : base(output) { } - - public static new TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; } [Theory] @@ -110,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { } - public static TheoryData ArraySizesData => new TheoryData { 7, 16, 1111 }; + public static TheoryData ArraySizesData => new TheoryData { 0, 1, 2, 7, 16, 1111 }; private static PixelOperations Operations => PixelOperations.Instance;