From 8f4e8a663a9c37347d91917a98ab4e95f709f016 Mon Sep 17 00:00:00 2001 From: Anton Firszov Date: Fri, 19 Oct 2018 13:14:33 +0200 Subject: [PATCH] cleanup --- .../SimdUtils.ExtendedIntrinsics.cs | 32 +++-- .../{Extensions => Helpers}/SimdUtils.cs | 18 +-- .../PixelFormats/PixelOperations{TPixel}.cs | 56 +++++---- .../PixelFormats/Rgba32.PixelOperations.cs | 69 ++++++++--- .../Color/Bulk/ToVector4.cs | 112 ++++++++++++++++-- .../PixelFormats/PixelOperationsTests.cs | 2 +- 6 files changed, 220 insertions(+), 69 deletions(-) rename src/ImageSharp/Common/{Extensions => Helpers}/SimdUtils.ExtendedIntrinsics.cs (81%) rename src/ImageSharp/Common/{Extensions => Helpers}/SimdUtils.cs (96%) diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs similarity index 81% rename from src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs rename to src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index fba54b033..6def8938a 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -18,7 +18,7 @@ namespace SixLabors.ImageSharp { public static bool IsAvailable { get; } = #if NETCOREAPP2_1 -// TODO: Add a build target for .NET 4.7.2 +// TODO: Also available in .NET 4.7.2, we need to add a build target! true; #else false; @@ -31,14 +31,15 @@ namespace SixLabors.ImageSharp internal static void BulkConvertByteToNormalizedFloat(ReadOnlySpan source, Span dest) { Guard.IsTrue( - source.Length % Vector.Count == 0, + dest.Length % Vector.Count == 0, nameof(source), "dest.Length should be divisable by Vector.Count!"); - int n = source.Length / Vector.Count; + int n = dest.Length / Vector.Count; ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + ref Vector destBaseU = ref Unsafe.As, Vector>(ref destBase); const float Scale = 1f / 255f; @@ -50,16 +51,23 @@ namespace SixLabors.ImageSharp Vector.Widen(s0, out Vector w0, out Vector w1); Vector.Widen(s1, out Vector w2, out Vector w3); - Vector f0 = Vector.ConvertToSingle(w0) * Scale; - Vector f1 = Vector.ConvertToSingle(w1) * Scale; - Vector f2 = Vector.ConvertToSingle(w2) * Scale; - Vector f3 = Vector.ConvertToSingle(w3) * Scale; + ref Vector d = ref Unsafe.Add(ref destBaseU, i * 4); + d = w0; + Unsafe.Add(ref d, 1) = w1; + Unsafe.Add(ref d, 2) = w2; + Unsafe.Add(ref d, 3) = w3; + } + + n = dest.Length / Vector.Count; + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBase, i); + ref Vector du = ref Unsafe.As, Vector>(ref df); - ref Vector d = ref Unsafe.Add(ref destBase, i * 4); - d = f0; - Unsafe.Add(ref d, 1) = f1; - Unsafe.Add(ref d, 2) = f2; - Unsafe.Add(ref d, 3) = f3; + Vector v = Vector.ConvertToSingle(du); + v *= Scale; + df = v; } } diff --git a/src/ImageSharp/Common/Extensions/SimdUtils.cs b/src/ImageSharp/Common/Helpers/SimdUtils.cs similarity index 96% rename from src/ImageSharp/Common/Extensions/SimdUtils.cs rename to src/ImageSharp/Common/Helpers/SimdUtils.cs index 3630ede32..91aed8c79 100644 --- a/src/ImageSharp/Common/Extensions/SimdUtils.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.cs @@ -2,13 +2,10 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Diagnostics; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; -using SixLabors.ImageSharp.PixelFormats; - namespace SixLabors.ImageSharp { /// @@ -131,23 +128,26 @@ namespace SixLabors.ImageSharp ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); int n = dest.Length / 8; - Octet.OfUInt32 temp = default; for (int i = 0; i < n; i++) { - Octet.OfByte sVal = Unsafe.Add(ref sourceBase, i); + ref Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + ref Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); + d.LoadFrom(ref s); + } - // This call is the bottleneck now: - temp.LoadFrom(ref sVal); + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); - Vector vi = Unsafe.As>(ref temp); + var vi = Vector.AsVectorUInt32(df); vi &= mask; vi |= magicInt; var vf = Vector.AsVectorSingle(vi); vf = (vf - magicFloat) * bVec; - Unsafe.Add(ref destBaseAsFloat, i) = vf; + df = vf; } } diff --git a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs index b12a2bfa5..39c442fe0 100644 --- a/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs +++ b/src/ImageSharp/PixelFormats/PixelOperations{TPixel}.cs @@ -29,17 +29,7 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void PackFromVector4(ReadOnlySpan sourceVectors, Span destinationColors, int count) { - GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); - - ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); - ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); - - for (int i = 0; i < count; i++) - { - ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); - ref TPixel dp = ref Unsafe.Add(ref destRef, i); - dp.PackFromVector4(sp); - } + PackFromVector4Common(sourceVectors, destinationColors, count); } /// @@ -50,17 +40,7 @@ namespace SixLabors.ImageSharp.PixelFormats /// The number of pixels to convert. internal virtual void ToVector4(ReadOnlySpan sourceColors, Span destinationVectors, int count) { - GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); - - ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); - ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); - - for (int i = 0; i < count; i++) - { - ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); - ref Vector4 dp = ref Unsafe.Add(ref destRef, i); - dp = sp.ToVector4(); - } + ToVector4Common(sourceColors, destinationVectors, count); } /// @@ -126,5 +106,37 @@ namespace SixLabors.ImageSharp.PixelFormats Guard.MustBeSizedAtLeast(source, minLength, sourceParamName); Guard.MustBeSizedAtLeast(destination, minLength, destinationParamName); } + + [MethodImpl(InliningOptions.ShortMethod)] + internal static void PackFromVector4Common(ReadOnlySpan sourceVectors, Span destinationColors, int count) + { + GuardSpans(sourceVectors, nameof(sourceVectors), destinationColors, nameof(destinationColors), count); + + ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceVectors); + ref TPixel destRef = ref MemoryMarshal.GetReference(destinationColors); + + for (int i = 0; i < count; i++) + { + ref Vector4 sp = ref Unsafe.Add(ref sourceRef, i); + ref TPixel dp = ref Unsafe.Add(ref destRef, i); + dp.PackFromVector4(sp); + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + internal static void ToVector4Common(ReadOnlySpan sourceColors, Span destinationVectors, int count) + { + GuardSpans(sourceColors, nameof(sourceColors), destinationVectors, nameof(destinationVectors), count); + + ref TPixel sourceRef = ref MemoryMarshal.GetReference(sourceColors); + ref Vector4 destRef = ref MemoryMarshal.GetReference(destinationVectors); + + for (int i = 0; i < count; i++) + { + ref TPixel sp = ref Unsafe.Add(ref sourceRef, i); + ref Vector4 dp = ref Unsafe.Add(ref destRef, i); + dp = sp.ToVector4(); + } + } } } \ No newline at end of file diff --git a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs index 6745079da..0b96a599b 100644 --- a/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs +++ b/src/ImageSharp/PixelFormats/Rgba32.PixelOperations.cs @@ -27,28 +27,17 @@ namespace SixLabors.ImageSharp.PixelFormats if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { // Doesn't worth to bother with SIMD: - base.ToVector4(sourceColors, destinationVectors, count); + ToVector4Common(sourceColors, destinationVectors, count); return; } - int remainder = count % 2; - int alignedCount = count - remainder; - - if (alignedCount > 0) + if (SimdUtils.ExtendedIntrinsics.IsAvailable) { - ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); - Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); - - SimdUtils.BulkConvertByteToNormalizedFloat( - rawSrc, - rawDest); + ConvertToVector4UsingExtendedIntrinsics(sourceColors, destinationVectors, count); } - - if (remainder > 0) + else { - // actually: remainder == 1 - int lastIdx = count - 1; - destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); + ConvertToVector4UsingStandardIntrinsics(sourceColors, destinationVectors, count); } } @@ -59,7 +48,7 @@ namespace SixLabors.ImageSharp.PixelFormats if (count < 128 || !SimdUtils.IsAvx2CompatibleArchitecture) { - base.PackFromVector4(sourceVectors, destinationColors, count); + PackFromVector4Common(sourceVectors, destinationColors, count); return; } @@ -109,6 +98,52 @@ namespace SixLabors.ImageSharp.PixelFormats sourcePixels.Slice(0, count).CopyTo(dest); } + + private static void ConvertToVector4UsingExtendedIntrinsics( + ReadOnlySpan sourceColors, + Span destinationVectors, + int count) + { + int remainder = count % 8; + int alignedCount = count - remainder; + + if (alignedCount > 0) + { + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); + Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); + + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); + } + + if (remainder > 0) + { + ToVector4Common(sourceColors.Slice(alignedCount), destinationVectors.Slice(alignedCount), remainder); + } + } + + private static void ConvertToVector4UsingStandardIntrinsics( + ReadOnlySpan sourceColors, + Span destinationVectors, + int count) + { + int remainder = count % 2; + int alignedCount = count - remainder; + + if (alignedCount > 0) + { + ReadOnlySpan rawSrc = MemoryMarshal.Cast(sourceColors); + Span rawDest = MemoryMarshal.Cast(destinationVectors.Slice(0, alignedCount)); + + SimdUtils.BulkConvertByteToNormalizedFloat(rawSrc, rawDest); + } + + if (remainder > 0) + { + // actually: remainder == 1 + int lastIdx = count - 1; + destinationVectors[lastIdx] = sourceColors[lastIdx].ToVector4(); + } + } } } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 39c1fbd47..6afd3cf6b 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -6,6 +6,7 @@ using System.Buffers; using System; using System.Numerics; +using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using BenchmarkDotNet.Attributes; @@ -28,7 +29,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - 2048)] + //512 + 256 + )] public int Count { get; set; } [GlobalSetup] @@ -45,7 +48,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk this.destination.Dispose(); } - [Benchmark] + //[Benchmark] public void PerElement() { Span s = this.source.GetSpan(); @@ -53,32 +56,48 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk for (int i = 0; i < this.Count; i++) { - TPixel c = s[i]; - d[i] = c.ToVector4(); + d[i] = s[i].ToVector4(); } } - [Benchmark(Baseline = true)] + //[Benchmark] public void CommonBulk() { new PixelOperations().ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } - [Benchmark] + //[Benchmark] public void OptimizedBulk() { PixelOperations.Instance.ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } } - [CoreJob] - [ClrJob] + [RyuJitX64Job] + [DisassemblyDiagnoser(printAsm: true, printSource: true)] public class ToVector4_Rgba32 : ToVector4 { class Config : ManualConfig { } + [Benchmark(Baseline = true)] + public void FastScalarBulk() + { + ref Rgba32 sBase = ref this.source.GetSpan()[0]; + ref Vector4 dBase = ref this.destination.GetSpan()[0]; + + for (int i = 0; i < this.Count; i++) + { + ref Rgba32 s = ref Unsafe.Add(ref sBase, i); + ref Vector4 d = ref Unsafe.Add(ref dBase, i); + d.X = s.R; + d.Y = s.G; + d.Z = s.B; + d.W = s.A; + } + } + [Benchmark] public void BulkConvertByteToNormalizedFloat() { @@ -97,5 +116,82 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); } + //[Benchmark] + public void Original() + { + ToVector4SimdAligned(this.source.GetSpan(), this.destination.GetSpan(), this.Count); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count) + { + if (!Vector.IsHardwareAccelerated) + { + throw new InvalidOperationException( + "Rgba32.PixelOperations.ToVector4SimdAligned() should not be called when Vector.IsHardwareAccelerated == false!"); + } + + DebugGuard.IsTrue( + count % Vector.Count == 0, + nameof(count), + "Argument 'count' should divisible by Vector.Count!"); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreded value of 32768.0f + var mask = new Vector(255); + + int unpackedRawCount = count * 4; + + ref uint sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference(sourceColors)); + ref UnpackedRGBA destBaseAsUnpacked = ref Unsafe.As(ref MemoryMarshal.GetReference(destVectors)); + ref Vector destBaseAsUInt = ref Unsafe.As>(ref destBaseAsUnpacked); + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsUnpacked); + + for (int i = 0; i < count; i++) + { + uint sVal = Unsafe.Add(ref sourceBase, i); + ref UnpackedRGBA dst = ref Unsafe.Add(ref destBaseAsUnpacked, i); + + // This call is the bottleneck now: + dst.Load(sVal); + } + + int numOfVectors = unpackedRawCount / Vector.Count; + + for (int i = 0; i < numOfVectors; i++) + { + Vector vi = Unsafe.Add(ref destBaseAsUInt, i); + + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + Unsafe.Add(ref destBaseAsFloat, i) = vf; + } + } + + [StructLayout(LayoutKind.Sequential)] + private struct UnpackedRGBA + { + private uint r; + + private uint g; + + private uint b; + + private uint a; + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Load(uint p) + { + this.r = p; + this.g = p >> 8; + this.b = p >> 16; + this.a = p >> 24; + } + } } } \ No newline at end of file diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs index 535952e05..abf764881 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelOperationsTests.cs @@ -90,7 +90,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { } - public static TheoryData ArraySizesData => new TheoryData { 0, 1, 2, 7, 16, 1111 }; + public static TheoryData ArraySizesData => new TheoryData { 0, 1, 2, 7, 16, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 1111 }; private static PixelOperations Operations => PixelOperations.Instance;