diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs index 6def8938a9..3131f18738 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.ExtendedIntrinsics.cs @@ -39,9 +39,8 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - ref Vector destBaseU = ref Unsafe.As, Vector>(ref destBase); - const float Scale = 1f / 255f; + var scale = new Vector(1f / 255f); for (int i = 0; i < n; i++) { @@ -51,26 +50,28 @@ namespace SixLabors.ImageSharp Vector.Widen(s0, out Vector w0, out Vector w1); Vector.Widen(s1, out Vector w2, out Vector w3); - ref Vector d = ref Unsafe.Add(ref destBaseU, i * 4); - d = w0; - Unsafe.Add(ref d, 1) = w1; - Unsafe.Add(ref d, 2) = w2; - Unsafe.Add(ref d, 3) = w3; - } - - n = dest.Length / Vector.Count; + Vector f0 = ConvertToSingle(w0, scale); + Vector f1 = ConvertToSingle(w1, scale); + Vector f2 = ConvertToSingle(w2, scale); + Vector f3 = ConvertToSingle(w3, scale); - for (int i = 0; i < n; i++) - { - ref Vector df = ref Unsafe.Add(ref destBase, i); - ref Vector du = ref Unsafe.As, Vector>(ref df); - - Vector v = Vector.ConvertToSingle(du); - v *= Scale; - df = v; + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; } } + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ConvertToSingle(Vector u, Vector scale) + { + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); + v *= scale; + return v; + } + /// /// A variant of , which is faster on new .NET runtime. 
/// @@ -92,26 +93,21 @@ namespace SixLabors.ImageSharp ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + Vector scale = new Vector(255); + for (int i = 0; i < n; i++) { ref Vector s = ref Unsafe.Add(ref sourceBase, i * 4); Vector f0 = s; - f0 = Clamp(f0); - Vector f1 = Unsafe.Add(ref s, 1); - f1 = Clamp(f1); - Vector f2 = Unsafe.Add(ref s, 2); - f2 = Clamp(f2); - Vector f3 = Unsafe.Add(ref s, 3); - f3 = Clamp(f3); - Vector w0 = Vector.ConvertToUInt32(f0 * 255f); - Vector w1 = Vector.ConvertToUInt32(f1 * 255f); - Vector w2 = Vector.ConvertToUInt32(f2 * 255f); - Vector w3 = Vector.ConvertToUInt32(f3 * 255f); + Vector w0 = ConvertToUInt32(f0, scale); + Vector w1 = ConvertToUInt32(f1, scale); + Vector w2 = ConvertToUInt32(f2, scale); + Vector w3 = ConvertToUInt32(f3, scale); Vector u0 = Vector.Narrow(w0, w1); Vector u1 = Vector.Narrow(w2, w3); @@ -123,9 +119,12 @@ namespace SixLabors.ImageSharp } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static Vector Clamp(Vector x) + private static Vector ConvertToUInt32(Vector vf, Vector scale) { - return Vector.Min(Vector.Max(x, Vector.Zero), Vector.One); + vf = Vector.Min(Vector.Max(vf, Vector.Zero), Vector.One); + vf *= scale; + Vector vi = Vector.ConvertToInt32(vf); + return Vector.AsVectorUInt32(vi); } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs index fb505ddcbd..1153d8f401 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/PackFromVector4.cs @@ -26,7 +26,8 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - 2048)] + 2048 + )] public int Count { get; set; } [GlobalSetup] @@ -43,7 +44,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk this.source.Dispose(); } - [Benchmark] + //[Benchmark] public void 
PerElement() { ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan()); @@ -55,14 +56,14 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - [Benchmark(Baseline = true)] - public void CommonBulk() + [Benchmark] + public void PixelOperations_Base() { new PixelOperations().PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } [Benchmark] - public void OptimizedBulk() + public void PixelOperations_Specialized() { PixelOperations.Instance.PackFromVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } @@ -70,7 +71,30 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk public class PackFromVector4_Rgba32 : PackFromVector4 { - //[Benchmark] + [Benchmark] + public void FastDefault() + { + ref Vector4 sBase = ref this.source.GetSpan()[0]; + ref Rgba32 dBase = ref this.destination.GetSpan()[0]; + + Vector4 maxBytes = new Vector4(255); + Vector4 half = new Vector4(0.5f); + + for (int i = 0; i < this.Count; i++) + { + Vector4 v = Unsafe.Add(ref sBase, i); + v *= maxBytes; + v += half; + v = Vector4.Clamp(v, Vector4.Zero, maxBytes); + ref Rgba32 d = ref Unsafe.Add(ref dBase, i); + d.R = (byte)v.X; + d.G = (byte)v.Y; + d.B = (byte)v.Z; + d.A = (byte)v.W; + } + } + + [Benchmark(Baseline = true)] public void BulkConvertNormalizedFloatToByteClampOverflows() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); @@ -88,29 +112,16 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk SimdUtils.ExtendedIntrinsics.BulkConvertNormalizedFloatToByteClampOverflows(sBytes, dFloats); } - // TODO: Check again later! 
// RESULTS: - // - // BenchmarkDotNet=v0.10.14, OS=Windows 10.0.17134 - // Intel Core i7-7700HQ CPU 2.80GHz (Kaby Lake), 1 CPU, 8 logical and 4 physical cores - // Frequency=2742187 Hz, Resolution=364.6724 ns, Timer=TSC - // .NET Core SDK=2.1.400-preview-009063 - // [Host] : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT - // Job-XIFINS : .NET Framework 4.7.1 (CLR 4.0.30319.42000), 64bit RyuJIT-v4.7.3190.0 - // Job-RTQZPN : .NET Core 2.1.1 (CoreCLR 4.6.26606.02, CoreFX 4.6.26606.05), 64bit RyuJIT - // - // LaunchCount=1 TargetCount=3 WarmupCount=3 + // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | + // ----------------------------------------------------------------- |-------- |------ |----------:|----------:|----------:|-------:|---------:|----------:| + // FastDefault | Clr | 2048 | 15.989 us | 6.1384 us | 0.3468 us | 4.07 | 0.08 | 0 B | + // BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.931 us | 0.6264 us | 0.0354 us | 1.00 | 0.00 | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 2.100 us | 0.4717 us | 0.0267 us | 0.53 | 0.01 | 0 B | // - // Method | Runtime | Count | Mean | Error | StdDev | Scaled | ScaledSD | Allocated | - // ----------------------------------------------------------------- |-------- |------ |----------:|-----------:|----------:|-------:|---------:|----------:| - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Clr | 2048 | 3.755 us | 0.8959 us | 0.0506 us | 0.22 | 0.00 | 0 B | - // PerElement | Clr | 2048 | 17.387 us | 15.1569 us | 0.8564 us | 1.02 | 0.04 | 0 B | - // CommonBulk | Clr | 2048 | 17.121 us | 0.7634 us | 0.0431 us | 1.00 | 0.00 | 24 B | - // OptimizedBulk | Clr | 2048 | 4.018 us | 0.3858 us | 0.0218 us | 0.23 | 0.00 | 0 B | - // | | | | | | | | | - // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 22.232 us | 1.6154 us | 0.0913 us | 1.31 | 0.04 | 0 B | 
- // PerElement | Core | 2048 | 16.741 us | 2.9254 us | 0.1653 us | 0.98 | 0.03 | 0 B | - // CommonBulk | Core | 2048 | 17.022 us | 11.4894 us | 0.6492 us | 1.00 | 0.00 | 24 B | - // OptimizedBulk | Core | 2048 | 3.707 us | 0.1500 us | 0.0085 us | 0.22 | 0.01 | 0 B | + // | | | | | | | | | + // FastDefault | Core | 2048 | 14.693 us | 0.5131 us | 0.0290 us | 3.76 | 0.03 | 0 B | + // BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 3.913 us | 0.5661 us | 0.0320 us | 1.00 | 0.00 | 0 B | + // ExtendedIntrinsic_BulkConvertNormalizedFloatToByteClampOverflows | Core | 2048 | 1.966 us | 0.4056 us | 0.0229 us | 0.50 | 0.01 | 0 B | } } \ No newline at end of file diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs index 6afd3cf6b1..d699d168bb 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ToVector4.cs @@ -29,8 +29,9 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Params( //64, - //512 - 256 + //256, + //512, + 2048 )] public int Count { get; set; } @@ -60,70 +61,214 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - //[Benchmark] - public void CommonBulk() + [Benchmark] + public void PixelOperations_Base() { new PixelOperations().ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } - //[Benchmark] - public void OptimizedBulk() + [Benchmark] + public void PixelOperations_Specialized() { PixelOperations.Instance.ToVector4(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } } - [RyuJitX64Job] - [DisassemblyDiagnoser(printAsm: true, printSource: true)] + [Config(typeof(Config.ShortClr))] public class ToVector4_Rgba32 : ToVector4 { - class Config : ManualConfig - { - } - - [Benchmark(Baseline = true)] - public void FastScalarBulk() + [Benchmark] + public void BasicBulk() { ref Rgba32 sBase = ref this.source.GetSpan()[0]; ref Vector4 dBase = ref 
this.destination.GetSpan()[0]; + Vector4 scale = new Vector4(1f / 255f); + + Vector4 v = default; + for (int i = 0; i < this.Count; i++) { ref Rgba32 s = ref Unsafe.Add(ref sBase, i); - ref Vector4 d = ref Unsafe.Add(ref dBase, i); - d.X = s.R; - d.Y = s.G; - d.Z = s.B; - d.W = s.A; + v.X = s.R; + v.Y = s.G; + v.Z = s.B; + v.W = s.A; + v *= scale; + Unsafe.Add(ref dBase, i) = v; + } + } + + [Benchmark(Baseline = true)] + public void BulkConvertByteToNormalizedFloat_2Loops() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreted value of 32768.0f + var mask = new Vector(255); + + ref SimdUtils.Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref SimdUtils.Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dFloats)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dFloats.Length / 8; + + for (int i = 0; i < n; i++) + { + ref SimdUtils.Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + ref SimdUtils.Octet.OfUInt32 d = ref Unsafe.Add(ref destBaseAsWideOctet, i); + d.LoadFrom(ref s); + } + + for (int i = 0; i < n; i++) + { + ref Vector df = ref Unsafe.Add(ref destBaseAsFloat, i); + + var vi = Vector.AsVectorUInt32(df); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + df = vf; + } + } + + //[Benchmark] + public void BulkConvertByteToNormalizedFloat_ConvertInSameLoop() + { + Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); + Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); + + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreted value of 32768.0f + var mask = new Vector(255); + 
+ ref SimdUtils.Octet.OfByte sourceBase = ref Unsafe.As(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref SimdUtils.Octet.OfUInt32 destBaseAsWideOctet = ref Unsafe.As(ref MemoryMarshal.GetReference(dFloats)); + + ref Vector destBaseAsFloat = ref Unsafe.As>(ref destBaseAsWideOctet); + + int n = dFloats.Length / 8; + + var temp = default(SimdUtils.Octet.OfUInt32); + ref Vector tempRef = ref Unsafe.As>(ref temp); + + for (int i = 0; i < n; i++) + { + ref SimdUtils.Octet.OfByte s = ref Unsafe.Add(ref sourceBase, i); + temp.LoadFrom(ref s); + + Vector vi = tempRef; + + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; + + Unsafe.Add(ref destBaseAsFloat, i) = vf; } } [Benchmark] - public void BulkConvertByteToNormalizedFloat() + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_2Loops() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + int n = dFloats.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dFloats)); + ref Vector destBaseU = ref Unsafe.As, Vector>(ref destBase); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + ref Vector d = ref Unsafe.Add(ref destBaseU, i * 4); + d = w0; + Unsafe.Add(ref d, 1) = w1; + Unsafe.Add(ref d, 2) = w2; + Unsafe.Add(ref d, 3) = w3; + } + + n = dFloats.Length / Vector.Count; + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + ref Vector dRef = ref Unsafe.Add(ref destBase, i); + + Vector du = Vector.AsVectorInt32(dRef); + Vector v = Vector.ConvertToSingle(du); + v *= scale; + + dRef 
= v; + } } [Benchmark] - public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat() + public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat_ConvertInSameLoop() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); - SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(sBytes, dFloats); + int n = dFloats.Length / Vector.Count; + + ref Vector sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference((ReadOnlySpan)sBytes)); + ref Vector destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dFloats)); + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + Vector b = Unsafe.Add(ref sourceBase, i); + + Vector.Widen(b, out Vector s0, out Vector s1); + Vector.Widen(s0, out Vector w0, out Vector w1); + Vector.Widen(s1, out Vector w2, out Vector w3); + + Vector f0 = ConvertToNormalizedSingle(w0, scale); + Vector f1 = ConvertToNormalizedSingle(w1, scale); + Vector f2 = ConvertToNormalizedSingle(w2, scale); + Vector f3 = ConvertToNormalizedSingle(w3, scale); + + ref Vector d = ref Unsafe.Add(ref destBase, i * 4); + d = f0; + Unsafe.Add(ref d, 1) = f1; + Unsafe.Add(ref d, 2) = f2; + Unsafe.Add(ref d, 3) = f3; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static Vector ConvertToNormalizedSingle(Vector u, Vector scale) + { + Vector vi = Vector.AsVectorInt32(u); + Vector v = Vector.ConvertToSingle(vi); + v *= scale; + return v; } //[Benchmark] - public void Original() + public void OldImplementation() { - ToVector4SimdAligned(this.source.GetSpan(), this.destination.GetSpan(), this.Count); + ToVector4OldImplementation(this.source.GetSpan(), this.destination.GetSpan(), this.Count); } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static void ToVector4SimdAligned(ReadOnlySpan sourceColors, Span destVectors, int count) + private static void ToVector4OldImplementation(ReadOnlySpan sourceColors, Span destVectors, int count) { if 
(!Vector.IsHardwareAccelerated) { diff --git a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs index 4a4b939b65..be19e719a8 100644 --- a/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs +++ b/tests/ImageSharp.Benchmarks/General/Vectorization/UInt32ToSingle.cs @@ -9,7 +9,7 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { private float[] data; - private const int Count = 64; + private const int Count = 32; [GlobalSetup] public void Setup() @@ -24,8 +24,10 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization int n = Count / Vector.Count; - Vector magick = new Vector(32768.0f); - Vector scale = new Vector(255f) / new Vector(256f); + var bVec = new Vector(256.0f / 255.0f); + var magicFloat = new Vector(32768.0f); + var magicInt = new Vector(1191182336); // reinterpreted value of 32768.0f + var mask = new Vector(255); for (int i = 0; i < n; i++) { @@ -33,13 +35,16 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization // u.f = 32768.0f + x * (255.0f / 256.0f); // return (uint8_t)u.i; - ref Vector d = ref Unsafe.Add(ref b, i); - Vector x = d; - //x = Vector.Max(x, Vector.Zero); - //x = Vector.Min(x, Vector.One); + ref Vector df = ref Unsafe.Add(ref b, i); + + var vi = Vector.AsVectorUInt32(df); + vi &= mask; + vi |= magicInt; + + var vf = Vector.AsVectorSingle(vi); + vf = (vf - magicFloat) * bVec; - x = (x * scale) + magick; - d = x; + df = vf; } } @@ -48,18 +53,37 @@ namespace SixLabors.ImageSharp.Benchmarks.General.Vectorization { int n = Count / Vector.Count; - ref Vector b = ref Unsafe.As>(ref this.data[0]); + ref Vector bf = ref Unsafe.As>(ref this.data[0]); + ref Vector bu = ref Unsafe.As, Vector>(ref bf); var scale = new Vector(1f / 255f); for (int i = 0; i < n; i++) { - ref Vector df = ref Unsafe.Add(ref b, i); - Vector du = Unsafe.As, Vector>(ref df); + Vector u = Unsafe.Add(ref bu, i); + Vector v = 
Vector.ConvertToSingle(u); + v *= scale; + Unsafe.Add(ref bf, i) = v; + } + } - Vector v = Vector.ConvertToSingle(du); + // This code is not correct at all, it's just here as reference + [Benchmark] + public void StandardSimdFromInt() + { + int n = Count / Vector.Count; + + ref Vector bf = ref Unsafe.As>(ref this.data[0]); + ref Vector bu = ref Unsafe.As, Vector>(ref bf); + + var scale = new Vector(1f / 255f); + + for (int i = 0; i < n; i++) + { + Vector u = Unsafe.Add(ref bu, i); + Vector v = Vector.ConvertToSingle(u); v *= scale; - df = v; + Unsafe.Add(ref bf, i) = v; } } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 4b23ca30f1..7ed18ef86b 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -212,15 +212,11 @@ namespace SixLabors.ImageSharp.Tests.Common [InlineData(3, 128)] public void ExtendedIntrinsics_BulkConvertByteToNormalizedFloat(int seed, int count) { - if (!Vector.IsHardwareAccelerated) - { - return; - } - byte[] source = new Random(seed).GenerateRandomByteArray(count); float[] result = new float[count]; float[] expected = source.Select(b => (float)b / 255f).ToArray(); + SimdUtils.ExtendedIntrinsics.BulkConvertByteToNormalizedFloat(source, result); Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5f));