From 054cc41fc2d42ecf8fd35640ca46e40cc14bfb62 Mon Sep 17 00:00:00 2001
From: Anton Firszov
Date: Sat, 29 Feb 2020 04:40:04 +0100
Subject: [PATCH] Tweak Block8x8F_CopyTo1x1 benchmarks

---
 .../BlockOperations/Block8x8F_CopyTo1x1.cs    | 298 ++++++++++++++++--
 1 file changed, 276 insertions(+), 22 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs
index 21d114be3..3f54d68c9 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs
@@ -4,7 +4,9 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using BenchmarkDotNet.Attributes;
 
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
@@ -13,13 +15,19 @@
 using SixLabors.ImageSharp.Memory;
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
 {
-    public class Block8x8F_CopyTo1x1
+    public unsafe class Block8x8F_CopyTo1x1
     {
         private Block8x8F block;
+        private readonly Block8x8F[] blockArray = new Block8x8F[1];
 
-        private Buffer2D<float> buffer;
+        private static readonly int Width = 100;
 
-        private BufferArea<float> destArea;
+        private float[] buffer = new float[Width * 500];
+        private readonly float[] unpinnedBuffer = new float[Width * 500];
+        private GCHandle bufferHandle;
+        private GCHandle blockHandle;
+        private float* bufferPtr;
+        private float* blockPtr;
 
         [GlobalSetup]
         public void Setup()
@@ -29,16 +37,30 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
                 throw new InvalidOperationException("Benchmark Block8x8F_CopyTo1x1 is invalid on platforms without AVX2 support.");
             }
 
-            this.buffer = Configuration.Default.MemoryAllocator.Allocate2D<float>(1000, 500);
-            this.destArea = this.buffer.GetArea(200, 100, 64, 64);
+            this.bufferHandle = GCHandle.Alloc(this.buffer, GCHandleType.Pinned);
+            this.bufferPtr = (float*)this.bufferHandle.AddrOfPinnedObject();
+
+            // Pin the block array so that we can take the address of the block:
+            this.blockHandle = GCHandle.Alloc(this.blockArray, GCHandleType.Pinned);
+            this.blockPtr = (float*)Unsafe.AsPointer(ref this.block);
+        }
+
+        [GlobalCleanup]
+        public void Cleanup()
+        {
+            this.bufferPtr = null;
+            this.blockPtr = null;
+            this.bufferHandle.Free();
+            this.blockHandle.Free();
+            this.buffer = null;
         }
 
         [Benchmark(Baseline = true)]
         public void Original()
         {
             ref byte selfBase = ref Unsafe.As<Block8x8F, byte>(ref this.block);
-            ref byte destBase = ref Unsafe.As<float, byte>(ref this.destArea.GetReferenceToOrigin());
-            int destStride = this.destArea.Stride * sizeof(float);
+            ref byte destBase = ref Unsafe.AsRef<byte>(this.bufferPtr);
+            int destStride = Width * sizeof(float);
 
             CopyRowImpl(ref selfBase, ref destBase, destStride, 0);
             CopyRowImpl(ref selfBase, ref destBase, destStride, 1);
@@ -58,12 +80,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             Unsafe.CopyBlock(ref d, ref s, 8 * sizeof(float));
         }
 
-        [Benchmark]
+        // [Benchmark]
         public void UseVector8()
         {
             ref Block8x8F s = ref this.block;
-            ref float origin = ref this.destArea.GetReferenceToOrigin();
-            int stride = this.destArea.Stride;
+            ref float origin = ref Unsafe.AsRef<float>(this.bufferPtr);
+            int stride = Width;
 
             ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin);
             ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride));
@@ -93,12 +115,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             d7 = row7;
         }
 
-        [Benchmark]
+        // [Benchmark]
         public void UseVector8_V2()
         {
             ref Block8x8F s = ref this.block;
-            ref float origin = ref this.destArea.GetReferenceToOrigin();
-            int stride = this.destArea.Stride;
+            ref float origin = ref Unsafe.AsRef<float>(this.bufferPtr);
+            int stride = Width;
 
             ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin);
             ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride));
@@ -119,15 +141,247 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             d7 = Unsafe.As<Vector4, Vector<float>>(ref s.V7L);
         }
 
-        // RESULTS:
+        [Benchmark]
+        public void UseVector8_V3()
+        {
+            int stride = Width * sizeof(float);
+            ref float d = ref this.unpinnedBuffer[0];
+            ref Vector<float> s = ref Unsafe.As<Block8x8F, Vector<float>>(ref this.block);
+
+            Vector<float> v0 = s;
+            Vector<float> v1 = Unsafe.Add(ref s, 1);
+            Vector<float> v2 = Unsafe.Add(ref s, 2);
+            Vector<float> v3 = Unsafe.Add(ref s, 3);
+
+            Unsafe.As<float, Vector<float>>(ref d) = v0;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)stride)) = v1;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 2))) = v2;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 3))) = v3;
+
+            v0 = Unsafe.Add(ref s, 4);
+            v1 = Unsafe.Add(ref s, 5);
+            v2 = Unsafe.Add(ref s, 6);
+            v3 = Unsafe.Add(ref s, 7);
+
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 4))) = v0;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 5))) = v1;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 6))) = v2;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 7))) = v3;
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Benchmark]
+        public void UseVector256_Avx2_Variant1()
+        {
+            int stride = Width;
+            float* d = this.bufferPtr;
+            float* s = this.blockPtr;
+            Vector256<float> v;
+
+            v = Avx.LoadVector256(s);
+            Avx.Store(d, v);
+
+            v = Avx.LoadVector256(s + 8);
+            Avx.Store(d + stride, v);
+
+            v = Avx.LoadVector256(s + (8 * 2));
+            Avx.Store(d + (stride * 2), v);
+
+            v = Avx.LoadVector256(s + (8 * 3));
+            Avx.Store(d + (stride * 3), v);
+
+            v = Avx.LoadVector256(s + (8 * 4));
+            Avx.Store(d + (stride * 4), v);
+
+            v = Avx.LoadVector256(s + (8 * 5));
+            Avx.Store(d + (stride * 5), v);
+
+            v = Avx.LoadVector256(s + (8 * 6));
+            Avx.Store(d + (stride * 6), v);
+
+            v = Avx.LoadVector256(s + (8 * 7));
+            Avx.Store(d + (stride * 7), v);
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant2()
+        {
+            int stride = Width;
+            float* d = this.bufferPtr;
+            float* s = this.blockPtr;
+
+            Vector256<float> v0 = Avx.LoadVector256(s);
+            Vector256<float> v1 = Avx.LoadVector256(s + 8);
+            Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2));
+            Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3));
+            Vector256<float> v4 = Avx.LoadVector256(s + (8 * 4));
+            Vector256<float> v5 = Avx.LoadVector256(s + (8 * 5));
+            Vector256<float> v6 = Avx.LoadVector256(s + (8 * 6));
+            Vector256<float> v7 = Avx.LoadVector256(s + (8 * 7));
+
+            Avx.Store(d, v0);
+            Avx.Store(d + stride, v1);
+            Avx.Store(d + (stride * 2), v2);
+            Avx.Store(d + (stride * 3), v3);
+            Avx.Store(d + (stride * 4), v4);
+            Avx.Store(d + (stride * 5), v5);
+            Avx.Store(d + (stride * 6), v6);
+            Avx.Store(d + (stride * 7), v7);
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant3()
+        {
+            int stride = Width;
+            float* d = this.bufferPtr;
+            float* s = this.blockPtr;
+
+            Vector256<float> v0 = Avx.LoadVector256(s);
+            Vector256<float> v1 = Avx.LoadVector256(s + 8);
+            Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2));
+            Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3));
+            Avx.Store(d, v0);
+            Avx.Store(d + stride, v1);
+            Avx.Store(d + (stride * 2), v2);
+            Avx.Store(d + (stride * 3), v3);
+
+            v0 = Avx.LoadVector256(s + (8 * 4));
+            v1 = Avx.LoadVector256(s + (8 * 5));
+            v2 = Avx.LoadVector256(s + (8 * 6));
+            v3 = Avx.LoadVector256(s + (8 * 7));
+            Avx.Store(d + (stride * 4), v0);
+            Avx.Store(d + (stride * 5), v1);
+            Avx.Store(d + (stride * 6), v2);
+            Avx.Store(d + (stride * 7), v3);
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant3_RefCast()
+        {
+            int stride = Width;
+            ref float d = ref this.unpinnedBuffer[0];
+            ref Vector256<float> s = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this.block);
+
+            Vector256<float> v0 = s;
+            Vector256<float> v1 = Unsafe.Add(ref s, 1);
+            Vector256<float> v2 = Unsafe.Add(ref s, 2);
+            Vector256<float> v3 = Unsafe.Add(ref s, 3);
+
+            Unsafe.As<float, Vector256<float>>(ref d) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride)) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 2)) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 3)) = v3;
+
+            v0 = Unsafe.Add(ref s, 4);
+            v1 = Unsafe.Add(ref s, 5);
+            v2 = Unsafe.Add(ref s, 6);
+            v3 = Unsafe.Add(ref s, 7);
+
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 4)) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 5)) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 6)) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 7)) = v3;
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant3_RefCast_Mod()
+        {
+            int stride = Width * sizeof(float);
+            ref float d = ref this.unpinnedBuffer[0];
+            ref Vector256<float> s = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this.block);
+
+            Vector256<float> v0 = s;
+            Vector256<float> v1 = Unsafe.Add(ref s, 1);
+            Vector256<float> v2 = Unsafe.Add(ref s, 2);
+            Vector256<float> v3 = Unsafe.Add(ref s, 3);
+
+            Unsafe.As<float, Vector256<float>>(ref d) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)stride)) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 2))) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 3))) = v3;
+
+            v0 = Unsafe.Add(ref s, 4);
+            v1 = Unsafe.Add(ref s, 5);
+            v2 = Unsafe.Add(ref s, 6);
+            v3 = Unsafe.Add(ref s, 7);
+
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 4))) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 5))) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 6))) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 7))) = v3;
+        }
+
+        // [Benchmark]
+        public void UseVector256_Avx2_Variant3_WithLocalPinning()
+        {
+            int stride = Width;
+            fixed (float* d = this.unpinnedBuffer)
+            fixed (Block8x8F* ss = &this.block)
+            {
+                var s = (float*)ss;
+                Vector256<float> v0 = Avx.LoadVector256(s);
+                Vector256<float> v1 = Avx.LoadVector256(s + 8);
+                Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2));
+                Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3));
+                Avx.Store(d, v0);
+                Avx.Store(d + stride, v1);
+                Avx.Store(d + (stride * 2), v2);
+                Avx.Store(d + (stride * 3), v3);
+
+                v0 = Avx.LoadVector256(s + (8 * 4));
+                v1 = Avx.LoadVector256(s + (8 * 5));
+                v2 = Avx.LoadVector256(s + (8 * 6));
+                v3 = Avx.LoadVector256(s + (8 * 7));
+                Avx.Store(d + (stride * 4), v0);
+                Avx.Store(d + (stride * 5), v1);
+                Avx.Store(d + (stride * 6), v2);
+                Avx.Store(d + (stride * 7), v3);
+            }
+        }
+
+        // [Benchmark]
+        public void UseVector256_Avx2_Variant3_sbyte()
+        {
+            int stride = Width * 4;
+            var d = (sbyte*)this.bufferPtr;
+            var s = (sbyte*)this.blockPtr;
+
+            Vector256<sbyte> v0 = Avx.LoadVector256(s);
+            Vector256<sbyte> v1 = Avx.LoadVector256(s + 32);
+            Vector256<sbyte> v2 = Avx.LoadVector256(s + (32 * 2));
+            Vector256<sbyte> v3 = Avx.LoadVector256(s + (32 * 3));
+            Avx.Store(d, v0);
+            Avx.Store(d + stride, v1);
+            Avx.Store(d + (stride * 2), v2);
+            Avx.Store(d + (stride * 3), v3);
+
+            v0 = Avx.LoadVector256(s + (32 * 4));
+            v1 = Avx.LoadVector256(s + (32 * 5));
+            v2 = Avx.LoadVector256(s + (32 * 6));
+            v3 = Avx.LoadVector256(s + (32 * 7));
+            Avx.Store(d + (stride * 4), v0);
+            Avx.Store(d + (stride * 5), v1);
+            Avx.Store(d + (stride * 6), v2);
+            Avx.Store(d + (stride * 7), v3);
+        }
+#endif
+
+        // *** RESULTS 02/2020 ***
+        // BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
+        // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+        // .NET Core SDK=3.1.200-preview-014971
+        //   [Host]     : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
+        //   DefaultJob : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
         //
-        //        Method |     Mean |     Error |    StdDev | Scaled |
-        // -------------- |---------:|----------:|----------:|-------:|
-        //      Original | 22.53 ns | 0.1660 ns | 0.1553 ns |   1.00 |
-        //    UseVector8 | 21.59 ns | 0.3079 ns | 0.2571 ns |   0.96 |
-        // UseVector8_V2 | 22.57 ns | 0.1699 ns | 0.1506 ns |   1.00 |
-        //
-        // Conclusion:
-        // Doesn't worth to bother with this
+        // |                                 Method |     Mean |     Error |    StdDev | Ratio | RatioSD |
+        // |--------------------------------------- |---------:|----------:|----------:|------:|--------:|
+        // |                               Original | 4.012 ns | 0.0567 ns | 0.0531 ns |  1.00 |    0.00 |
+        // |                          UseVector8_V3 | 4.013 ns | 0.0947 ns | 0.0840 ns |  1.00 |    0.03 |
+        // |             UseVector256_Avx2_Variant1 | 2.546 ns | 0.0376 ns | 0.0314 ns |  0.63 |    0.01 |
+        // |             UseVector256_Avx2_Variant2 | 2.643 ns | 0.0162 ns | 0.0151 ns |  0.66 |    0.01 |
+        // |             UseVector256_Avx2_Variant3 | 2.520 ns | 0.0760 ns | 0.0813 ns |  0.63 |    0.02 |
+        // |     UseVector256_Avx2_Variant3_RefCast | 2.300 ns | 0.0877 ns | 0.0938 ns |  0.58 |    0.03 |
+        // | UseVector256_Avx2_Variant3_RefCast_Mod | 2.139 ns | 0.0698 ns | 0.0686 ns |  0.53 |    0.02 |
     }
 }
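A note for readers, outside the patch itself: the table singles out UseVector256_Avx2_Variant3_RefCast_Mod as the fastest variant. The sketch below distills its access pattern into a standalone helper so it can be studied in isolation. This is a minimal sketch, not code from the PR; the class and method names are illustrative, and it assumes an x64 .NET Core 3.x runtime where Vector256<float> maps to a 32-byte AVX register.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

internal static class BlockCopySketch
{
    // Copies an 8x8 block (64 contiguous floats starting at 'src') into a
    // strided destination. Each Vector256<float> covers one 8-float row;
    // Unsafe.Add steps the source by whole vectors (32 bytes per step),
    // while Unsafe.AddByteOffset steps the destination by a raw byte
    // stride, mirroring the "_RefCast_Mod" destination addressing.
    public static void CopyBlockTo(ref float src, ref float dest, int destStrideBytes)
    {
        ref Vector256<float> s = ref Unsafe.As<float, Vector256<float>>(ref src);

        for (int row = 0; row < 8; row++)
        {
            Unsafe.As<float, Vector256<float>>(
                ref Unsafe.AddByteOffset(ref dest, (IntPtr)(row * destStrideBytes)))
                = Unsafe.Add(ref s, row);
        }
    }
}

Copying a block to position (x, y) of a Width-wide float buffer would then look like CopyBlockTo(ref block[0], ref buffer[(y * Width) + x], Width * sizeof(float)). Unlike this loop, the benchmarked variants unroll all eight rows by hand, which encourages RyuJIT to emit straight-line vector loads and stores.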
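The results table was produced by BenchmarkDotNet. To reproduce it after applying the patch, a plain BenchmarkDotNet entry point is enough; this is a generic sketch (the benchmark project's own entry point may differ), using only the standard BenchmarkRunner API:

using BenchmarkDotNet.Running;
using SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations;

internal static class Program
{
    private static void Main()
    {
        // Runs every method of the class currently marked [Benchmark]
        // and prints a summary table like the one above.
        BenchmarkRunner.Run<Block8x8F_CopyTo1x1>();
    }
}

Only methods whose [Benchmark] attribute is not commented out appear in the table, which is why UseVector8, UseVector8_V2, the local-pinning variant, and the sbyte variant are absent from the results.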