From 054cc41fc2d42ecf8fd35640ca46e40cc14bfb62 Mon Sep 17 00:00:00 2001
From: Anton Firszov
Date: Sat, 29 Feb 2020 04:40:04 +0100
Subject: [PATCH] Tweak Block8x8F_CopyTo1x1 benchmarks

---
 .../BlockOperations/Block8x8F_CopyTo1x1.cs    | 298 ++++++++++++++++--
 1 file changed, 276 insertions(+), 22 deletions(-)

diff --git a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs
index 21d114be3..3f54d68c9 100644
--- a/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs
+++ b/tests/ImageSharp.Benchmarks/Codecs/Jpeg/BlockOperations/Block8x8F_CopyTo1x1.cs
@@ -4,7 +4,9 @@
 using System;
 using System.Numerics;
 using System.Runtime.CompilerServices;
-
+using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using BenchmarkDotNet.Attributes;
 
 using SixLabors.ImageSharp.Formats.Jpeg.Components;
@@ -13,13 +15,19 @@
 using SixLabors.ImageSharp.Memory;
 
 // ReSharper disable InconsistentNaming
 namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
 {
-    public class Block8x8F_CopyTo1x1
+    public unsafe class Block8x8F_CopyTo1x1
     {
         private Block8x8F block;
+        private readonly Block8x8F[] blockArray = new Block8x8F[1];
 
-        private Buffer2D<float> buffer;
+        private static readonly int Width = 100;
 
-        private BufferArea<float> destArea;
+        private float[] buffer = new float[Width * 500];
+        private readonly float[] unpinnedBuffer = new float[Width * 500];
+        private GCHandle bufferHandle;
+        private GCHandle blockHandle;
+        private float* bufferPtr;
+        private float* blockPtr;
 
         [GlobalSetup]
         public void Setup()
@@ -29,16 +37,30 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
                 throw new InvalidOperationException("Benchmark Block8x8F_CopyTo1x1 is invalid on platforms without AVX2 support.");
             }
 
-            this.buffer = Configuration.Default.MemoryAllocator.Allocate2D<float>(1000, 500);
-            this.destArea = this.buffer.GetArea(200, 100, 64, 64);
+            this.bufferHandle = GCHandle.Alloc(this.buffer, GCHandleType.Pinned);
+            this.bufferPtr = (float*)this.bufferHandle.AddrOfPinnedObject();
+
+            // Pin the block array so that we can take the address of the block:
+            this.blockHandle = GCHandle.Alloc(this.blockArray, GCHandleType.Pinned);
+            this.blockPtr = (float*)Unsafe.AsPointer(ref this.block);
+        }
+
+        [GlobalCleanup]
+        public void Cleanup()
+        {
+            this.bufferPtr = null;
+            this.blockPtr = null;
+            this.bufferHandle.Free();
+            this.blockHandle.Free();
+            this.buffer = null;
         }
 
         [Benchmark(Baseline = true)]
         public void Original()
         {
             ref byte selfBase = ref Unsafe.As<Block8x8F, byte>(ref this.block);
-            ref byte destBase = ref Unsafe.As<float, byte>(ref this.destArea.GetReferenceToOrigin());
-            int destStride = this.destArea.Stride * sizeof(float);
+            ref byte destBase = ref Unsafe.AsRef<byte>(this.bufferPtr);
+            int destStride = Width * sizeof(float);
 
             CopyRowImpl(ref selfBase, ref destBase, destStride, 0);
             CopyRowImpl(ref selfBase, ref destBase, destStride, 1);
@@ -58,12 +80,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             Unsafe.CopyBlock(ref d, ref s, 8 * sizeof(float));
         }
 
-        [Benchmark]
+        // [Benchmark]
         public void UseVector8()
         {
             ref Block8x8F s = ref this.block;
-            ref float origin = ref this.destArea.GetReferenceToOrigin();
-            int stride = this.destArea.Stride;
+            ref float origin = ref Unsafe.AsRef<float>(this.bufferPtr);
+            int stride = Width;
 
             ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin);
             ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride));
@@ -93,12 +115,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             d7 = row7;
         }
 
-        [Benchmark]
+        // [Benchmark]
         public void UseVector8_V2()
         {
             ref Block8x8F s = ref this.block;
-            ref float origin = ref this.destArea.GetReferenceToOrigin();
-            int stride = this.destArea.Stride;
+            ref float origin = ref Unsafe.AsRef<float>(this.bufferPtr);
+            int stride = Width;
 
             ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin);
             ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride));
@@ -119,15 +141,247 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations
             d7 = Unsafe.As<Vector4, Vector<float>>(ref s.V7L);
         }
 
-        // RESULTS:
+        [Benchmark]
+        public void UseVector8_V3()
+        {
+            int stride = Width * sizeof(float);
+            ref float d = ref this.unpinnedBuffer[0];
+            ref Vector<float> s = ref Unsafe.As<Block8x8F, Vector<float>>(ref this.block);
+
+            Vector<float> v0 = s;
+            Vector<float> v1 = Unsafe.Add(ref s, 1);
+            Vector<float> v2 = Unsafe.Add(ref s, 2);
+            Vector<float> v3 = Unsafe.Add(ref s, 3);
+
+            Unsafe.As<float, Vector<float>>(ref d) = v0;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)stride)) = v1;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 2))) = v2;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 3))) = v3;
+
+            v0 = Unsafe.Add(ref s, 4);
+            v1 = Unsafe.Add(ref s, 5);
+            v2 = Unsafe.Add(ref s, 6);
+            v3 = Unsafe.Add(ref s, 7);
+
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 4))) = v0;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 5))) = v1;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 6))) = v2;
+            Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 7))) = v3;
+        }
+
+#if SUPPORTS_RUNTIME_INTRINSICS
+        [Benchmark]
+        public void UseVector256_Avx2_Variant1()
+        {
+            int stride = Width;
+            float* d = this.bufferPtr;
+            float* s = this.blockPtr;
+            Vector256<float> v;
+
+            v = Avx.LoadVector256(s);
+            Avx.Store(d, v);
+
+            v = Avx.LoadVector256(s + 8);
+            Avx.Store(d + stride, v);
+
+            v = Avx.LoadVector256(s + (8 * 2));
+            Avx.Store(d + (stride * 2), v);
+
+            v = Avx.LoadVector256(s + (8 * 3));
+            Avx.Store(d + (stride * 3), v);
+
+            v = Avx.LoadVector256(s + (8 * 4));
+            Avx.Store(d + (stride * 4), v);
+
+            v = Avx.LoadVector256(s + (8 * 5));
+            Avx.Store(d + (stride * 5), v);
+
+            v = Avx.LoadVector256(s + (8 * 6));
+            Avx.Store(d + (stride * 6), v);
+
+            v = Avx.LoadVector256(s + (8 * 7));
+            Avx.Store(d + (stride * 7), v);
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant2()
+        {
+            int stride = Width;
+            float* d = this.bufferPtr;
+            float* s = this.blockPtr;
+
+            Vector256<float> v0 = Avx.LoadVector256(s);
+            Vector256<float> v1 = Avx.LoadVector256(s + 8);
+            Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2));
+            Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3));
+            Vector256<float> v4 = Avx.LoadVector256(s + (8 * 4));
+            Vector256<float> v5 = Avx.LoadVector256(s + (8 * 5));
+            Vector256<float> v6 = Avx.LoadVector256(s + (8 * 6));
+            Vector256<float> v7 = Avx.LoadVector256(s + (8 * 7));
+
+            Avx.Store(d, v0);
+            Avx.Store(d + stride, v1);
+            Avx.Store(d + (stride * 2), v2);
+            Avx.Store(d + (stride * 3), v3);
+            Avx.Store(d + (stride * 4), v4);
+            Avx.Store(d + (stride * 5), v5);
+            Avx.Store(d + (stride * 6), v6);
+            Avx.Store(d + (stride * 7), v7);
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant3()
+        {
+            int stride = Width;
+            float* d = this.bufferPtr;
+            float* s = this.blockPtr;
+
+            Vector256<float> v0 = Avx.LoadVector256(s);
+            Vector256<float> v1 = Avx.LoadVector256(s + 8);
+            Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2));
+            Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3));
+            Avx.Store(d, v0);
+            Avx.Store(d + stride, v1);
+            Avx.Store(d + (stride * 2), v2);
+            Avx.Store(d + (stride * 3), v3);
+
+            v0 = Avx.LoadVector256(s + (8 * 4));
+            v1 = Avx.LoadVector256(s + (8 * 5));
+            v2 = Avx.LoadVector256(s + (8 * 6));
+            v3 = Avx.LoadVector256(s + (8 * 7));
+            Avx.Store(d + (stride * 4), v0);
+            Avx.Store(d + (stride * 5), v1);
+            Avx.Store(d + (stride * 6), v2);
+            Avx.Store(d + (stride * 7), v3);
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant3_RefCast()
+        {
+            int stride = Width;
+            ref float d = ref this.unpinnedBuffer[0];
+            ref Vector256<float> s = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this.block);
+
+            Vector256<float> v0 = s;
+            Vector256<float> v1 = Unsafe.Add(ref s, 1);
+            Vector256<float> v2 = Unsafe.Add(ref s, 2);
+            Vector256<float> v3 = Unsafe.Add(ref s, 3);
+
+            Unsafe.As<float, Vector256<float>>(ref d) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride)) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 2)) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 3)) = v3;
+
+            v0 = Unsafe.Add(ref s, 4);
+            v1 = Unsafe.Add(ref s, 5);
+            v2 = Unsafe.Add(ref s, 6);
+            v3 = Unsafe.Add(ref s, 7);
+
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 4)) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 5)) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 6)) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 7)) = v3;
+        }
+
+        [Benchmark]
+        public void UseVector256_Avx2_Variant3_RefCast_Mod()
+        {
+            int stride = Width * sizeof(float);
+            ref float d = ref this.unpinnedBuffer[0];
+            ref Vector256<float> s = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this.block);
+
+            Vector256<float> v0 = s;
+            Vector256<float> v1 = Unsafe.Add(ref s, 1);
+            Vector256<float> v2 = Unsafe.Add(ref s, 2);
+            Vector256<float> v3 = Unsafe.Add(ref s, 3);
+
+            Unsafe.As<float, Vector256<float>>(ref d) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)stride)) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 2))) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 3))) = v3;
+
+            v0 = Unsafe.Add(ref s, 4);
+            v1 = Unsafe.Add(ref s, 5);
+            v2 = Unsafe.Add(ref s, 6);
+            v3 = Unsafe.Add(ref s, 7);
+
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 4))) = v0;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 5))) = v1;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 6))) = v2;
+            Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 7))) = v3;
+        }
+
+        // [Benchmark]
+        public void UseVector256_Avx2_Variant3_WithLocalPinning()
+        {
+            int stride = Width;
+            fixed (float* d = this.unpinnedBuffer)
+            fixed (Block8x8F* ss = &this.block)
+            {
+                var s = (float*)ss;
+                Vector256<float> v0 = Avx.LoadVector256(s);
+                Vector256<float> v1 = Avx.LoadVector256(s + 8);
+                Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2));
+                Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3));
+                Avx.Store(d, v0);
+                Avx.Store(d + stride, v1);
+                Avx.Store(d + (stride * 2), v2);
+                Avx.Store(d + (stride * 3), v3);
+
+                v0 = Avx.LoadVector256(s + (8 * 4));
+                v1 = Avx.LoadVector256(s + (8 * 5));
+                v2 = Avx.LoadVector256(s + (8 * 6));
+                v3 = Avx.LoadVector256(s + (8 * 7));
+                Avx.Store(d + (stride * 4), v0);
+                Avx.Store(d + (stride * 5), v1);
+                Avx.Store(d + (stride * 6), v2);
+                Avx.Store(d + (stride * 7), v3);
+            }
+        }
+
+        // [Benchmark]
+        public void UseVector256_Avx2_Variant3_sbyte()
+        {
+            int stride = Width * 4;
+            var d = (sbyte*)this.bufferPtr;
+            var s = (sbyte*)this.blockPtr;
+
+            Vector256<sbyte> v0 = Avx.LoadVector256(s);
+            Vector256<sbyte> v1 = Avx.LoadVector256(s + 32);
+            Vector256<sbyte> v2 = Avx.LoadVector256(s + (32 * 2));
+            Vector256<sbyte> v3 = Avx.LoadVector256(s + (32 * 3));
+            Avx.Store(d, v0);
+            Avx.Store(d + stride, v1);
+            Avx.Store(d + (stride * 2), v2);
+            Avx.Store(d + (stride * 3), v3);
+
+            v0 = Avx.LoadVector256(s + (32 * 4));
+            v1 = Avx.LoadVector256(s + (32 * 5));
+            v2 = Avx.LoadVector256(s + (32 * 6));
+            v3 = Avx.LoadVector256(s + (32 * 7));
+            Avx.Store(d + (stride * 4), v0);
+            Avx.Store(d + (stride * 5), v1);
+            Avx.Store(d + (stride * 6), v2);
+            Avx.Store(d + (stride * 7), v3);
+        }
+#endif
+
+        // *** RESULTS 02/2020 ***
+        // BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
+        // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+        // .NET Core SDK=3.1.200-preview-014971
+        //   [Host]     : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
+        //   DefaultJob : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
         //
-        //        Method |     Mean |     Error |    StdDev | Scaled |
-        // -------------- |---------:|----------:|----------:|-------:|
-        //      Original | 22.53 ns | 0.1660 ns | 0.1553 ns |   1.00 |
-        //    UseVector8 | 21.59 ns | 0.3079 ns | 0.2571 ns |   0.96 |
-        // UseVector8_V2 | 22.57 ns | 0.1699 ns | 0.1506 ns |   1.00 |
-        //
-        // Conclusion:
-        // Doesn't worth to bother with this
+        // |                                 Method |     Mean |     Error |    StdDev | Ratio | RatioSD |
+        // |--------------------------------------- |---------:|----------:|----------:|------:|--------:|
+        // |                               Original | 4.012 ns | 0.0567 ns | 0.0531 ns |  1.00 |    0.00 |
+        // |                          UseVector8_V3 | 4.013 ns | 0.0947 ns | 0.0840 ns |  1.00 |    0.03 |
+        // |             UseVector256_Avx2_Variant1 | 2.546 ns | 0.0376 ns | 0.0314 ns |  0.63 |    0.01 |
+        // |             UseVector256_Avx2_Variant2 | 2.643 ns | 0.0162 ns | 0.0151 ns |  0.66 |    0.01 |
+        // |             UseVector256_Avx2_Variant3 | 2.520 ns | 0.0760 ns | 0.0813 ns |  0.63 |    0.02 |
+        // |     UseVector256_Avx2_Variant3_RefCast | 2.300 ns | 0.0877 ns | 0.0938 ns |  0.58 |    0.03 |
+        // | UseVector256_Avx2_Variant3_RefCast_Mod | 2.139 ns | 0.0698 ns | 0.0686 ns |  0.53 |    0.02 |
     }
 }
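A note for readers, outside the patch itself: the table singles out UseVector256_Avx2_Variant3_RefCast_Mod as the fastest variant. The sketch below distills its access pattern into a standalone helper so it can be studied in isolation. This is a minimal sketch, not code from the PR; the class and method names are illustrative, and it assumes an x64 .NET Core 3.x runtime where Vector256<float> maps to a 32-byte AVX register.

using System;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;

internal static class BlockCopySketch
{
    // Copies an 8x8 block (64 contiguous floats starting at 'src') into a
    // strided destination. Each Vector256<float> covers one 8-float row;
    // Unsafe.Add steps the source by whole vectors (32 bytes per step),
    // while Unsafe.AddByteOffset steps the destination by a raw byte
    // stride, mirroring the "_RefCast_Mod" destination addressing.
    public static void CopyBlockTo(ref float src, ref float dest, int destStrideBytes)
    {
        ref Vector256<float> s = ref Unsafe.As<float, Vector256<float>>(ref src);

        for (int row = 0; row < 8; row++)
        {
            Unsafe.As<float, Vector256<float>>(
                ref Unsafe.AddByteOffset(ref dest, (IntPtr)(row * destStrideBytes)))
                = Unsafe.Add(ref s, row);
        }
    }
}

Copying a block to position (x, y) of a Width-wide float buffer would then look like CopyBlockTo(ref block[0], ref buffer[(y * Width) + x], Width * sizeof(float)). Unlike this loop, the benchmarked variants unroll all eight rows by hand, which encourages RyuJIT to emit straight-line vector loads and stores.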
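The results table was produced by BenchmarkDotNet. To reproduce it after applying the patch, a plain BenchmarkDotNet entry point is enough; this is a generic sketch (the benchmark project's own entry point may differ), using only the standard BenchmarkRunner API:

using BenchmarkDotNet.Running;
using SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations;

internal static class Program
{
    private static void Main()
    {
        // Runs every method of the class currently marked [Benchmark]
        // and prints a summary table like the one above.
        BenchmarkRunner.Run<Block8x8F_CopyTo1x1>();
    }
}

Only methods whose [Benchmark] attribute is not commented out appear in the table, which is why UseVector8, UseVector8_V2, the local-pinning variant, and the sbyte variant are absent from the results.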