|
|
@ -4,7 +4,9 @@ |
|
|
using System; |
|
|
using System; |
|
|
using System.Numerics; |
|
|
using System.Numerics; |
|
|
using System.Runtime.CompilerServices; |
|
|
using System.Runtime.CompilerServices; |
|
|
|
|
|
using System.Runtime.InteropServices; |
|
|
|
|
|
using System.Runtime.Intrinsics; |
|
|
|
|
|
using System.Runtime.Intrinsics.X86; |
|
|
using BenchmarkDotNet.Attributes; |
|
|
using BenchmarkDotNet.Attributes; |
|
|
|
|
|
|
|
|
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|
|
using SixLabors.ImageSharp.Formats.Jpeg.Components; |
|
|
@ -13,13 +15,19 @@ using SixLabors.ImageSharp.Memory; |
|
|
// ReSharper disable InconsistentNaming
|
|
|
// ReSharper disable InconsistentNaming
|
|
|
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|
|
namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|
|
{ |
|
|
{ |
|
|
public class Block8x8F_CopyTo1x1 |
|
|
public unsafe class Block8x8F_CopyTo1x1 |
|
|
{ |
|
|
{ |
|
|
private Block8x8F block; |
|
|
private Block8x8F block; |
|
|
|
|
|
private readonly Block8x8F[] blockArray = new Block8x8F[1]; |
|
|
|
|
|
|
|
|
private Buffer2D<float> buffer; |
|
|
private static readonly int Width = 100; |
|
|
|
|
|
|
|
|
private BufferArea<float> destArea; |
|
|
private float[] buffer = new float[Width * 500]; |
|
|
|
|
|
private readonly float[] unpinnedBuffer = new float[Width * 500]; |
|
|
|
|
|
private GCHandle bufferHandle; |
|
|
|
|
|
private GCHandle blockHandle; |
|
|
|
|
|
private float* bufferPtr; |
|
|
|
|
|
private float* blockPtr; |
|
|
|
|
|
|
|
|
[GlobalSetup] |
|
|
[GlobalSetup] |
|
|
public void Setup() |
|
|
public void Setup() |
|
|
@ -29,16 +37,30 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|
|
throw new InvalidOperationException("Benchmark Block8x8F_CopyTo1x1 is invalid on platforms without AVX2 support."); |
|
|
throw new InvalidOperationException("Benchmark Block8x8F_CopyTo1x1 is invalid on platforms without AVX2 support."); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
this.buffer = Configuration.Default.MemoryAllocator.Allocate2D<float>(1000, 500); |
|
|
this.bufferHandle = GCHandle.Alloc(this.buffer, GCHandleType.Pinned); |
|
|
this.destArea = this.buffer.GetArea(200, 100, 64, 64); |
|
|
this.bufferPtr = (float*)this.bufferHandle.AddrOfPinnedObject(); |
|
|
|
|
|
|
|
|
|
|
|
// Pin self so we can take address of to the block:
|
|
|
|
|
|
this.blockHandle = GCHandle.Alloc(this.blockArray, GCHandleType.Pinned); |
|
|
|
|
|
this.blockPtr = (float*)Unsafe.AsPointer(ref this.block); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
[GlobalCleanup] |
|
|
|
|
|
public void Cleanup() |
|
|
|
|
|
{ |
|
|
|
|
|
this.bufferPtr = null; |
|
|
|
|
|
this.blockPtr = null; |
|
|
|
|
|
this.bufferHandle.Free(); |
|
|
|
|
|
this.blockHandle.Free(); |
|
|
|
|
|
this.buffer = null; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
[Benchmark(Baseline = true)] |
|
|
[Benchmark(Baseline = true)] |
|
|
public void Original() |
|
|
public void Original() |
|
|
{ |
|
|
{ |
|
|
ref byte selfBase = ref Unsafe.As<Block8x8F, byte>(ref this.block); |
|
|
ref byte selfBase = ref Unsafe.As<Block8x8F, byte>(ref this.block); |
|
|
ref byte destBase = ref Unsafe.As<float, byte>(ref this.destArea.GetReferenceToOrigin()); |
|
|
ref byte destBase = ref Unsafe.AsRef<byte>(this.bufferPtr); |
|
|
int destStride = this.destArea.Stride * sizeof(float); |
|
|
int destStride = Width * sizeof(float); |
|
|
|
|
|
|
|
|
CopyRowImpl(ref selfBase, ref destBase, destStride, 0); |
|
|
CopyRowImpl(ref selfBase, ref destBase, destStride, 0); |
|
|
CopyRowImpl(ref selfBase, ref destBase, destStride, 1); |
|
|
CopyRowImpl(ref selfBase, ref destBase, destStride, 1); |
|
|
@ -58,12 +80,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|
|
Unsafe.CopyBlock(ref d, ref s, 8 * sizeof(float)); |
|
|
Unsafe.CopyBlock(ref d, ref s, 8 * sizeof(float)); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
[Benchmark] |
|
|
// [Benchmark]
|
|
|
public void UseVector8() |
|
|
public void UseVector8() |
|
|
{ |
|
|
{ |
|
|
ref Block8x8F s = ref this.block; |
|
|
ref Block8x8F s = ref this.block; |
|
|
ref float origin = ref this.destArea.GetReferenceToOrigin(); |
|
|
ref float origin = ref Unsafe.AsRef<float>(this.bufferPtr); |
|
|
int stride = this.destArea.Stride; |
|
|
int stride = Width; |
|
|
|
|
|
|
|
|
ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin); |
|
|
ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin); |
|
|
ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride)); |
|
|
ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride)); |
|
|
@ -93,12 +115,12 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|
|
d7 = row7; |
|
|
d7 = row7; |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
[Benchmark] |
|
|
// [Benchmark]
|
|
|
public void UseVector8_V2() |
|
|
public void UseVector8_V2() |
|
|
{ |
|
|
{ |
|
|
ref Block8x8F s = ref this.block; |
|
|
ref Block8x8F s = ref this.block; |
|
|
ref float origin = ref this.destArea.GetReferenceToOrigin(); |
|
|
ref float origin = ref Unsafe.AsRef<float>(this.bufferPtr); |
|
|
int stride = this.destArea.Stride; |
|
|
int stride = Width; |
|
|
|
|
|
|
|
|
ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin); |
|
|
ref Vector<float> d0 = ref Unsafe.As<float, Vector<float>>(ref origin); |
|
|
ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride)); |
|
|
ref Vector<float> d1 = ref Unsafe.As<float, Vector<float>>(ref Unsafe.Add(ref origin, stride)); |
|
|
@ -119,15 +141,247 @@ namespace SixLabors.ImageSharp.Benchmarks.Codecs.Jpeg.BlockOperations |
|
|
d7 = Unsafe.As<Vector4, Vector<float>>(ref s.V7L); |
|
|
d7 = Unsafe.As<Vector4, Vector<float>>(ref s.V7L); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// RESULTS:
|
|
|
[Benchmark] |
|
|
|
|
|
public void UseVector8_V3() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width * sizeof(float); |
|
|
|
|
|
ref float d = ref this.unpinnedBuffer[0]; |
|
|
|
|
|
ref Vector<float> s = ref Unsafe.As<Block8x8F, Vector<float>>(ref this.block); |
|
|
|
|
|
|
|
|
|
|
|
Vector<float> v0 = s; |
|
|
|
|
|
Vector<float> v1 = Unsafe.AddByteOffset(ref s, (IntPtr)1); |
|
|
|
|
|
Vector<float> v2 = Unsafe.AddByteOffset(ref s, (IntPtr)2); |
|
|
|
|
|
Vector<float> v3 = Unsafe.AddByteOffset(ref s, (IntPtr)3); |
|
|
|
|
|
|
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref d) = v0; |
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)stride)) = v1; |
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 2))) = v2; |
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 3))) = v3; |
|
|
|
|
|
|
|
|
|
|
|
v0 = Unsafe.AddByteOffset(ref s, (IntPtr)4); |
|
|
|
|
|
v1 = Unsafe.AddByteOffset(ref s, (IntPtr)5); |
|
|
|
|
|
v2 = Unsafe.AddByteOffset(ref s, (IntPtr)6); |
|
|
|
|
|
v3 = Unsafe.AddByteOffset(ref s, (IntPtr)7); |
|
|
|
|
|
|
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 4))) = v0; |
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 5))) = v1; |
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 6))) = v2; |
|
|
|
|
|
Unsafe.As<float, Vector<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 7))) = v3; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
#if SUPPORTS_RUNTIME_INTRINSICS
|
|
|
|
|
|
[Benchmark] |
|
|
|
|
|
public void UseVector256_Avx2_Variant1() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width; |
|
|
|
|
|
float* d = this.bufferPtr; |
|
|
|
|
|
float* s = this.blockPtr; |
|
|
|
|
|
Vector256<float> v; |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s); |
|
|
|
|
|
Avx.Store(d, v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + 8); |
|
|
|
|
|
Avx.Store(d + stride, v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + (8 * 2)); |
|
|
|
|
|
Avx.Store(d + (stride * 2), v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + (8 * 3)); |
|
|
|
|
|
Avx.Store(d + (stride * 3), v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + (8 * 4)); |
|
|
|
|
|
Avx.Store(d + (stride * 4), v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + (8 * 5)); |
|
|
|
|
|
Avx.Store(d + (stride * 5), v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + (8 * 6)); |
|
|
|
|
|
Avx.Store(d + (stride * 6), v); |
|
|
|
|
|
|
|
|
|
|
|
v = Avx.LoadVector256(s + (8 * 7)); |
|
|
|
|
|
Avx.Store(d + (stride * 7), v); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
[Benchmark] |
|
|
|
|
|
public void UseVector256_Avx2_Variant2() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width; |
|
|
|
|
|
float* d = this.bufferPtr; |
|
|
|
|
|
float* s = this.blockPtr; |
|
|
|
|
|
|
|
|
|
|
|
Vector256<float> v0 = Avx.LoadVector256(s); |
|
|
|
|
|
Vector256<float> v1 = Avx.LoadVector256(s + 8); |
|
|
|
|
|
Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2)); |
|
|
|
|
|
Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3)); |
|
|
|
|
|
Vector256<float> v4 = Avx.LoadVector256(s + (8 * 4)); |
|
|
|
|
|
Vector256<float> v5 = Avx.LoadVector256(s + (8 * 5)); |
|
|
|
|
|
Vector256<float> v6 = Avx.LoadVector256(s + (8 * 6)); |
|
|
|
|
|
Vector256<float> v7 = Avx.LoadVector256(s + (8 * 7)); |
|
|
|
|
|
|
|
|
|
|
|
Avx.Store(d, v0); |
|
|
|
|
|
Avx.Store(d + stride, v1); |
|
|
|
|
|
Avx.Store(d + (stride * 2), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 3), v3); |
|
|
|
|
|
Avx.Store(d + (stride * 4), v4); |
|
|
|
|
|
Avx.Store(d + (stride * 5), v5); |
|
|
|
|
|
Avx.Store(d + (stride * 6), v6); |
|
|
|
|
|
Avx.Store(d + (stride * 7), v7); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
[Benchmark] |
|
|
|
|
|
public void UseVector256_Avx2_Variant3() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width; |
|
|
|
|
|
float* d = this.bufferPtr; |
|
|
|
|
|
float* s = this.blockPtr; |
|
|
|
|
|
|
|
|
|
|
|
Vector256<float> v0 = Avx.LoadVector256(s); |
|
|
|
|
|
Vector256<float> v1 = Avx.LoadVector256(s + 8); |
|
|
|
|
|
Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2)); |
|
|
|
|
|
Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3)); |
|
|
|
|
|
Avx.Store(d, v0); |
|
|
|
|
|
Avx.Store(d + stride, v1); |
|
|
|
|
|
Avx.Store(d + (stride * 2), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 3), v3); |
|
|
|
|
|
|
|
|
|
|
|
v0 = Avx.LoadVector256(s + (8 * 4)); |
|
|
|
|
|
v1 = Avx.LoadVector256(s + (8 * 5)); |
|
|
|
|
|
v2 = Avx.LoadVector256(s + (8 * 6)); |
|
|
|
|
|
v3 = Avx.LoadVector256(s + (8 * 7)); |
|
|
|
|
|
Avx.Store(d + (stride * 4), v0); |
|
|
|
|
|
Avx.Store(d + (stride * 5), v1); |
|
|
|
|
|
Avx.Store(d + (stride * 6), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 7), v3); |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
[Benchmark] |
|
|
|
|
|
public void UseVector256_Avx2_Variant3_RefCast() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width; |
|
|
|
|
|
ref float d = ref this.unpinnedBuffer[0]; |
|
|
|
|
|
ref Vector256<float> s = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this.block); |
|
|
|
|
|
|
|
|
|
|
|
Vector256<float> v0 = s; |
|
|
|
|
|
Vector256<float> v1 = Unsafe.Add(ref s, 1); |
|
|
|
|
|
Vector256<float> v2 = Unsafe.Add(ref s, 2); |
|
|
|
|
|
Vector256<float> v3 = Unsafe.Add(ref s, 3); |
|
|
|
|
|
|
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref d) = v0; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride)) = v1; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 2)) = v2; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 3)) = v3; |
|
|
|
|
|
|
|
|
|
|
|
v0 = Unsafe.Add(ref s, 4); |
|
|
|
|
|
v1 = Unsafe.Add(ref s, 5); |
|
|
|
|
|
v2 = Unsafe.Add(ref s, 6); |
|
|
|
|
|
v3 = Unsafe.Add(ref s, 7); |
|
|
|
|
|
|
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 4)) = v0; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 5)) = v1; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 6)) = v2; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.Add(ref d, stride * 7)) = v3; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
[Benchmark] |
|
|
|
|
|
public void UseVector256_Avx2_Variant3_RefCast_Mod() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width * sizeof(float); |
|
|
|
|
|
ref float d = ref this.unpinnedBuffer[0]; |
|
|
|
|
|
ref Vector256<float> s = ref Unsafe.As<Block8x8F, Vector256<float>>(ref this.block); |
|
|
|
|
|
|
|
|
|
|
|
Vector256<float> v0 = s; |
|
|
|
|
|
Vector256<float> v1 = Unsafe.AddByteOffset(ref s, (IntPtr)1); |
|
|
|
|
|
Vector256<float> v2 = Unsafe.AddByteOffset(ref s, (IntPtr)2); |
|
|
|
|
|
Vector256<float> v3 = Unsafe.AddByteOffset(ref s, (IntPtr)3); |
|
|
|
|
|
|
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref d) = v0; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)stride)) = v1; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 2))) = v2; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 3))) = v3; |
|
|
|
|
|
|
|
|
|
|
|
v0 = Unsafe.AddByteOffset(ref s, (IntPtr)4); |
|
|
|
|
|
v1 = Unsafe.AddByteOffset(ref s, (IntPtr)5); |
|
|
|
|
|
v2 = Unsafe.AddByteOffset(ref s, (IntPtr)6); |
|
|
|
|
|
v3 = Unsafe.AddByteOffset(ref s, (IntPtr)7); |
|
|
|
|
|
|
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 4))) = v0; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 5))) = v1; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 6))) = v2; |
|
|
|
|
|
Unsafe.As<float, Vector256<float>>(ref Unsafe.AddByteOffset(ref d, (IntPtr)(stride * 7))) = v3; |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// [Benchmark]
|
|
|
|
|
|
public void UseVector256_Avx2_Variant3_WithLocalPinning() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width; |
|
|
|
|
|
fixed (float* d = this.unpinnedBuffer) |
|
|
|
|
|
fixed (Block8x8F* ss = &this.block) |
|
|
|
|
|
{ |
|
|
|
|
|
var s = (float*)ss; |
|
|
|
|
|
Vector256<float> v0 = Avx.LoadVector256(s); |
|
|
|
|
|
Vector256<float> v1 = Avx.LoadVector256(s + 8); |
|
|
|
|
|
Vector256<float> v2 = Avx.LoadVector256(s + (8 * 2)); |
|
|
|
|
|
Vector256<float> v3 = Avx.LoadVector256(s + (8 * 3)); |
|
|
|
|
|
Avx.Store(d, v0); |
|
|
|
|
|
Avx.Store(d + stride, v1); |
|
|
|
|
|
Avx.Store(d + (stride * 2), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 3), v3); |
|
|
|
|
|
|
|
|
|
|
|
v0 = Avx.LoadVector256(s + (8 * 4)); |
|
|
|
|
|
v1 = Avx.LoadVector256(s + (8 * 5)); |
|
|
|
|
|
v2 = Avx.LoadVector256(s + (8 * 6)); |
|
|
|
|
|
v3 = Avx.LoadVector256(s + (8 * 7)); |
|
|
|
|
|
Avx.Store(d + (stride * 4), v0); |
|
|
|
|
|
Avx.Store(d + (stride * 5), v1); |
|
|
|
|
|
Avx.Store(d + (stride * 6), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 7), v3); |
|
|
|
|
|
} |
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// [Benchmark]
|
|
|
|
|
|
public void UseVector256_Avx2_Variant3_sbyte() |
|
|
|
|
|
{ |
|
|
|
|
|
int stride = Width * 4; |
|
|
|
|
|
var d = (sbyte*)this.bufferPtr; |
|
|
|
|
|
var s = (sbyte*)this.blockPtr; |
|
|
|
|
|
|
|
|
|
|
|
Vector256<sbyte> v0 = Avx.LoadVector256(s); |
|
|
|
|
|
Vector256<sbyte> v1 = Avx.LoadVector256(s + 32); |
|
|
|
|
|
Vector256<sbyte> v2 = Avx.LoadVector256(s + (32 * 2)); |
|
|
|
|
|
Vector256<sbyte> v3 = Avx.LoadVector256(s + (32 * 3)); |
|
|
|
|
|
Avx.Store(d, v0); |
|
|
|
|
|
Avx.Store(d + stride, v1); |
|
|
|
|
|
Avx.Store(d + (stride * 2), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 3), v3); |
|
|
|
|
|
|
|
|
|
|
|
v0 = Avx.LoadVector256(s + (32 * 4)); |
|
|
|
|
|
v1 = Avx.LoadVector256(s + (32 * 5)); |
|
|
|
|
|
v2 = Avx.LoadVector256(s + (32 * 6)); |
|
|
|
|
|
v3 = Avx.LoadVector256(s + (32 * 7)); |
|
|
|
|
|
Avx.Store(d + (stride * 4), v0); |
|
|
|
|
|
Avx.Store(d + (stride * 5), v1); |
|
|
|
|
|
Avx.Store(d + (stride * 6), v2); |
|
|
|
|
|
Avx.Store(d + (stride * 7), v3); |
|
|
|
|
|
} |
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
// *** RESULTS 02/2020 ***
|
|
|
|
|
|
// BenchmarkDotNet=v0.12.0, OS=Windows 10.0.18363
|
|
|
|
|
|
// Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|
|
|
|
|
// .NET Core SDK=3.1.200-preview-014971
|
|
|
|
|
|
// [Host] : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
|
|
|
|
|
|
// DefaultJob : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
|
|
|
//
|
|
|
//
|
|
|
// Method | Mean | Error | StdDev | Scaled |
|
|
|
|
|
|
// -------------- |---------:|----------:|----------:|-------:|
|
|
|
|
|
|
// Original | 22.53 ns | 0.1660 ns | 0.1553 ns | 1.00 |
|
|
|
|
|
|
// UseVector8 | 21.59 ns | 0.3079 ns | 0.2571 ns | 0.96 |
|
|
|
|
|
|
// UseVector8_V2 | 22.57 ns | 0.1699 ns | 0.1506 ns | 1.00 |
|
|
|
|
|
|
//
|
|
|
//
|
|
|
// Conclusion:
|
|
|
// | Method | Mean | Error | StdDev | Ratio | RatioSD |
|
|
|
// Doesn't worth to bother with this
|
|
|
// |--------------------------------------- |---------:|----------:|----------:|------:|--------:|
|
|
|
|
|
|
// | Original | 4.012 ns | 0.0567 ns | 0.0531 ns | 1.00 | 0.00 |
|
|
|
|
|
|
// | UseVector8_V3 | 4.013 ns | 0.0947 ns | 0.0840 ns | 1.00 | 0.03 |
|
|
|
|
|
|
// | UseVector256_Avx2_Variant1 | 2.546 ns | 0.0376 ns | 0.0314 ns | 0.63 | 0.01 |
|
|
|
|
|
|
// | UseVector256_Avx2_Variant2 | 2.643 ns | 0.0162 ns | 0.0151 ns | 0.66 | 0.01 |
|
|
|
|
|
|
// | UseVector256_Avx2_Variant3 | 2.520 ns | 0.0760 ns | 0.0813 ns | 0.63 | 0.02 |
|
|
|
|
|
|
// | UseVector256_Avx2_Variant3_RefCast | 2.300 ns | 0.0877 ns | 0.0938 ns | 0.58 | 0.03 |
|
|
|
|
|
|
// | UseVector256_Avx2_Variant3_RefCast_Mod | 2.139 ns | 0.0698 ns | 0.0686 ns | 0.53 | 0.02 |
|
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|