// Copyright (c) Six Labors. // Licensed under the Six Labors Split License. using System.Buffers; using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; using System.Runtime.Intrinsics.X86; using BenchmarkDotNet.Attributes; using SixLabors.ImageSharp.Memory; using SixLabors.ImageSharp.PixelFormats; // ReSharper disable InconsistentNaming namespace SixLabors.ImageSharp.Benchmarks.Bulk; [Config(typeof(Config.ShortCore31))] public abstract class FromVector4 where TPixel : unmanaged, IPixel { protected IMemoryOwner source; protected IMemoryOwner destination; protected Configuration Configuration => Configuration.Default; // [Params(64, 2048)] [Params(64, 256, 2048)] public int Count { get; set; } [GlobalSetup] public void Setup() { this.destination = this.Configuration.MemoryAllocator.Allocate(this.Count); this.source = this.Configuration.MemoryAllocator.Allocate(this.Count); } [GlobalCleanup] public void Cleanup() { this.destination.Dispose(); this.source.Dispose(); } // [Benchmark] public void PerElement() { ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan()); ref TPixel d = ref MemoryMarshal.GetReference(this.destination.GetSpan()); for (nuint i = 0; i < (uint)this.Count; i++) { Unsafe.Add(ref d, i).FromVector4(Unsafe.Add(ref s, i)); } } [Benchmark(Baseline = true)] public void PixelOperations_Base() { new PixelOperations().FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan()); } [Benchmark] public void PixelOperations_Specialized() { PixelOperations.Instance.FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan()); } } public class FromVector4Rgba32 : FromVector4 { [Benchmark] public void FallbackIntrinsics128() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); SimdUtils.FallbackIntrinsics128.NormalizedFloatToByteSaturate(sBytes, dFloats); } [Benchmark] public void ExtendedIntrinsic() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); SimdUtils.ExtendedIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); } [Benchmark] public void UseHwIntrinsics() { Span sBytes = MemoryMarshal.Cast(this.source.GetSpan()); Span dFloats = MemoryMarshal.Cast(this.destination.GetSpan()); SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sBytes, dFloats); } private static ReadOnlySpan PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 }; [Benchmark] public void UseAvx2_Grouped() { Span src = MemoryMarshal.Cast(this.source.GetSpan()); Span dest = MemoryMarshal.Cast(this.destination.GetSpan()); nuint n = (uint)dest.Length / (uint)Vector.Count; ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(src)); ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); Vector256 mask = Unsafe.As>(ref maskBase); var maxBytes = Vector256.Create(255f); for (nuint i = 0; i < n; i++) { ref Vector256 s = ref Unsafe.Add(ref sourceBase, i * 4); Vector256 f0 = s; Vector256 f1 = Unsafe.Add(ref s, 1); Vector256 f2 = Unsafe.Add(ref s, 2); Vector256 f3 = Unsafe.Add(ref s, 3); f0 = Avx.Multiply(maxBytes, f0); f1 = Avx.Multiply(maxBytes, f1); f2 = Avx.Multiply(maxBytes, f2); f3 = Avx.Multiply(maxBytes, f3); Vector256 w0 = Avx.ConvertToVector256Int32(f0); Vector256 w1 = Avx.ConvertToVector256Int32(f1); Vector256 w2 = Avx.ConvertToVector256Int32(f2); Vector256 w3 = Avx.ConvertToVector256Int32(f3); Vector256 u0 = Avx2.PackSignedSaturate(w0, w1); Vector256 u1 = Avx2.PackSignedSaturate(w2, w3); Vector256 b = Avx2.PackUnsignedSaturate(u0, u1); b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte(); Unsafe.Add(ref destBase, i) = b; } } [MethodImpl(MethodImplOptions.AggressiveInlining)] private static Vector256 ConvertToInt32(Vector256 vf, Vector256 scale) { vf = Avx.Multiply(scale, vf); return Avx.ConvertToVector256Int32(vf); } // *** RESULTS 2020 March: *** // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores // .NET Core SDK=3.1.200-preview-014971 // Job-IUZXZT : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT // // | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | // |---------------------------- |------ |-----------:|------------:|----------:|------:|--------:|------:|------:|------:|----------:| // | FallbackIntrinsics128 | 1024 | 2,952.6 ns | 1,680.77 ns | 92.13 ns | 3.32 | 0.16 | - | - | - | - | // | BasicIntrinsics256 | 1024 | 1,664.5 ns | 928.11 ns | 50.87 ns | 1.87 | 0.09 | - | - | - | - | // | ExtendedIntrinsic | 1024 | 890.6 ns | 375.48 ns | 20.58 ns | 1.00 | 0.00 | - | - | - | - | // | UseAvx2 | 1024 | 299.0 ns | 30.47 ns | 1.67 ns | 0.34 | 0.01 | - | - | - | - | // | UseAvx2_Grouped | 1024 | 318.1 ns | 48.19 ns | 2.64 ns | 0.36 | 0.01 | - | - | - | - | // | PixelOperations_Base | 1024 | 8,136.9 ns | 1,834.82 ns | 100.57 ns | 9.14 | 0.26 | - | - | - | 24 B | // | PixelOperations_Specialized | 1024 | 951.1 ns | 123.93 ns | 6.79 ns | 1.07 | 0.03 | - | - | - | - | }