mirror of https://github.com/SixLabors/ImageSharp
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
165 lines
6.7 KiB
165 lines
6.7 KiB
// Copyright (c) Six Labors.
|
|
// Licensed under the Six Labors Split License.
|
|
|
|
using System.Buffers;
|
|
using System.Numerics;
|
|
using System.Runtime.CompilerServices;
|
|
using System.Runtime.InteropServices;
|
|
using System.Runtime.Intrinsics;
|
|
using System.Runtime.Intrinsics.X86;
|
|
using BenchmarkDotNet.Attributes;
|
|
using SixLabors.ImageSharp.Memory;
|
|
using SixLabors.ImageSharp.PixelFormats;
|
|
|
|
// ReSharper disable InconsistentNaming
|
|
namespace SixLabors.ImageSharp.Benchmarks.Bulk;
|
|
|
|
/// <summary>
/// Base class for benchmarks that convert a buffer of <see cref="Vector4"/> values
/// into a buffer of <typeparamref name="TPixel"/> pixels.
/// </summary>
/// <typeparam name="TPixel">The destination pixel format.</typeparam>
[Config(typeof(Config.ShortCore31))]
public abstract class FromVector4<TPixel>
    where TPixel : unmanaged, IPixel<TPixel>
{
    /// <summary>
    /// Input buffer holding <see cref="Count"/> vectors. Allocated and filled in <see cref="Setup"/>.
    /// </summary>
    protected IMemoryOwner<Vector4> source;

    /// <summary>
    /// Output buffer holding <see cref="Count"/> pixels. Allocated in <see cref="Setup"/>.
    /// </summary>
    protected IMemoryOwner<TPixel> destination;

    /// <summary>
    /// Gets the configuration used for allocation and conversion; always <see cref="Configuration.Default"/>.
    /// </summary>
    protected Configuration Configuration => Configuration.Default;

    /// <summary>
    /// Gets or sets the number of elements converted per benchmark invocation.
    /// </summary>
    // [Params(64, 2048)]
    [Params(64, 256, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Allocates both buffers and fills the source with deterministic, normalized input.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        this.destination = this.Configuration.MemoryAllocator.Allocate<TPixel>(this.Count);
        this.source = this.Configuration.MemoryAllocator.Allocate<Vector4>(this.Count);

        // The buffer is allocated without requesting cleared memory, so its initial contents
        // must not be relied upon. Write a repeating ramp of values in [0, 1] so every run
        // (and every benchmarked method) converts the same well-defined input.
        Span<Vector4> input = this.source.GetSpan();
        for (int i = 0; i < input.Length; i++)
        {
            float v = (i & 0xFF) / 255F;
            input[i] = new Vector4(v, 1F - v, v, 1F);
        }
    }

    /// <summary>
    /// Releases the buffers allocated by <see cref="Setup"/>.
    /// </summary>
    [GlobalCleanup]
    public void Cleanup()
    {
        // Null-conditional so cleanup does not throw if Setup did not complete.
        this.destination?.Dispose();
        this.source?.Dispose();
    }

    /// <summary>
    /// Converts one element at a time via <c>FromVector4</c> on each destination pixel.
    /// Not currently registered as a benchmark (the attribute is commented out).
    /// </summary>
    // [Benchmark]
    public void PerElement()
    {
        ref Vector4 s = ref MemoryMarshal.GetReference(this.source.GetSpan());
        ref TPixel d = ref MemoryMarshal.GetReference(this.destination.GetSpan());
        for (nuint i = 0; i < (uint)this.Count; i++)
        {
            Unsafe.Add(ref d, i).FromVector4(Unsafe.Add(ref s, i));
        }
    }

    /// <summary>
    /// Baseline: converts through a freshly constructed, non-specialized <see cref="PixelOperations{TPixel}"/>.
    /// </summary>
    [Benchmark(Baseline = true)]
    public void PixelOperations_Base()
    {
        new PixelOperations<TPixel>().FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan());
    }

    /// <summary>
    /// Converts through <see cref="PixelOperations{TPixel}.Instance"/>, i.e. the implementation
    /// registered for <typeparamref name="TPixel"/>.
    /// </summary>
    [Benchmark]
    public void PixelOperations_Specialized()
    {
        PixelOperations<TPixel>.Instance.FromVector4Destructive(this.Configuration, this.source.GetSpan(), this.destination.GetSpan());
    }
}
|
|
|
|
/// <summary>
/// <see cref="Rgba32"/> benchmarks that, in addition to the inherited ones, exercise the
/// individual <c>SimdUtils</c> float-to-byte code paths and a hand-written AVX2 loop.
/// </summary>
public class FromVector4Rgba32 : FromVector4<Rgba32>
{
    /// <summary>
    /// Converts via <c>SimdUtils.FallbackIntrinsics128.NormalizedFloatToByteSaturate</c>.
    /// </summary>
    [Benchmark]
    public void FallbackIntrinsics128()
    {
        Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
        Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

        SimdUtils.FallbackIntrinsics128.NormalizedFloatToByteSaturate(sFloats, dBytes);
    }

    /// <summary>
    /// Converts via <c>SimdUtils.ExtendedIntrinsics.NormalizedFloatToByteSaturate</c>.
    /// </summary>
    [Benchmark]
    public void ExtendedIntrinsic()
    {
        Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
        Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

        SimdUtils.ExtendedIntrinsics.NormalizedFloatToByteSaturate(sFloats, dBytes);
    }

    /// <summary>
    /// Converts via <c>SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate</c>.
    /// </summary>
    [Benchmark]
    public void UseHwIntrinsics()
    {
        Span<float> sFloats = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
        Span<byte> dBytes = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

        SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(sFloats, dBytes);
    }

    /// <summary>
    /// Gets eight little-endian 32-bit indices (0, 4, 1, 5, 2, 6, 3, 7) used as the control
    /// operand of <see cref="Avx2.PermuteVar8x32(Vector256{int}, Vector256{int})"/>.
    /// </summary>
    private static ReadOnlySpan<byte> PermuteMaskDeinterleave8x32 => new byte[] { 0, 0, 0, 0, 4, 0, 0, 0, 1, 0, 0, 0, 5, 0, 0, 0, 2, 0, 0, 0, 6, 0, 0, 0, 3, 0, 0, 0, 7, 0, 0, 0 };

    /// <summary>
    /// Hand-written AVX/AVX2 conversion: each iteration loads four <see cref="Vector256{T}"/> of
    /// floats (32 floats), scales them by 255, narrows int32 -> int16 -> byte with saturation and
    /// stores one <see cref="Vector256{T}"/> of bytes (32 bytes).
    /// </summary>
    /// <remarks>
    /// Only whole 32-byte groups are processed; a remainder of <c>dest.Length % 32</c> bytes is left untouched.
    /// The method calls <see cref="Avx"/>/<see cref="Avx2"/> intrinsics without an <c>IsSupported</c> check.
    /// </remarks>
    [Benchmark]
    public void UseAvx2_Grouped()
    {
        Span<float> src = MemoryMarshal.Cast<Vector4, float>(this.source.GetSpan());
        Span<byte> dest = MemoryMarshal.Cast<Rgba32, byte>(this.destination.GetSpan());

        // The loop stores Vector256<byte>, so the iteration count must be derived from
        // Vector256<byte>.Count rather than the hardware-dependent Vector<byte>.Count.
        nuint n = (uint)dest.Length / (uint)Vector256<byte>.Count;

        ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(src));
        ref Vector256<byte> destBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));

        ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32);
        Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase);

        Vector256<float> maxBytes = Vector256.Create(255f);

        for (nuint i = 0; i < n; i++)
        {
            // Four consecutive float vectors feed one output byte vector.
            ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4);

            Vector256<int> w0 = ConvertToInt32(s, maxBytes);
            Vector256<int> w1 = ConvertToInt32(Unsafe.Add(ref s, 1), maxBytes);
            Vector256<int> w2 = ConvertToInt32(Unsafe.Add(ref s, 2), maxBytes);
            Vector256<int> w3 = ConvertToInt32(Unsafe.Add(ref s, 3), maxBytes);

            Vector256<short> u0 = Avx2.PackSignedSaturate(w0, w1);
            Vector256<short> u1 = Avx2.PackSignedSaturate(w2, w3);
            Vector256<byte> b = Avx2.PackUnsignedSaturate(u0, u1);

            // The 256-bit pack instructions work on each 128-bit lane separately, which leaves
            // the 32-bit groups interleaved; the permute puts them back into source order.
            b = Avx2.PermuteVar8x32(b.AsInt32(), mask).AsByte();

            Unsafe.Add(ref destBase, i) = b;
        }
    }

    /// <summary>
    /// Multiplies <paramref name="vf"/> by <paramref name="scale"/> and converts the product to 32-bit integers.
    /// </summary>
    /// <param name="vf">The float vector to convert.</param>
    /// <param name="scale">The per-element multiplier applied before conversion.</param>
    /// <returns>The scaled values as a vector of <see cref="int"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    private static Vector256<int> ConvertToInt32(Vector256<float> vf, Vector256<float> scale)
    {
        vf = Avx.Multiply(scale, vf);
        return Avx.ConvertToVector256Int32(vf);
    }

    // NOTE: the table below is a historical record. It lists methods (BasicIntrinsics256, UseAvx2)
    // and a Count (1024) that do not appear in this class as it stands.
    //
    // *** RESULTS 2020 March: ***
    // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
    // .NET Core SDK=3.1.200-preview-014971
    // Job-IUZXZT : .NET Core 3.1.2 (CoreCLR 4.700.20.6602, CoreFX 4.700.20.6702), X64 RyuJIT
    //
    // | Method | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
    // |---------------------------- |------ |-----------:|------------:|----------:|------:|--------:|------:|------:|------:|----------:|
    // | FallbackIntrinsics128 | 1024 | 2,952.6 ns | 1,680.77 ns | 92.13 ns | 3.32 | 0.16 | - | - | - | - |
    // | BasicIntrinsics256 | 1024 | 1,664.5 ns | 928.11 ns | 50.87 ns | 1.87 | 0.09 | - | - | - | - |
    // | ExtendedIntrinsic | 1024 | 890.6 ns | 375.48 ns | 20.58 ns | 1.00 | 0.00 | - | - | - | - |
    // | UseAvx2 | 1024 | 299.0 ns | 30.47 ns | 1.67 ns | 0.34 | 0.01 | - | - | - | - |
    // | UseAvx2_Grouped | 1024 | 318.1 ns | 48.19 ns | 2.64 ns | 0.36 | 0.01 | - | - | - | - |
    // | PixelOperations_Base | 1024 | 8,136.9 ns | 1,834.82 ns | 100.57 ns | 9.14 | 0.26 | - | - | - | 24 B |
    // | PixelOperations_Specialized | 1024 | 951.1 ns | 123.93 ns | 6.79 ns | 1.07 | 0.03 | - | - | - | - |
}
|
|
|