mirror of https://github.com/SixLabors/ImageSharp
9 changed files with 377 additions and 19 deletions
@ -0,0 +1,131 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using System.Diagnostics; |
||||
|
using System.Runtime.CompilerServices; |
||||
|
using System.Runtime.InteropServices; |
||||
|
|
||||
|
namespace SixLabors.ImageSharp |
||||
|
{ |
||||
|
internal static partial class SimdUtils |
||||
|
{ |
||||
|
/// <summary>
/// Shuffles the single-precision (32-bit) floating-point elements of
/// <paramref name="source"/> in groups of four, as directed by
/// <paramref name="control"/>, and writes the result to <paramref name="dest"/>.
/// </summary>
/// <param name="source">The span of floats to read from.</param>
/// <param name="dest">The span of floats to write to.</param>
/// <param name="control">The packed shuffle control byte.</param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    // Debug-only sanity check of the span lengths.
    VerifyShuffleSpanInput(source, dest);

    // TODO: System.Numerics does not appear to expose any API that
    // allows shuffling, so only the intrinsics path is accelerated.
#if SUPPORTS_RUNTIME_INTRINSICS
    // Both spans are passed by ref, so this call may replace them;
    // whatever they refer to afterwards is handled below.
    HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
#endif

    // Nothing remaining: we are done.
    if (source.IsEmpty)
    {
        return;
    }

    // Deal with the remainder using the scalar fallback.
    ShuffleRemainder4Channel(source, dest, control);
}
||||
|
|
||||
|
/// <summary>
/// Scalar shuffle: for every group of four floats in <paramref name="source"/>,
/// copies the elements selected by <paramref name="control"/> into the
/// matching group of <paramref name="dest"/>.
/// </summary>
/// <param name="source">The span of floats to read from.</param>
/// <param name="dest">The span of floats to write to.</param>
/// <param name="control">The packed shuffle control byte.</param>
/// <remarks>
/// Elements are accessed through <see cref="Unsafe"/> without bounds checks;
/// the loop advances four elements at a time up to the length of
/// <paramref name="source"/>.
/// </remarks>
[MethodImpl(InliningOptions.ColdPath)]
public static void ShuffleRemainder4Channel(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    // Unpack the four 2-bit source indices from the control byte.
    Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    ref float srcStart = ref MemoryMarshal.GetReference(source);
    ref float dstStart = ref MemoryMarshal.GetReference(dest);
    int length = source.Length;

    for (int offset = 0; offset < length; offset += 4)
    {
        // References to the first element of the current group of four.
        ref float src = ref Unsafe.Add(ref srcStart, offset);
        ref float dst = ref Unsafe.Add(ref dstStart, offset);

        dst = Unsafe.Add(ref src, p0);
        Unsafe.Add(ref dst, 1) = Unsafe.Add(ref src, p1);
        Unsafe.Add(ref dst, 2) = Unsafe.Add(ref src, p2);
        Unsafe.Add(ref dst, 3) = Unsafe.Add(ref src, p3);
    }
}
||||
|
|
||||
|
/// <summary>
/// Debug-only validation of the spans handed to the shuffle methods:
/// both spans must have the same length and that length must be a
/// multiple of four.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
[Conditional("DEBUG")]
private static void VerifyShuffleSpanInput(ReadOnlySpan<float> source, Span<float> dest)
{
    DebugGuard.IsTrue(
        source.Length == dest.Length,
        nameof(source),
        "Input spans must be of same length!");

    // The shuffle works on whole groups of four elements.
    DebugGuard.IsTrue(
        source.Length % 4 == 0,
        nameof(source),
        "Input spans must be divisible by 4!");
}
||||
|
|
||||
|
/// <summary>
/// Shuffle control bytes, the byte masks derived from them, and a helper
/// to unpack a control byte into its four element indices.
/// </summary>
/// <remarks>
/// A control byte packs four 2-bit indices as
/// <c>(p3 &lt;&lt; 6) | (p2 &lt;&lt; 4) | (p1 &lt;&lt; 2) | p0</c>, which is
/// the layout <see cref="InverseMmShuffle"/> decodes.
/// </remarks>
public static class Shuffle
{
    public const byte WXYZ = (2 << 6) | (1 << 4) | (0 << 2) | 3;
    public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0;
    public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2;

    // The masks are built once and cached; previously every property
    // access allocated and filled a brand new array.
    private static readonly byte[] Wxyz128Mask = MmShuffleBytes(16, 2, 1, 0, 3);
    private static readonly byte[] Xyzw128Mask = MmShuffleBytes(16, 3, 2, 1, 0);
    private static readonly byte[] Zyxw128Mask = MmShuffleBytes(16, 3, 0, 1, 2);
    private static readonly byte[] Wxyz256Mask = MmShuffleBytes(32, 2, 1, 0, 3);
    private static readonly byte[] Xyzw256Mask = MmShuffleBytes(32, 3, 2, 1, 0);
    private static readonly byte[] Zyxw256Mask = MmShuffleBytes(32, 3, 0, 1, 2);

    /// <summary>Gets the 16-byte mask equivalent of <see cref="WXYZ"/>.</summary>
    public static ReadOnlySpan<byte> WXYZ_128 => Wxyz128Mask;

    /// <summary>Gets the 16-byte mask equivalent of <see cref="XYZW"/>.</summary>
    public static ReadOnlySpan<byte> XYZW_128 => Xyzw128Mask;

    /// <summary>Gets the 16-byte mask equivalent of <see cref="ZYXW"/>.</summary>
    public static ReadOnlySpan<byte> ZYXW_128 => Zyxw128Mask;

    /// <summary>Gets the 32-byte mask equivalent of <see cref="WXYZ"/>.</summary>
    public static ReadOnlySpan<byte> WXYZ_256 => Wxyz256Mask;

    /// <summary>Gets the 32-byte mask equivalent of <see cref="XYZW"/>.</summary>
    public static ReadOnlySpan<byte> XYZW_256 => Xyzw256Mask;

    /// <summary>Gets the 32-byte mask equivalent of <see cref="ZYXW"/>.</summary>
    public static ReadOnlySpan<byte> ZYXW_256 => Zyxw256Mask;

    /// <summary>
    /// Unpacks a control byte into its four 2-bit indices.
    /// </summary>
    /// <param name="control">The packed control byte.</param>
    /// <param name="p3">Bits 6-7 of <paramref name="control"/>.</param>
    /// <param name="p2">Bits 4-5 of <paramref name="control"/>.</param>
    /// <param name="p1">Bits 2-3 of <paramref name="control"/>.</param>
    /// <param name="p0">Bits 0-1 of <paramref name="control"/>.</param>
    public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0)
    {
        p3 = control >> 6 & 0x3;
        p2 = control >> 4 & 0x3;
        p1 = control >> 2 & 0x3;
        p0 = control >> 0 & 0x3;
    }

    /// <summary>
    /// Builds a byte mask of <paramref name="length"/> bytes in which every
    /// group of four bytes starting at offset <c>i</c> holds
    /// <c>p0 + i, p1 + i, p2 + i, p3 + i</c>.
    /// </summary>
    /// <param name="length">The mask length in bytes (16 or 32 here).</param>
    /// <param name="p3">The index stored in the fourth byte of each group.</param>
    /// <param name="p2">The index stored in the third byte of each group.</param>
    /// <param name="p1">The index stored in the second byte of each group.</param>
    /// <param name="p0">The index stored in the first byte of each group.</param>
    /// <returns>The newly allocated mask.</returns>
    private static byte[] MmShuffleBytes(int length, int p3, int p2, int p1, int p0)
    {
        byte[] result = new byte[length];

        for (int i = 0; i < result.Length; i += 4)
        {
            result[i] = (byte)(p0 + i);
            result[i + 1] = (byte)(p1 + i);
            result[i + 2] = (byte)(p2 + i);
            result[i + 3] = (byte)(p3 + i);
        }

        return result;
    }
}
||||
|
} |
||||
|
} |
||||
@ -0,0 +1,68 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using BenchmarkDotNet.Attributes; |
||||
|
using SixLabors.ImageSharp.Tests; |
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
||||
|
{ |
||||
|
/// <summary>
/// Benchmarks the 4-channel float shuffle in SimdUtils using the
/// WXYZ control over buffers of <see cref="Count"/> floats.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class ShuffleFloat4Channel
{
    private float[] source;
    private float[] destination;

    /// <summary>
    /// Gets or sets the number of floats in each buffer.
    /// </summary>
    [Params(128, 256, 512, 1024, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Allocates both buffers; the source is filled from a
    /// <see cref="Random"/> seeded with <see cref="Count"/>.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int length = this.Count;

        this.destination = new float[length];
        this.source = new Random(length).GenerateRandomFloatArray(length, 0, 256);
    }

    /// <summary>
    /// Shuffles the source buffer into the destination buffer.
    /// </summary>
    [Benchmark]
    public void Shuffle4Channel()
        => SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ);
}
||||
|
|
||||
|
// 2020-10-26
|
||||
|
// ##########
|
||||
|
//
|
||||
|
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
||||
|
// Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
||||
|
// .NET Core SDK = 5.0.100-rc.2.20479.15
|
||||
|
//
|
||||
|
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
||||
|
// AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
||||
|
// No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
||||
|
// SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
||||
|
//
|
||||
|
// Runtime=.NET Core 3.1
|
||||
|
//
|
||||
|
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
||||
|
// |---------------- |---------------- |-------------------------------------------------- |------ |------------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
|
||||
|
// | Shuffle4Channel | AVX | Empty | 128 | 14.49 ns | 0.244 ns | 0.217 ns | 1.00 | 0.00 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 87.74 ns | 0.524 ns | 0.490 ns | 6.06 | 0.09 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 23.65 ns | 0.101 ns | 0.094 ns | 1.63 | 0.03 | - | - | - | - |
|
||||
|
// | | | | | | | | | | | | | |
|
||||
|
// | Shuffle4Channel | AVX | Empty | 256 | 25.87 ns | 0.492 ns | 0.673 ns | 1.00 | 0.00 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 159.52 ns | 0.901 ns | 0.843 ns | 6.12 | 0.12 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 45.47 ns | 0.404 ns | 0.378 ns | 1.75 | 0.03 | - | - | - | - |
|
||||
|
// | | | | | | | | | | | | | |
|
||||
|
// | Shuffle4Channel | AVX | Empty | 512 | 49.51 ns | 0.088 ns | 0.083 ns | 1.00 | 0.00 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 297.96 ns | 0.926 ns | 0.821 ns | 6.02 | 0.02 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 90.77 ns | 0.191 ns | 0.169 ns | 1.83 | 0.00 | - | - | - | - |
|
||||
|
// | | | | | | | | | | | | | |
|
||||
|
// | Shuffle4Channel | AVX | Empty | 1024 | 113.09 ns | 1.913 ns | 3.090 ns | 1.00 | 0.00 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 604.58 ns | 1.464 ns | 1.298 ns | 5.29 | 0.18 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 179.44 ns | 0.208 ns | 0.184 ns | 1.57 | 0.05 | - | - | - | - |
|
||||
|
// | | | | | | | | | | | | | |
|
||||
|
// | Shuffle4Channel | AVX | Empty | 2048 | 217.95 ns | 1.314 ns | 1.165 ns | 1.00 | 0.00 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 1,152.04 ns | 3.941 ns | 3.494 ns | 5.29 | 0.03 | - | - | - | - |
|
||||
|
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 349.52 ns | 0.587 ns | 0.520 ns | 1.60 | 0.01 | - | - | - | - |
|
||||
|
} |
||||
@ -0,0 +1,75 @@ |
|||||
|
// Copyright (c) Six Labors.
|
||||
|
// Licensed under the Apache License, Version 2.0.
|
||||
|
|
||||
|
using System; |
||||
|
using SixLabors.ImageSharp.Tests.TestUtilities; |
||||
|
using Xunit; |
||||
|
|
||||
|
namespace SixLabors.ImageSharp.Tests.Common |
||||
|
{ |
||||
|
public partial class SimdUtilsTests |
||||
|
{ |
||||
|
/// <summary>
/// The shuffle control bytes fed to the shuffle theories.
/// </summary>
public static readonly TheoryData<byte> ShuffleControls =
    new TheoryData<byte>
    {
        SimdUtils.Shuffle.WXYZ,
        SimdUtils.Shuffle.XYZW,
        SimdUtils.Shuffle.ZYXW
    };

/// <summary>
/// Checks the 4-channel float shuffle against a scalar reference for
/// every size in <c>ArraySizesDivisibleBy4</c>, executed through the
/// hardware-intrinsics feature runner.
/// </summary>
/// <param name="control">The shuffle control byte under test.</param>
[Theory]
[MemberData(nameof(ShuffleControls))]
public void BulkShuffleFloat4Channel(byte control)
{
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunTest,
        control,
        HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);

    // The runner hands the control byte back in serialized form.
    static void RunTest(string serialized)
    {
        byte shuffleControl = FeatureTestRunner.Deserialize<byte>(serialized);

        Action<Memory<float>, Memory<float>> shuffle =
            (src, dst) => SimdUtils.Shuffle4Channel(src.Span, dst.Span, shuffleControl);

        foreach (var sizes in ArraySizesDivisibleBy4)
        {
            foreach (var size in sizes)
            {
                TestShuffle((int)size, shuffle, shuffleControl);
            }
        }
    }
}
||||
|
|
||||
|
/// <summary>
/// Runs <paramref name="convert"/> over <paramref name="count"/> random
/// floats and compares its output with a scalar reference shuffle
/// derived from <paramref name="control"/>.
/// </summary>
/// <param name="count">The number of floats to shuffle.</param>
/// <param name="convert">The shuffle under test, invoked as (source, destination).</param>
/// <param name="control">The shuffle control byte used to build the expected output.</param>
private static void TestShuffle(
    int count,
    Action<Memory<float>, Memory<float>> convert,
    byte control)
{
    // Seeding with the count keeps the input reproducible per size.
    float[] source = new Random(count).GenerateRandomFloatArray(count, 0, 256);
    var actual = new float[count];
    float[] expected = new float[count];

    SimdUtils.Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    // Reference result: within each group of four, output slot k takes
    // the source element at index pk of the same group.
    for (int group = 0; group < expected.Length; group += 4)
    {
        expected[group] = source[group + p0];
        expected[group + 1] = source[group + p1];
        expected[group + 2] = source[group + p2];
        expected[group + 3] = source[group + p3];
    }

    convert(source, actual);

    Assert.Equal(expected, actual, new ApproximateFloatComparer(1e-5F));
}
||||
|
} |
||||
|
} |
||||
Loading…
Reference in new issue