mirror of https://github.com/SixLabors/ImageSharp
9 changed files with 377 additions and 19 deletions
@ -0,0 +1,131 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using System.Diagnostics; |
|||
using System.Runtime.CompilerServices; |
|||
using System.Runtime.InteropServices; |
|||
|
|||
namespace SixLabors.ImageSharp |
|||
{ |
|||
internal static partial class SimdUtils |
|||
{ |
|||
/// <summary>
/// Reorders the single-precision (32-bit) floating-point elements of
/// <paramref name="source"/> in groups of four, as selected by
/// <paramref name="control"/>, and writes the result to <paramref name="dest"/>.
/// </summary>
/// <param name="source">The span of floats to read from.</param>
/// <param name="dest">The span of floats to write to.</param>
/// <param name="control">
/// The shuffle control byte; see <see cref="Shuffle"/> for predefined values.
/// </param>
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    // Debug-only sanity check of the span lengths.
    VerifyShuffleSpanInput(source, dest);

    // TODO: System.Numerics does not appear to offer any API for shuffling.
#if SUPPORTS_RUNTIME_INTRINSICS
    // Both spans are handed over by reference, so the call can replace them;
    // presumably they come back holding only the unprocessed tail (confirm in HwIntrinsics).
    HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
#endif

    // Nothing left for the scalar fallback.
    if (source.IsEmpty)
    {
        return;
    }

    // Deal with the remainder.
    ShuffleRemainder4Channel(source, dest, control);
}
|||
|
|||
/// <summary>
/// Scalar shuffle: for every group of four floats in <paramref name="source"/>,
/// writes the elements selected by <paramref name="control"/> to the same group
/// in <paramref name="dest"/>.
/// </summary>
/// <remarks>
/// Elements are accessed through <see cref="Unsafe"/> without bounds checks and
/// the loop advances four elements at a time up to <c>source.Length</c>.
/// NOTE(review): callers are presumably expected to pass spans of equal length
/// that is a multiple of 4 (only verified in DEBUG by the caller) - confirm.
/// </remarks>
/// <param name="source">The span of floats to read from.</param>
/// <param name="dest">The span of floats to write to.</param>
/// <param name="control">The shuffle control byte.</param>
[MethodImpl(InliningOptions.ColdPath)]
public static void ShuffleRemainder4Channel(
    ReadOnlySpan<float> source,
    Span<float> dest,
    byte control)
{
    // Decode the four 2-bit source indices once, up front.
    Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    ref float sourceStart = ref MemoryMarshal.GetReference(source);
    ref float destStart = ref MemoryMarshal.GetReference(dest);
    int length = source.Length;
    int offset = 0;

    while (offset < length)
    {
        // References to the first element of the current group of four.
        ref float sourceGroup = ref Unsafe.Add(ref sourceStart, offset);
        ref float destGroup = ref Unsafe.Add(ref destStart, offset);

        destGroup = Unsafe.Add(ref sourceGroup, p0);
        Unsafe.Add(ref destGroup, 1) = Unsafe.Add(ref sourceGroup, p1);
        Unsafe.Add(ref destGroup, 2) = Unsafe.Add(ref sourceGroup, p2);
        Unsafe.Add(ref destGroup, 3) = Unsafe.Add(ref sourceGroup, p3);

        offset += 4;
    }
}
|||
|
|||
/// <summary>
/// Debug-only validation of the spans passed to the shuffle methods: both spans
/// must have the same length and that length must be a multiple of 4.
/// Calls to this method are compiled out unless <c>DEBUG</c> is defined.
/// </summary>
/// <param name="source">The source span of floats.</param>
/// <param name="dest">The destination span of floats.</param>
[Conditional("DEBUG")]
private static void VerifyShuffleSpanInput(ReadOnlySpan<float> source, Span<float> dest)
{
    DebugGuard.IsTrue(
        source.Length == dest.Length,
        nameof(source),
        "Input spans must be of same length!");

    // Message previously read "divisiable"; spelling corrected.
    DebugGuard.IsTrue(
        source.Length % 4 == 0,
        nameof(source),
        "Input spans must be divisible by 4!");
}
|||
|
|||
/// <summary>
/// Shuffle control bytes, the matching per-byte index masks, and a helper to
/// decode a control byte into its four 2-bit element indices.
/// </summary>
public static class Shuffle
{
    /// <summary>Control byte encoding the indices p3 = 2, p2 = 1, p1 = 0, p0 = 3.</summary>
    public const byte WXYZ = (2 << 6) | (1 << 4) | (0 << 2) | 3;

    /// <summary>Control byte encoding the indices p3 = 3, p2 = 2, p1 = 1, p0 = 0 (identity order).</summary>
    public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0;

    /// <summary>Control byte encoding the indices p3 = 3, p2 = 0, p1 = 1, p0 = 2.</summary>
    public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2;

    // The masks are immutable lookup data, so they are built exactly once here
    // instead of allocating and filling a new array on every property access.
    private static readonly byte[] Wxyz128 = MmShuffleByte128(2, 1, 0, 3);

    private static readonly byte[] Xyzw128 = MmShuffleByte128(3, 2, 1, 0);

    private static readonly byte[] Zyxw128 = MmShuffleByte128(3, 0, 1, 2);

    private static readonly byte[] Wxyz256 = MmShuffleByte256(2, 1, 0, 3);

    private static readonly byte[] Xyzw256 = MmShuffleByte256(3, 2, 1, 0);

    private static readonly byte[] Zyxw256 = MmShuffleByte256(3, 0, 1, 2);

    /// <summary>Gets the 16-byte index mask equivalent to <see cref="WXYZ"/>.</summary>
    public static ReadOnlySpan<byte> WXYZ_128 => Wxyz128;

    /// <summary>Gets the 16-byte index mask equivalent to <see cref="XYZW"/>.</summary>
    public static ReadOnlySpan<byte> XYZW_128 => Xyzw128;

    /// <summary>Gets the 16-byte index mask equivalent to <see cref="ZYXW"/>.</summary>
    public static ReadOnlySpan<byte> ZYXW_128 => Zyxw128;

    /// <summary>Gets the 32-byte index mask equivalent to <see cref="WXYZ"/>.</summary>
    public static ReadOnlySpan<byte> WXYZ_256 => Wxyz256;

    /// <summary>Gets the 32-byte index mask equivalent to <see cref="XYZW"/>.</summary>
    public static ReadOnlySpan<byte> XYZW_256 => Xyzw256;

    /// <summary>Gets the 32-byte index mask equivalent to <see cref="ZYXW"/>.</summary>
    public static ReadOnlySpan<byte> ZYXW_256 => Zyxw256;

    /// <summary>
    /// Decodes a shuffle control byte into its four 2-bit indices.
    /// </summary>
    /// <param name="control">The control byte to decode.</param>
    /// <param name="p3">Receives bits 7-6 of <paramref name="control"/>.</param>
    /// <param name="p2">Receives bits 5-4 of <paramref name="control"/>.</param>
    /// <param name="p1">Receives bits 3-2 of <paramref name="control"/>.</param>
    /// <param name="p0">Receives bits 1-0 of <paramref name="control"/>.</param>
    public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0)
    {
        p3 = (control >> 6) & 0x3;
        p2 = (control >> 4) & 0x3;
        p1 = (control >> 2) & 0x3;
        p0 = control & 0x3;
    }

    // Builds the 16-byte mask for the given indices.
    private static byte[] MmShuffleByte128(int p3, int p2, int p1, int p0)
        => CreateShuffleMask(16, p3, p2, p1, p0);

    // Builds the 32-byte mask for the given indices.
    private static byte[] MmShuffleByte256(int p3, int p2, int p1, int p0)
        => CreateShuffleMask(32, p3, p2, p1, p0);

    // Fills a mask of 'length' bytes (a multiple of 4): within each group of four
    // bytes starting at offset i, the bytes are p0 + i, p1 + i, p2 + i, p3 + i.
    private static byte[] CreateShuffleMask(int length, int p3, int p2, int p1, int p0)
    {
        byte[] mask = new byte[length];

        for (int i = 0; i < mask.Length; i += 4)
        {
            mask[i] = (byte)(p0 + i);
            mask[i + 1] = (byte)(p1 + i);
            mask[i + 2] = (byte)(p2 + i);
            mask[i + 3] = (byte)(p3 + i);
        }

        return mask;
    }
}
|||
} |
|||
} |
|||
@ -0,0 +1,68 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using BenchmarkDotNet.Attributes; |
|||
using SixLabors.ImageSharp.Tests; |
|||
|
|||
namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk |
|||
{ |
|||
/// <summary>
/// Benchmarks <c>SimdUtils.Shuffle4Channel</c> on float buffers of several sizes
/// using the <c>WXYZ</c> shuffle control.
/// </summary>
[Config(typeof(Config.HwIntrinsics_SSE_AVX))]
public class ShuffleFloat4Channel
{
    private float[] source;
    private float[] destination;

    /// <summary>
    /// Gets or sets the number of floats in the benchmarked buffers.
    /// </summary>
    [Params(128, 256, 512, 1024, 2048)]
    public int Count { get; set; }

    /// <summary>
    /// Allocates the buffers; the source is filled with random floats generated
    /// from a <see cref="Random"/> seeded with <see cref="Count"/>.
    /// </summary>
    [GlobalSetup]
    public void Setup()
    {
        int length = this.Count;
        var rng = new Random(length);

        this.source = rng.GenerateRandomFloatArray(length, 0, 256);
        this.destination = new float[length];
    }

    /// <summary>
    /// The measured operation: shuffles <c>source</c> into <c>destination</c>.
    /// </summary>
    [Benchmark]
    public void Shuffle4Channel()
        => SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ);
}
|||
|
|||
// 2020-10-26
|
|||
// ##########
|
|||
//
|
|||
// BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
|
|||
// Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
|
|||
// .NET Core SDK = 5.0.100-rc.2.20479.15
|
|||
//
|
|||
// [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
// SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
|
|||
//
|
|||
// Runtime=.NET Core 3.1
|
|||
//
|
|||
// | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
|
|||
// |---------------- |---------------- |-------------------------------------------------- |------ |------------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
|
|||
// | Shuffle4Channel | AVX | Empty | 128 | 14.49 ns | 0.244 ns | 0.217 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 87.74 ns | 0.524 ns | 0.490 ns | 6.06 | 0.09 | - | - | - | - |
|
|||
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 23.65 ns | 0.101 ns | 0.094 ns | 1.63 | 0.03 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | AVX | Empty | 256 | 25.87 ns | 0.492 ns | 0.673 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 159.52 ns | 0.901 ns | 0.843 ns | 6.12 | 0.12 | - | - | - | - |
|
|||
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 45.47 ns | 0.404 ns | 0.378 ns | 1.75 | 0.03 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | AVX | Empty | 512 | 49.51 ns | 0.088 ns | 0.083 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 297.96 ns | 0.926 ns | 0.821 ns | 6.02 | 0.02 | - | - | - | - |
|
|||
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 90.77 ns | 0.191 ns | 0.169 ns | 1.83 | 0.00 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | AVX | Empty | 1024 | 113.09 ns | 1.913 ns | 3.090 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 604.58 ns | 1.464 ns | 1.298 ns | 5.29 | 0.18 | - | - | - | - |
|
|||
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 179.44 ns | 0.208 ns | 0.184 ns | 1.57 | 0.05 | - | - | - | - |
|
|||
// | | | | | | | | | | | | | |
|
|||
// | Shuffle4Channel | AVX | Empty | 2048 | 217.95 ns | 1.314 ns | 1.165 ns | 1.00 | 0.00 | - | - | - | - |
|
|||
// | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 1,152.04 ns | 3.941 ns | 3.494 ns | 5.29 | 0.03 | - | - | - | - |
|
|||
// | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 349.52 ns | 0.587 ns | 0.520 ns | 1.60 | 0.01 | - | - | - | - |
|
|||
} |
|||
@ -0,0 +1,75 @@ |
|||
// Copyright (c) Six Labors.
|
|||
// Licensed under the Apache License, Version 2.0.
|
|||
|
|||
using System; |
|||
using SixLabors.ImageSharp.Tests.TestUtilities; |
|||
using Xunit; |
|||
|
|||
namespace SixLabors.ImageSharp.Tests.Common |
|||
{ |
|||
public partial class SimdUtilsTests |
|||
{ |
|||
/// <summary>
/// The shuffle control bytes supplied to the shuffle theories via <c>MemberData</c>.
/// </summary>
public static readonly TheoryData<byte> ShuffleControls = new TheoryData<byte>
{
    SimdUtils.Shuffle.WXYZ,
    SimdUtils.Shuffle.XYZW,
    SimdUtils.Shuffle.ZYXW,
};
|||
|
|||
/// <summary>
/// Checks <c>SimdUtils.Shuffle4Channel</c> against the reference shuffle for
/// every size listed in <c>ArraySizesDivisibleBy4</c>.
/// </summary>
/// <param name="control">The shuffle control byte under test.</param>
[Theory]
[MemberData(nameof(ShuffleControls))]
public void BulkShuffleFloat4Channel(byte control)
{
    // The runner hands the control byte back in serialized form.
    static void RunTest(string serialized)
    {
        byte shuffleControl = FeatureTestRunner.Deserialize<byte>(serialized);

        Action<Memory<float>, Memory<float>> shuffle =
            (src, dst) => SimdUtils.Shuffle4Channel(src.Span, dst.Span, shuffleControl);

        foreach (var sizes in ArraySizesDivisibleBy4)
        {
            foreach (var size in sizes)
            {
                TestShuffle((int)size, shuffle, shuffleControl);
            }
        }
    }

    // NOTE(review): presumably RunTest is executed once per listed intrinsics
    // configuration - confirm against FeatureTestRunner.
    FeatureTestRunner.RunWithHwIntrinsicsFeature(
        RunTest,
        control,
        HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);
}
|||
|
|||
/// <summary>
/// Runs <paramref name="convert"/> on <paramref name="count"/> random floats and
/// compares its output with a straightforward per-group reference shuffle.
/// </summary>
/// <param name="count">The number of floats to shuffle; also seeds the random source.</param>
/// <param name="convert">The shuffle under test, invoked as (source, destination).</param>
/// <param name="control">The shuffle control byte used to build the expected output.</param>
private static void TestShuffle(
    int count,
    Action<Memory<float>, Memory<float>> convert,
    byte control)
{
    var rng = new Random(count);
    float[] source = rng.GenerateRandomFloatArray(count, 0, 256);
    float[] actual = new float[count];
    float[] expected = new float[count];

    SimdUtils.Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);

    // Reference result: within each group of four, output element k takes the
    // source element selected by index pk.
    int offset = 0;
    while (offset < expected.Length)
    {
        expected[offset] = source[p0 + offset];
        expected[offset + 1] = source[p1 + offset];
        expected[offset + 2] = source[p2 + offset];
        expected[offset + 3] = source[p3 + offset];
        offset += 4;
    }

    convert(source, actual);

    var comparer = new ApproximateFloatComparer(1e-5F);
    Assert.Equal(expected, actual, comparer);
}
|||
} |
|||
} |
|||
Loading…
Reference in new issue