diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index aea04737d8..899ab7130b 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -22,8 +22,8 @@ namespace SixLabors.ImageSharp
/// Shuffle single-precision (32-bit) floating-point elements in
/// using the control and store the results in .
///
- /// The source span of floats
- /// The destination span of float
+ /// The source span of floats.
+ /// The destination span of floats.
/// The byte control.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4ChannelReduce(
@@ -58,6 +58,46 @@ namespace SixLabors.ImageSharp
}
}
+ /// <summary>
+ /// Shuffles the leading, vector-sized part of <paramref name="source"/> in 4-byte groups
+ /// using the control, stores the results in <paramref name="dest"/> and advances both spans.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4ChannelReduce(
+     ref ReadOnlySpan<byte> source,
+     ref Span<byte> dest,
+     byte control)
+ {
+     if (Avx2.IsSupported || Ssse3.IsSupported)
+     {
+         // Use the width of the path Shuffle4Channel will actually take. Testing Avx here
+         // would pick the 256-bit width on AVX-only CPUs, where Shuffle4Channel takes the
+         // 128-bit Ssse3 branch, leaving an extra 16-byte block to the scalar remainder.
+         // ModuloP2 presumably returns the length modulo a power of two - TODO confirm.
+         int remainder = Avx2.IsSupported
+             ? ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count)
+             : ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);
+
+         // Number of leading bytes that fill whole vectors.
+         int adjustedCount = source.Length - remainder;
+
+         if (adjustedCount > 0)
+         {
+             Shuffle4Channel(
+                 source.Slice(0, adjustedCount),
+                 dest.Slice(0, adjustedCount),
+                 control);
+
+             source = source.Slice(adjustedCount);
+             dest = dest.Slice(adjustedCount);
+         }
+     }
+ }
+
+
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4Channel(
ReadOnlySpan source,
@@ -98,6 +138,84 @@ namespace SixLabors.ImageSharp
}
}
+ /// <summary>
+ /// Shuffles every 4-byte group of <paramref name="source"/> using <paramref name="control"/>
+ /// and writes the results to <paramref name="dest"/>, one full vector at a time.
+ /// Only whole vectors are processed; any trailing bytes are left untouched.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static void Shuffle4Channel(
+     ReadOnlySpan<byte> source,
+     Span<byte> dest,
+     byte control)
+ {
+     if (Avx2.IsSupported)
+     {
+         int n = dest.Length / Vector256<byte>.Count;
+
+         // Build the mask on the stack for every control. The Shuffle.*_256 properties
+         // yield the same bytes via MmShuffleSpan but allocate a new array on each access.
+         Span<byte> bytes = stackalloc byte[Vector256<byte>.Count];
+         Shuffle.MmShuffleSpan(ref bytes, control);
+         Vector256<byte> vcm = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(bytes));
+
+         ref Vector256<byte> sourceBase =
+             ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source));
+
+         ref Vector256<byte> destBase =
+             ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
+
+         for (int i = 0; i < n; i++)
+         {
+             Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm);
+         }
+     }
+     else
+     {
+         // Ssse3
+         int n = dest.Length / Vector128<byte>.Count;
+
+         // Same stack-built mask as above, sized for a 128-bit vector.
+         Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
+         Shuffle.MmShuffleSpan(ref bytes, control);
+         Vector128<byte> vcm = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes));
+
+         ref Vector128<byte> sourceBase =
+             ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
+
+         ref Vector128<byte> destBase =
+             ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
+
+         for (int i = 0; i < n; i++)
+         {
+             Vector128<byte> vs = Unsafe.Add(ref sourceBase, i);
+             Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(vs, vcm);
+         }
+     }
+ }
+
///
/// Performs a multiplication and an addition of the .
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
index fe7cbb72a5..76746e4d25 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -14,8 +14,8 @@ namespace SixLabors.ImageSharp
/// Shuffle single-precision (32-bit) floating-point elements in
/// using the control and store the results in .
///
- /// The source span of floats
- /// The destination span of float
+ /// The source span of floats.
+ /// The destination span of floats.
/// The byte control.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Channel(
@@ -38,14 +38,43 @@ namespace SixLabors.ImageSharp
}
}
+ /// <summary>
+ /// Shuffles 8-bit integers in <paramref name="source"/> in groups of four
+ /// using the control and stores the results in <paramref name="dest"/>.
+ /// </summary>
+ /// <param name="source">The source span of bytes.</param>
+ /// <param name="dest">The destination span of bytes.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void Shuffle4Channel(
+     ReadOnlySpan<byte> source,
+     Span<byte> dest,
+     byte control)
+ {
+     VerifyShuffleSpanInput(source, dest);
+
+     // TODO: There doesn't seem to be any APIs for
+     // System.Numerics that allow shuffling.
+#if SUPPORTS_RUNTIME_INTRINSICS
+     HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
+#endif
+
+     // Deal with whatever the vectorized pass (if any) left over:
+     if (source.Length > 0)
+     {
+         ShuffleRemainder4Channel(source, dest, control);
+     }
+ }
+
[MethodImpl(InliningOptions.ColdPath)]
- public static void ShuffleRemainder4Channel(
- ReadOnlySpan source,
- Span dest,
+ public static void ShuffleRemainder4Channel(
+ ReadOnlySpan source,
+ Span dest,
byte control)
+ where T : struct
{
- ref float sBase = ref MemoryMarshal.GetReference(source);
- ref float dBase = ref MemoryMarshal.GetReference(dest);
+ ref T sBase = ref MemoryMarshal.GetReference(source);
+ ref T dBase = ref MemoryMarshal.GetReference(dest);
Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);
for (int i = 0; i < source.Length; i += 4)
@@ -58,7 +87,8 @@ namespace SixLabors.ImageSharp
}
[Conditional("DEBUG")]
- private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest)
+ private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest)
+ where T : struct
{
DebugGuard.IsTrue(
source.Length == dest.Length,
@@ -77,49 +107,64 @@ namespace SixLabors.ImageSharp
public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0;
public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2;
- public static ReadOnlySpan WXYZ_128 => MmShuffleByte128(2, 1, 0, 3);
+ public static ReadOnlySpan WXYZ_128 => MmShuffleSpan128(WXYZ); // 16-byte mask for WXYZ. NOTE(review): MmShuffleSpan128 allocates a new array on every access.
- public static ReadOnlySpan XYZW_128 => MmShuffleByte128(3, 2, 1, 0);
+ public static ReadOnlySpan XYZW_128 => MmShuffleSpan128(XYZW); // 16-byte mask for XYZW; allocates per access.
- public static ReadOnlySpan ZYXW_128 => MmShuffleByte128(3, 0, 1, 2);
+ public static ReadOnlySpan ZYXW_128 => MmShuffleSpan128(ZYXW); // 16-byte mask for ZYXW; allocates per access.
- public static ReadOnlySpan WXYZ_256 => MmShuffleByte256(2, 1, 0, 3);
+ public static ReadOnlySpan WXYZ_256 => MmShuffleSpan256(WXYZ); // 32-byte mask for WXYZ. NOTE(review): MmShuffleSpan256 allocates a new array on every access.
- public static ReadOnlySpan XYZW_256 => MmShuffleByte256(3, 2, 1, 0);
+ public static ReadOnlySpan XYZW_256 => MmShuffleSpan256(XYZW); // 32-byte mask for XYZW; allocates per access.
- public static ReadOnlySpan ZYXW_256 => MmShuffleByte256(3, 0, 1, 2);
+ public static ReadOnlySpan ZYXW_256 => MmShuffleSpan256(ZYXW); // 32-byte mask for ZYXW; allocates per access.
- private static byte[] MmShuffleByte128(int p3, int p2, int p1, int p0)
+ private static ReadOnlySpan MmShuffleSpan128(byte control) // Builds the 16-byte shuffle mask for the given control.
{
- byte[] result = new byte[16];
-
- for (int i = 0; i < result.Length; i += 4)
- {
- result[i] = (byte)(p0 + i);
- result[i + 1] = (byte)(p1 + i);
- result[i + 2] = (byte)(p2 + i);
- result[i + 3] = (byte)(p3 + i);
- }
+ Span buffer = new byte[16]; // Fresh heap array on every call.
+ MmShuffleSpan(ref buffer, control); // Fills each 4-byte group of the buffer from the control.
+ return buffer;
+ }
- return result;
+ private static ReadOnlySpan MmShuffleSpan256(byte control) // Builds the 32-byte shuffle mask for the given control.
+ {
+ Span buffer = new byte[32]; // Fresh heap array on every call.
+ MmShuffleSpan(ref buffer, control); // Fills each 4-byte group of the buffer from the control.
+ return buffer;
}
- private static byte[] MmShuffleByte256(int p3, int p2, int p1, int p0)
+ /// <summary>
+ /// Packs four selectors into a single shuffle control byte, placing
+ /// <paramref name="p3"/> in the highest bit pair and <paramref name="p0"/> in the lowest.
+ /// The inputs are not masked; each is presumably expected to be in the range 0-3.
+ /// </summary>
+ /// <param name="p3">The selector shifted into bits 6-7.</param>
+ /// <param name="p2">The selector shifted into bits 4-5.</param>
+ /// <param name="p1">The selector shifted into bits 2-3.</param>
+ /// <param name="p0">The selector placed in bits 0-1.</param>
+ /// <returns>The combined value truncated to a byte.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static byte MmShuffle(int p3, int p2, int p1, int p0)
+ {
+     int packed = p0 | (p1 << 2) | (p2 << 4) | (p3 << 6);
+     return (byte)packed;
+ }
+
+ /// <summary>
+ /// Fills <paramref name="span"/> with the byte-shuffle mask described by <paramref name="control"/>:
+ /// the 4-byte group starting at offset i receives i + p0, i + p1, i + p2 and i + p3,
+ /// where p0..p3 are the values decoded from the control by InverseMmShuffle.
+ /// </summary>
+ /// <param name="span">The buffer to fill. Only whole 4-byte groups are written.</param>
+ /// <param name="control">The byte control.</param>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void MmShuffleSpan(ref Span<byte> span, byte control)
{
- byte[] result = new byte[32];
+     InverseMmShuffle(
+         control,
+         out int p3,
+         out int p2,
+         out int p1,
+         out int p0);
- for (int i = 0; i < result.Length; i += 4)
+     ref byte spanBase = ref MemoryMarshal.GetReference(span);
+
+     // Unsafe.Add is not bounds-checked, so stop before a partial trailing group
+     // rather than writing past the end when the length is not a multiple of four.
+     for (int i = 0; i <= span.Length - 4; i += 4)
{
- result[i] = (byte)(p0 + i);
- result[i + 1] = (byte)(p1 + i);
- result[i + 2] = (byte)(p2 + i);
- result[i + 3] = (byte)(p3 + i);
+         Unsafe.Add(ref spanBase, i) = (byte)(p0 + i);
+         Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i);
+         Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i);
+         Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i);
}
-
- return result;
}
- public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0)
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public static void InverseMmShuffle(
+ byte control,
+ out int p3,
+ out int p2,
+ out int p1,
+ out int p0)
{
p3 = control >> 6 & 0x3;
p2 = control >> 4 & 0x3;
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
new file mode 100644
index 0000000000..baef86099b
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+ /// <summary>
+ /// Benchmarks <c>SimdUtils.Shuffle4Channel</c> on byte buffers of several lengths
+ /// using the <c>WXYZ</c> control.
+ /// </summary>
+ [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+ public class ShuffleByte4Channel
+ {
+     private byte[] source;
+     private byte[] destination;
+
+     /// <summary>
+     /// Gets or sets the length, in bytes, of the source and destination buffers.
+     /// </summary>
+     [Params(128, 256, 512, 1024, 2048)]
+     public int Count { get; set; }
+
+     /// <summary>
+     /// Allocates both buffers and fills the source with pseudo-random bytes.
+     /// </summary>
+     [GlobalSetup]
+     public void Setup()
+     {
+         int length = this.Count;
+
+         // Seeded with the length, so a given size always sees the same input bytes.
+         var random = new Random(length);
+         this.source = new byte[length];
+         this.destination = new byte[length];
+         random.NextBytes(this.source);
+     }
+
+     [Benchmark]
+     public void Shuffle4Channel()
+         => SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ);
+ }
+
+ // 2020-10-26
+ // ##########
+ //
+ // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
+ // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+ // .NET Core SDK = 5.0.100-rc.2.20479.15
+ //
+ // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ // SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+ //
+ // Runtime=.NET Core 3.1
+ //
+ // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
+ // |---------------- |---------------- |-------------------------------------------------- |------ |----------:|----------:|----------:|------:|--------:|-------:|------:|------:|----------:|
+ // | Shuffle4Channel | AVX | Empty | 128 | 33.57 ns | 0.694 ns | 1.268 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
+ // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.97 ns | 0.940 ns | 1.045 ns | 1.94 | 0.10 | - | - | - | - |
+ // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 27.23 ns | 0.338 ns | 0.300 ns | 0.84 | 0.04 | 0.0095 | - | - | 40 B |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | AVX | Empty | 256 | 34.57 ns | 0.295 ns | 0.276 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
+ // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 124.62 ns | 0.257 ns | 0.228 ns | 3.60 | 0.03 | - | - | - | - |
+ // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 32.22 ns | 0.106 ns | 0.099 ns | 0.93 | 0.01 | 0.0095 | - | - | 40 B |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | AVX | Empty | 512 | 40.41 ns | 0.826 ns | 0.848 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
+ // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 251.65 ns | 0.440 ns | 0.412 ns | 6.23 | 0.13 | - | - | - | - |
+ // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 41.54 ns | 0.128 ns | 0.114 ns | 1.03 | 0.02 | 0.0095 | - | - | 40 B |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | AVX | Empty | 1024 | 51.54 ns | 0.156 ns | 0.121 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
+ // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 493.66 ns | 1.316 ns | 1.231 ns | 9.58 | 0.04 | - | - | - | - |
+ // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 61.45 ns | 0.216 ns | 0.181 ns | 1.19 | 0.00 | 0.0095 | - | - | 40 B |
+ // | | | | | | | | | | | | | |
+ // | Shuffle4Channel | AVX | Empty | 2048 | 76.85 ns | 0.176 ns | 0.138 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B |
+ // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 985.64 ns | 11.396 ns | 10.103 ns | 12.84 | 0.15 | - | - | - | - |
+ // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 106.13 ns | 0.335 ns | 0.297 ns | 1.38 | 0.01 | 0.0095 | - | - | 40 B |
+}
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
index 04aab18e4e..e07bcf257f 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
@@ -14,7 +14,10 @@ namespace SixLabors.ImageSharp.Tests.Common
{
SimdUtils.Shuffle.WXYZ,
SimdUtils.Shuffle.XYZW,
- SimdUtils.Shuffle.ZYXW
+ SimdUtils.Shuffle.ZYXW,
+ SimdUtils.Shuffle.MmShuffle(2, 1, 3, 0),
+ SimdUtils.Shuffle.MmShuffle(1, 1, 1, 1),
+ SimdUtils.Shuffle.MmShuffle(3, 3, 3, 3)
};
[Theory]
@@ -28,7 +31,7 @@ namespace SixLabors.ImageSharp.Tests.Common
{
foreach (var count in item)
{
- TestShuffle(
+ TestShuffleFloat4Channel(
(int)count,
(s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl),
ctrl);
@@ -42,7 +45,32 @@ namespace SixLabors.ImageSharp.Tests.Common
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);
}
- private static void TestShuffle(
+ [Theory]
+ [MemberData(nameof(ShuffleControls))]
+ public void BulkShuffleByte4Channel(byte control)
+ {
+     // The test body receives the control as a serialized string from FeatureTestRunner,
+     // so it is a static local function that rebuilds its input instead of capturing it.
+     static void RunTest(string serialized)
+     {
+         byte ctrl = FeatureTestRunner.Deserialize<byte>(serialized);
+         foreach (var item in ArraySizesDivisibleBy4)
+         {
+             foreach (var count in item)
+             {
+                 TestShuffleByte4Channel(
+                     (int)count,
+                     (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl),
+                     ctrl);
+             }
+         }
+     }
+
+     FeatureTestRunner.RunWithHwIntrinsicsFeature(
+         RunTest,
+         control,
+         HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);
+ }
+
+ private static void TestShuffleFloat4Channel(
int count,
Action, Memory> convert,
byte control)
@@ -71,5 +99,36 @@ namespace SixLabors.ImageSharp.Tests.Common
Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5F));
}
+
+ /// <summary>
+ /// Runs <paramref name="convert"/> on <paramref name="count"/> pseudo-random bytes and checks
+ /// the output against a scalar reference shuffle computed from <paramref name="control"/>.
+ /// </summary>
+ /// <param name="count">The buffer length; the reference loop walks it in steps of four.</param>
+ /// <param name="convert">The shuffle under test, called with (source, destination).</param>
+ /// <param name="control">The byte control.</param>
+ private static void TestShuffleByte4Channel(
+     int count,
+     Action<Memory<byte>, Memory<byte>> convert,
+     byte control)
+ {
+     byte[] source = new byte[count];
+     new Random(count).NextBytes(source);
+     byte[] result = new byte[count];
+     byte[] expected = new byte[count];
+
+     SimdUtils.Shuffle.InverseMmShuffle(
+         control,
+         out int p3,
+         out int p2,
+         out int p1,
+         out int p0);
+
+     // Reference result: within each 4-byte group, output byte k is the source byte selected by pk.
+     for (int i = 0; i < expected.Length; i += 4)
+     {
+         expected[i] = source[p0 + i];
+         expected[i + 1] = source[p1 + i];
+         expected[i + 2] = source[p2 + i];
+         expected[i + 3] = source[p3 + i];
+     }
+
+     convert(source, result);
+
+     Assert.Equal(expected, result);
+ }
}
}