diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index aea04737d8..899ab7130b 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -22,8 +22,8 @@ namespace SixLabors.ImageSharp
             /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
             /// using the control and store the results in <paramref name="dest"/>.
             /// </summary>
-            /// <param name="source">The source span of floats</param>
-            /// <param name="dest">The destination span of float</param>
+            /// <param name="source">The source span of floats.</param>
+            /// <param name="dest">The destination span of floats.</param>
             /// <param name="control">The byte control.</param>
             [MethodImpl(InliningOptions.ShortMethod)]
             public static void Shuffle4ChannelReduce(
@@ -58,6 +58,47 @@ namespace SixLabors.ImageSharp
                 }
             }
 
+            /// <summary>
+            /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+            /// using the control and store the results in <paramref name="dest"/>.
+            /// </summary>
+            /// <param name="source">The source span of bytes.</param>
+            /// <param name="dest">The destination span of bytes.</param>
+            /// <param name="control">The byte control.</param>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static void Shuffle4ChannelReduce(
+                ref ReadOnlySpan<byte> source,
+                ref Span<byte> dest,
+                byte control)
+            {
+                if (Avx2.IsSupported || Ssse3.IsSupported)
+                {
+                    int remainder;
+                    if (Avx2.IsSupported)
+                    {
+                        // Shuffle4Channel only takes the 256-bit path on AVX2, so key off the same check.
+                        remainder = ImageMaths.ModuloP2(source.Length, Vector256<byte>.Count);
+                    }
+                    else
+                    {
+                        remainder = ImageMaths.ModuloP2(source.Length, Vector128<byte>.Count);
+                    }
+
+                    int adjustedCount = source.Length - remainder;
+
+                    if (adjustedCount > 0)
+                    {
+                        Shuffle4Channel(
+                            source.Slice(0, adjustedCount),
+                            dest.Slice(0, adjustedCount),
+                            control);
+
+                        source = source.Slice(adjustedCount);
+                        dest = dest.Slice(adjustedCount);
+                    }
+                }
+            }
+
             [MethodImpl(InliningOptions.ShortMethod)]
             private static void Shuffle4Channel(
                 ReadOnlySpan<float> source,
@@ -98,6 +139,84 @@ namespace SixLabors.ImageSharp
                 }
             }
 
+            [MethodImpl(InliningOptions.ShortMethod)]
+            private static void Shuffle4Channel(
+                ReadOnlySpan<byte> source,
+                Span<byte> dest,
+                byte control)
+            {
+                if (Avx2.IsSupported)
+                {
+                    int n = dest.Length / Vector256<byte>.Count;
+
+                    Vector256<byte> vcm;
+                    switch (control)
+                    {
+                        case Shuffle.WXYZ:
+                            vcm = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_256));
+                            break;
+                        case Shuffle.XYZW:
+                            vcm = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(Shuffle.XYZW_256));
+                            break;
+                        case Shuffle.ZYXW:
+                            vcm = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_256));
+                            break;
+                        default:
+                            Span<byte> bytes = stackalloc byte[Vector256<byte>.Count];
+                            Shuffle.MmShuffleSpan(ref bytes, control);
+                            vcm = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(bytes));
+                            break;
+                    }
+
+                    ref Vector256<byte> sourceBase =
+                        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source));
+
+                    ref Vector256<byte> destBase =
+                        ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest));
+
+                    for (int i = 0; i < n; i++)
+                    {
+                        Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm);
+                    }
+                }
+                else
+                {
+                    // Ssse3
+                    int n = dest.Length / Vector128<byte>.Count;
+
+                    Vector128<byte> vcm;
+                    switch (control)
+                    {
+                        case Shuffle.WXYZ:
+                            vcm = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_128));
+                            break;
+                        case Shuffle.XYZW:
+                            vcm = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(Shuffle.XYZW_128));
+                            break;
+                        case Shuffle.ZYXW:
+                            vcm = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_128));
+                            break;
+                        default:
+                            Span<byte> bytes = stackalloc byte[Vector128<byte>.Count];
+                            Shuffle.MmShuffleSpan(ref bytes, control);
+                            vcm = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes));
+                            break;
+                    }
+
+                    ref Vector128<byte> sourceBase =
+                        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source));
+
+                    ref Vector128<byte> destBase =
+                        ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest));
+
+                    for (int i = 0; i < n; i++)
+                    {
+                        Vector128<byte> vs = Unsafe.Add(ref sourceBase, i);
+                        Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(vs, vcm);
+                    }
+                }
+            }
+
             /// <summary>
             /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
             /// </summary>
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
index fe7cbb72a5..76746e4d25 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -14,8 +14,8 @@ namespace SixLabors.ImageSharp
         /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
         /// using the control and store the results in <paramref name="dest"/>.
         /// </summary>
-        /// <param name="source">The source span of floats</param>
-        /// <param name="dest">The destination span of float</param>
+        /// <param name="source">The source span of floats.</param>
+        /// <param name="dest">The destination span of floats.</param>
         /// <param name="control">The byte control.</param>
         [MethodImpl(InliningOptions.ShortMethod)]
         public static void Shuffle4Channel(
@@ -38,14 +38,43 @@ namespace SixLabors.ImageSharp
             }
         }
 
+        /// <summary>
+        /// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
+        /// using the control and store the results in <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="source">The source span of bytes.</param>
+        /// <param name="dest">The destination span of bytes.</param>
+        /// <param name="control">The byte control.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static void Shuffle4Channel(
+            ReadOnlySpan<byte> source,
+            Span<byte> dest,
+            byte control)
+        {
+            VerifyShuffleSpanInput(source, dest);
+
+            // TODO: There doesn't seem to be any APIs for
+            // System.Numerics that allow shuffling.
+#if SUPPORTS_RUNTIME_INTRINSICS
+            HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
+#endif
+
+            // Deal with the remainder:
+            if (source.Length > 0)
+            {
+                ShuffleRemainder4Channel(source, dest, control);
+            }
+        }
+
         [MethodImpl(InliningOptions.ColdPath)]
-        public static void ShuffleRemainder4Channel(
-            ReadOnlySpan<float> source,
-            Span<float> dest,
+        public static void ShuffleRemainder4Channel<T>(
+            ReadOnlySpan<T> source,
+            Span<T> dest,
             byte control)
+            where T : struct
         {
-            ref float sBase = ref MemoryMarshal.GetReference(source);
-            ref float dBase = ref MemoryMarshal.GetReference(dest);
+            ref T sBase = ref MemoryMarshal.GetReference(source);
+            ref T dBase = ref MemoryMarshal.GetReference(dest);
             Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);
 
             for (int i = 0; i < source.Length; i += 4)
@@ -58,7 +87,8 @@ namespace SixLabors.ImageSharp
         }
 
         [Conditional("DEBUG")]
-        private static void VerifyShuffleSpanInput(ReadOnlySpan<float> source, Span<float> dest)
+        private static void VerifyShuffleSpanInput<T>(ReadOnlySpan<T> source, Span<T> dest)
+            where T : struct
         {
             DebugGuard.IsTrue(
                 source.Length == dest.Length,
@@ -77,49 +107,68 @@ namespace SixLabors.ImageSharp
             public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0;
             public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2;
 
-            public static ReadOnlySpan<byte> WXYZ_128 => MmShuffleByte128(2, 1, 0, 3);
+            // The masks are built once and cached: rebuilding them inside the
+            // property getters allocated a new array on every shuffle call.
+            private static readonly byte[] Wxyz128Mask = MmShuffleArray(16, WXYZ);
+            private static readonly byte[] Xyzw128Mask = MmShuffleArray(16, XYZW);
+            private static readonly byte[] Zyxw128Mask = MmShuffleArray(16, ZYXW);
+            private static readonly byte[] Wxyz256Mask = MmShuffleArray(32, WXYZ);
+            private static readonly byte[] Xyzw256Mask = MmShuffleArray(32, XYZW);
+            private static readonly byte[] Zyxw256Mask = MmShuffleArray(32, ZYXW);
+
+            public static ReadOnlySpan<byte> WXYZ_128 => Wxyz128Mask;
 
-            public static ReadOnlySpan<byte> XYZW_128 => MmShuffleByte128(3, 2, 1, 0);
+            public static ReadOnlySpan<byte> XYZW_128 => Xyzw128Mask;
 
-            public static ReadOnlySpan<byte> ZYXW_128 => MmShuffleByte128(3, 0, 1, 2);
+            public static ReadOnlySpan<byte> ZYXW_128 => Zyxw128Mask;
 
-            public static ReadOnlySpan<byte> WXYZ_256 => MmShuffleByte256(2, 1, 0, 3);
+            public static ReadOnlySpan<byte> WXYZ_256 => Wxyz256Mask;
 
-            public static ReadOnlySpan<byte> XYZW_256 => MmShuffleByte256(3, 2, 1, 0);
+            public static ReadOnlySpan<byte> XYZW_256 => Xyzw256Mask;
 
-            public static ReadOnlySpan<byte> ZYXW_256 => MmShuffleByte256(3, 0, 1, 2);
+            public static ReadOnlySpan<byte> ZYXW_256 => Zyxw256Mask;
 
-            private static byte[] MmShuffleByte128(int p3, int p2, int p1, int p0)
+            private static byte[] MmShuffleArray(int length, byte control)
             {
-                byte[] result = new byte[16];
-
-                for (int i = 0; i < result.Length; i += 4)
-                {
-                    result[i] = (byte)(p0 + i);
-                    result[i + 1] = (byte)(p1 + i);
-                    result[i + 2] = (byte)(p2 + i);
-                    result[i + 3] = (byte)(p3 + i);
-                }
+                byte[] result = new byte[length];
+                Span<byte> span = result;
+                MmShuffleSpan(ref span, control);
 
                 return result;
             }
 
-            private static byte[] MmShuffleByte256(int p3, int p2, int p1, int p0)
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static byte MmShuffle(int p3, int p2, int p1, int p0)
+                => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0);
+
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static void MmShuffleSpan(ref Span<byte> span, byte control)
             {
-                byte[] result = new byte[32];
+                InverseMmShuffle(
+                    control,
+                    out int p3,
+                    out int p2,
+                    out int p1,
+                    out int p0);
 
-                for (int i = 0; i < result.Length; i += 4)
+                ref byte spanBase = ref MemoryMarshal.GetReference(span);
+
+                for (int i = 0; i < span.Length; i += 4)
                 {
-                    result[i] = (byte)(p0 + i);
-                    result[i + 1] = (byte)(p1 + i);
-                    result[i + 2] = (byte)(p2 + i);
-                    result[i + 3] = (byte)(p3 + i);
+                    Unsafe.Add(ref spanBase, i) = (byte)(p0 + i);
+                    Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i);
+                    Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i);
+                    Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i);
                 }
-
-                return result;
             }
 
-            public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0)
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static void InverseMmShuffle(
+                byte control,
+                out int p3,
+                out int p2,
+                out int p1,
+                out int p0)
             {
                 p3 = control >> 6 & 0x3;
                 p2 = control >> 4 & 0x3;
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
new file mode 100644
index 0000000000..baef86099b
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+    public class ShuffleByte4Channel
+    {
+        private byte[] source;
+        private byte[] destination;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            this.source = new byte[this.Count];
+            new Random(this.Count).NextBytes(this.source);
+            this.destination = new byte[this.Count];
+        }
+
+        [Params(128, 256, 512, 1024, 2048)]
+        public int Count { get; set; }
+
+        [Benchmark]
+        public void Shuffle4Channel()
+        {
+            SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ);
+        }
+    }
+
+    // 2020-10-26
+    // ##########
+    //
+    // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
+    // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+    // .NET Core SDK = 5.0.100-rc.2.20479.15
+    //
+    // [Host]          : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    // AVX             : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    // No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    // SSE             : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    //
+    // Runtime=.NET Core 3.1
+    //
+    // |          Method |             Job |                              EnvironmentVariables | Count |      Mean |     Error |    StdDev | Ratio | RatioSD |  Gen 0 | Gen 1 | Gen 2 | Allocated |
+    // |---------------- |---------------- |-------------------------------------------------- |------ |----------:|----------:|----------:|------:|--------:|-------:|------:|------:|----------:|
+    // | Shuffle4Channel |             AVX |                                             Empty |   128 |  33.57 ns |  0.694 ns |  1.268 ns |  1.00 |    0.00 | 0.0134 |     - |     - |      56 B |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |   128 |  63.97 ns |  0.940 ns |  1.045 ns |  1.94 |    0.10 |      - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |   128 |  27.23 ns |  0.338 ns |  0.300 ns |  0.84 |    0.04 | 0.0095 |     - |     - |      40 B |
+    // |                 |                 |                                                   |       |           |           |           |       |         |        |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |   256 |  34.57 ns |  0.295 ns |  0.276 ns |  1.00 |    0.00 | 0.0134 |     - |     - |      56 B |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |   256 | 124.62 ns |  0.257 ns |  0.228 ns |  3.60 |    0.03 |      - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |   256 |  32.22 ns |  0.106 ns |  0.099 ns |  0.93 |    0.01 | 0.0095 |     - |     - |      40 B |
+    // |                 |                 |                                                   |       |           |           |           |       |         |        |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |   512 |  40.41 ns |  0.826 ns |  0.848 ns |  1.00 |    0.00 | 0.0134 |     - |     - |      56 B |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |   512 | 251.65 ns |  0.440 ns |  0.412 ns |  6.23 |    0.13 |      - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |   512 |  41.54 ns |  0.128 ns |  0.114 ns |  1.03 |    0.02 | 0.0095 |     - |     - |      40 B |
+    // |                 |                 |                                                   |       |           |           |           |       |         |        |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |  1024 |  51.54 ns |  0.156 ns |  0.121 ns |  1.00 |    0.00 | 0.0134 |     - |     - |      56 B |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |  1024 | 493.66 ns |  1.316 ns |  1.231 ns |  9.58 |    0.04 |      - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |  1024 |  61.45 ns |  0.216 ns |  0.181 ns |  1.19 |    0.00 | 0.0095 |     - |     - |      40 B |
+    // |                 |                 |                                                   |       |           |           |           |       |         |        |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |  2048 |  76.85 ns |  0.176 ns |  0.138 ns |  1.00 |    0.00 | 0.0134 |     - |     - |      56 B |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |  2048 | 985.64 ns | 11.396 ns | 10.103 ns | 12.84 |    0.15 |      - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |  2048 | 106.13 ns |  0.335 ns |  0.297 ns |  1.38 |    0.01 | 0.0095 |     - |     - |      40 B |
+}
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
index 04aab18e4e..e07bcf257f 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
@@ -14,7 +14,10 @@ namespace SixLabors.ImageSharp.Tests.Common
             {
                 SimdUtils.Shuffle.WXYZ,
                 SimdUtils.Shuffle.XYZW,
-                SimdUtils.Shuffle.ZYXW
+                SimdUtils.Shuffle.ZYXW,
+                SimdUtils.Shuffle.MmShuffle(2, 1, 3, 0),
+                SimdUtils.Shuffle.MmShuffle(1, 1, 1, 1),
+                SimdUtils.Shuffle.MmShuffle(3, 3, 3, 3)
             };
 
         [Theory]
@@ -28,7 +31,7 @@ namespace SixLabors.ImageSharp.Tests.Common
                 {
                     foreach (var count in item)
                     {
-                        TestShuffle(
+                        TestShuffleFloat4Channel(
                             (int)count,
                             (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl),
                             ctrl);
@@ -42,7 +45,32 @@ namespace SixLabors.ImageSharp.Tests.Common
                 HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);
         }
 
-        private static void TestShuffle(
+        [Theory]
+        [MemberData(nameof(ShuffleControls))]
+        public void BulkShuffleByte4Channel(byte control)
+        {
+            static void RunTest(string serialized)
+            {
+                byte ctrl = FeatureTestRunner.Deserialize<byte>(serialized);
+                foreach (var item in ArraySizesDivisibleBy4)
+                {
+                    foreach (var count in item)
+                    {
+                        TestShuffleByte4Channel(
+                            (int)count,
+                            (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl),
+                            ctrl);
+                    }
+                }
+            }
+
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                control,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE);
+        }
+
+        private static void TestShuffleFloat4Channel(
             int count,
             Action<Memory<float>, Memory<float>> convert,
             byte control)
@@ -71,5 +99,36 @@ namespace SixLabors.ImageSharp.Tests.Common
 
             Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5F));
         }
+
+        private static void TestShuffleByte4Channel(
+            int count,
+            Action<Memory<byte>, Memory<byte>> convert,
+            byte control)
+        {
+            byte[] source = new byte[count];
+            new Random(count).NextBytes(source);
+            var result = new byte[count];
+
+            byte[] expected = new byte[count];
+
+            SimdUtils.Shuffle.InverseMmShuffle(
+                control,
+                out int p3,
+                out int p2,
+                out int p1,
+                out int p0);
+
+            for (int i = 0; i < expected.Length; i += 4)
+            {
+                expected[i] = source[p0 + i];
+                expected[i + 1] = source[p1 + i];
+                expected[i + 2] = source[p2 + i];
+                expected[i + 3] = source[p3 + i];
+            }
+
+            convert(source, result);
+
+            Assert.Equal(expected, result);
+        }
     }
 }