diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 2d788992ee..aea04737d8 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -18,6 +18,91 @@ namespace SixLabors.ImageSharp
 
             public static ReadOnlySpan<byte> PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 };
 
+            /// <summary>
+            /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+            /// using the control and store the results in <paramref name="dest"/>.
+            /// Both spans are then sliced past the processed elements, leaving only the remainder.
+            /// </summary>
+            /// <param name="source">The source span of floats</param>
+            /// <param name="dest">The destination span of float</param>
+            /// <param name="control">The byte control.</param>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            public static void Shuffle4ChannelReduce(
+                ref ReadOnlySpan<float> source,
+                ref Span<float> dest,
+                byte control)
+            {
+                if (Avx.IsSupported || Sse.IsSupported)
+                {
+                    int remainder;
+                    if (Avx.IsSupported)
+                    {
+                        remainder = ImageMaths.ModuloP2(source.Length, Vector256<float>.Count);
+                    }
+                    else
+                    {
+                        remainder = ImageMaths.ModuloP2(source.Length, Vector128<float>.Count);
+                    }
+
+                    int adjustedCount = source.Length - remainder;
+
+                    if (adjustedCount > 0)
+                    {
+                        Shuffle4Channel(
+                            source.Slice(0, adjustedCount),
+                            dest.Slice(0, adjustedCount),
+                            control);
+
+                        source = source.Slice(adjustedCount);
+                        dest = dest.Slice(adjustedCount);
+                    }
+                }
+            }
+
+            /// <summary>
+            /// Shuffles whole vectors of floats with AVX when available, otherwise with SSE.
+            /// Elements past the last whole vector of <paramref name="dest"/> are not processed.
+            /// </summary>
+            [MethodImpl(InliningOptions.ShortMethod)]
+            private static void Shuffle4Channel(
+                ReadOnlySpan<float> source,
+                Span<float> dest,
+                byte control)
+            {
+                if (Avx.IsSupported)
+                {
+                    int n = dest.Length / Vector256<float>.Count;
+
+                    ref Vector256<float> sourceBase =
+                        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source));
+
+                    ref Vector256<float> destBase =
+                        ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest));
+
+                    for (int i = 0; i < n; i++)
+                    {
+                        Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control);
+                    }
+                }
+                else
+                {
+                    // Sse
+                    int n = dest.Length / Vector128<float>.Count;
+
+                    ref Vector128<float> sourceBase =
+                        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source));
+
+                    ref Vector128<float> destBase =
+                        ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest));
+
+                    for (int i = 0; i < n; i++)
+                    {
+                        Vector128<float> vs = Unsafe.Add(ref sourceBase, i);
+                        Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control);
+                    }
+                }
+            }
+
             /// <summary>
             /// Performs a multiplication and an addition of the <see cref="Vector256{T}"/>.
             /// </summary>
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
new file mode 100644
index 0000000000..fe7cbb72a5
--- /dev/null
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -0,0 +1,145 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+namespace SixLabors.ImageSharp
+{
+    internal static partial class SimdUtils
+    {
+        /// <summary>
+        /// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
+        /// using the control and store the results in <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="source">The source span of floats</param>
+        /// <param name="dest">The destination span of float</param>
+        /// <param name="control">The byte control.</param>
+        [MethodImpl(InliningOptions.ShortMethod)]
+        public static void Shuffle4Channel(
+            ReadOnlySpan<float> source,
+            Span<float> dest,
+            byte control)
+        {
+            VerifyShuffleSpanInput(source, dest);
+
+            // TODO: There doesn't seem to be any APIs for
+            // System.Numerics that allow shuffling.
+#if SUPPORTS_RUNTIME_INTRINSICS
+            HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control);
+#endif
+
+            // Deal with the remainder:
+            if (source.Length > 0)
+            {
+                ShuffleRemainder4Channel(source, dest, control);
+            }
+        }
+
+        /// <summary>
+        /// Scalar path: shuffles every group of four floats in <paramref name="source"/>
+        /// using the control and stores the results in <paramref name="dest"/>.
+        /// </summary>
+        /// <param name="source">The source span of floats</param>
+        /// <param name="dest">The destination span of float</param>
+        /// <param name="control">The byte control.</param>
+        [MethodImpl(InliningOptions.ColdPath)]
+        public static void ShuffleRemainder4Channel(
+            ReadOnlySpan<float> source,
+            Span<float> dest,
+            byte control)
+        {
+            ref float sBase = ref MemoryMarshal.GetReference(source);
+            ref float dBase = ref MemoryMarshal.GetReference(dest);
+            Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0);
+
+            for (int i = 0; i < source.Length; i += 4)
+            {
+                Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i);
+                Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
+                Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+                Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+            }
+        }
+
+        /// <summary>
+        /// Debug-only check that both spans have the same length and that the length is a multiple of 4.
+        /// </summary>
+        [Conditional("DEBUG")]
+        private static void VerifyShuffleSpanInput(ReadOnlySpan<float> source, Span<float> dest)
+        {
+            DebugGuard.IsTrue(
+                source.Length == dest.Length,
+                nameof(source),
+                "Input spans must be of same length!");
+
+            DebugGuard.IsTrue(
+                source.Length % 4 == 0,
+                nameof(source),
+                "Input spans must be divisible by 4!");
+        }
+
+        /// <summary>
+        /// Shuffle control bytes, byte-level shuffle masks and related helpers.
+        /// </summary>
+        public static class Shuffle
+        {
+            public const byte WXYZ = (2 << 6) | (1 << 4) | (0 << 2) | 3;
+            public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0;
+            public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2;
+
+            // The byte masks are built once and cached: building them inside the
+            // property getters would allocate a new array on every access.
+            private static readonly byte[] Wxyz128 = MmShuffleByte(16, 2, 1, 0, 3);
+            private static readonly byte[] Xyzw128 = MmShuffleByte(16, 3, 2, 1, 0);
+            private static readonly byte[] Zyxw128 = MmShuffleByte(16, 3, 0, 1, 2);
+            private static readonly byte[] Wxyz256 = MmShuffleByte(32, 2, 1, 0, 3);
+            private static readonly byte[] Xyzw256 = MmShuffleByte(32, 3, 2, 1, 0);
+            private static readonly byte[] Zyxw256 = MmShuffleByte(32, 3, 0, 1, 2);
+
+            public static ReadOnlySpan<byte> WXYZ_128 => Wxyz128;
+
+            public static ReadOnlySpan<byte> XYZW_128 => Xyzw128;
+
+            public static ReadOnlySpan<byte> ZYXW_128 => Zyxw128;
+
+            public static ReadOnlySpan<byte> WXYZ_256 => Wxyz256;
+
+            public static ReadOnlySpan<byte> XYZW_256 => Xyzw256;
+
+            public static ReadOnlySpan<byte> ZYXW_256 => Zyxw256;
+
+            /// <summary>
+            /// Builds a mask of <paramref name="length"/> bytes in which every group of four bytes
+            /// holds p0, p1, p2, p3, each offset by the index of the group's first byte.
+            /// </summary>
+            private static byte[] MmShuffleByte(int length, int p3, int p2, int p1, int p0)
+            {
+                byte[] result = new byte[length];
+
+                for (int i = 0; i < result.Length; i += 4)
+                {
+                    result[i] = (byte)(p0 + i);
+                    result[i + 1] = (byte)(p1 + i);
+                    result[i + 2] = (byte)(p2 + i);
+                    result[i + 3] = (byte)(p3 + i);
+                }
+
+                return result;
+            }
+
+            /// <summary>
+            /// Splits a shuffle control byte into its four 2-bit selectors.
+            /// </summary>
+            public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0)
+            {
+                p3 = (control >> 6) & 0x3;
+                p2 = (control >> 4) & 0x3;
+                p1 = (control >> 2) & 0x3;
+                p0 = (control >> 0) & 0x3;
+            }
+        }
+    }
+}
diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs
new file mode 100644
index 0000000000..36b9591d9d
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs
@@ -0,0 +1,68 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.Tests;
+
+namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk
+{
+    [Config(typeof(Config.HwIntrinsics_SSE_AVX))]
+    public class ShuffleFloat4Channel
+    {
+        private float[] source;
+        private float[] destination;
+
+        [GlobalSetup]
+        public void Setup()
+        {
+            this.source = new Random(this.Count).GenerateRandomFloatArray(this.Count, 0, 256);
+            this.destination = new float[this.Count];
+        }
+
+        [Params(128, 256, 512, 1024, 2048)]
+        public int Count { get; set; }
+
+        [Benchmark]
+        public void Shuffle4Channel()
+        {
+            SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ);
+        }
+    }
+
+    // 2020-10-26
+    // ##########
+    //
+    // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1)
+    // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores
+    // .NET Core SDK = 5.0.100-rc.2.20479.15
+    //
+    //   [Host]          : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    //   AVX             : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    //   No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    //   SSE             : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT
+    //
+    // Runtime=.NET Core 3.1
+    //
+    // |          Method |             Job |                              EnvironmentVariables | Count |        Mean |    Error |   StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated |
+    // |---------------- |---------------- |-------------------------------------------------- |------ |------------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:|
+    // | Shuffle4Channel |             AVX |                                             Empty |   128 |    14.49 ns | 0.244 ns | 0.217 ns |  1.00 |    0.00 |     - |     - |     - |         - |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |   128 |    87.74 ns | 0.524 ns | 0.490 ns |  6.06 |    0.09 |     - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |   128 |    23.65 ns | 0.101 ns | 0.094 ns |  1.63 |    0.03 |     - |     - |     - |         - |
+    // |                 |                 |                                                   |       |             |          |          |       |         |       |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |   256 |    25.87 ns | 0.492 ns | 0.673 ns |  1.00 |    0.00 |     - |     - |     - |         - |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |   256 |   159.52 ns | 0.901 ns | 0.843 ns |  6.12 |    0.12 |     - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |   256 |    45.47 ns | 0.404 ns | 0.378 ns |  1.75 |    0.03 |     - |     - |     - |         - |
+    // |                 |                 |                                                   |       |             |          |          |       |         |       |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |   512 |    49.51 ns | 0.088 ns | 0.083 ns |  1.00 |    0.00 |     - |     - |     - |         - |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |   512 |   297.96 ns | 0.926 ns | 0.821 ns |  6.02 |    0.02 |     - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |   512 |    90.77 ns | 0.191 ns | 0.169 ns |  1.83 |    0.00 |     - |     - |     - |         - |
+    // |                 |                 |                                                   |       |             |          |          |       |         |       |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |  1024 |   113.09 ns | 1.913 ns | 3.090 ns |  1.00 |    0.00 |     - |     - |     - |         - |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |  1024 |   604.58 ns | 1.464 ns | 1.298 ns |  5.29 |    0.18 |     - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |  1024 |   179.44 ns | 0.208 ns | 0.184 ns |  1.57 |    0.05 |     - |     - |     - |         - |
+    // |                 |                 |                                                   |       |             |          |          |       |         |       |       |       |           |
+    // | Shuffle4Channel |             AVX |                                             Empty |  2048 |   217.95 ns | 1.314 ns | 1.165 ns |  1.00 |    0.00 |     - |     - |     - |         - |
+    // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 |  2048 | 1,152.04 ns | 3.941 ns | 3.494 ns |  5.29 |    0.03 |     - |     - |     - |         - |
+    // | Shuffle4Channel |             SSE |                               COMPlus_EnableAVX=0 |  2048 |   349.52 ns | 0.587 ns | 0.520 ns |  1.60 |    0.01 |     - |     - |     - |         - |
+}
diff --git a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
index eaab162ff2..4784a219b2 100644
--- a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
+++ b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj
@@ -17,6 +17,7 @@
+
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
new file mode 100644
index 0000000000..04aab18e4e
--- /dev/null
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs
@@ -0,0 +1,78 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using System;
+using SixLabors.ImageSharp.Tests.TestUtilities;
+using Xunit;
+
+namespace SixLabors.ImageSharp.Tests.Common
+{
+    public partial class SimdUtilsTests
+    {
+        public static readonly TheoryData<byte> ShuffleControls =
+            new TheoryData<byte>
+            {
+                SimdUtils.Shuffle.WXYZ,
+                SimdUtils.Shuffle.XYZW,
+                SimdUtils.Shuffle.ZYXW
+            };
+
+        [Theory]
+        [MemberData(nameof(ShuffleControls))]
+        public void BulkShuffleFloat4Channel(byte control)
+        {
+            static void RunTest(string serialized)
+            {
+                byte ctrl = FeatureTestRunner.Deserialize<byte>(serialized);
+                foreach (var item in ArraySizesDivisibleBy4)
+                {
+                    foreach (var count in item)
+                    {
+                        TestShuffle(
+                            (int)count,
+                            (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl),
+                            ctrl);
+                    }
+                }
+            }
+
+            // The float shuffle picks its vector path from Avx.IsSupported (not Avx2), so AVX
+            // itself has to be disabled for the SSE fallback to run.
+            // NOTE(review): assumes HwIntrinsics defines DisableAVX - confirm against the enum.
+            FeatureTestRunner.RunWithHwIntrinsicsFeature(
+                RunTest,
+                control,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE);
+        }
+
+        private static void TestShuffle(
+            int count,
+            Action<Memory<float>, Memory<float>> convert,
+            byte control)
+        {
+            float[] source = new Random(count).GenerateRandomFloatArray(count, 0, 256);
+            var result = new float[count];
+
+            float[] expected = new float[count];
+
+            SimdUtils.Shuffle.InverseMmShuffle(
+                control,
+                out int p3,
+                out int p2,
+                out int p1,
+                out int p0);
+
+            for (int i = 0; i < expected.Length; i += 4)
+            {
+                expected[i] = source[p0 + i];
+                expected[i + 1] = source[p1 + i];
+                expected[i + 2] = source[p2 + i];
+                expected[i + 3] = source[p3 + i];
+            }
+
+            convert(source, result);
+
+            Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5F));
+        }
+    }
+}
diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
index 838db742a1..bddadff4da 100644
--- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
+++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs
@@ -13,7 +13,7 @@ using Xunit.Abstractions;
 
 namespace SixLabors.ImageSharp.Tests.Common
 {
-    public class SimdUtilsTests
+    public partial class SimdUtilsTests
     {
         private ITestOutputHelper Output { get; }
 
@@ -212,14 +212,14 @@ namespace SixLabors.ImageSharp.Tests.Common
             static void RunTest(string serialized)
             {
                 TestImpl_BulkConvertByteToNormalizedFloat(
-                    FeatureTestRunner.Deserialize(serialized),
+                    FeatureTestRunner.Deserialize<int>(serialized),
                     (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span));
             }
 
             FeatureTestRunner.RunWithHwIntrinsicsFeature(
                 RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41,
-                count);
+                count,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41);
         }
 #endif
 
@@ -305,14 +305,14 @@ namespace SixLabors.ImageSharp.Tests.Common
             static void RunTest(string serialized)
             {
                 TestImpl_BulkConvertNormalizedFloatToByteClampOverflows(
-                    FeatureTestRunner.Deserialize(serialized),
+                    FeatureTestRunner.Deserialize<int>(serialized),
                     (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span));
             }
 
             FeatureTestRunner.RunWithHwIntrinsicsFeature(
                 RunTest,
-                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2,
-                count);
+                count,
+                HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2);
         }
 #endif
 
diff --git a/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs
index 465bed8a16..b4670cb5d4 100644
--- a/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs
@@ -535,7 +535,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png
             static void RunTest(string serialized)
             {
                 TestImageProvider<Rgba32> provider =
-                    FeatureTestRunner.Deserialize<TestImageProvider<Rgba32>>(serialized);
+                    FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgba32>>(serialized);
 
                 foreach (PngInterlaceMode interlaceMode in InterlaceMode)
                 {
diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
index fdba9ce982..4720ea78ac 100644
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@@ -29,17 +29,19 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
         /// <typeparam name="T">The type to deserialize to.</typeparam>
         /// <param name="value">The string value to deserialize.</param>
         /// <returns>The <see cref="T"/> value.</returns>
-        public static T Deserialize<T>(string value)
+        public static T DeserializeForXunit<T>(string value)
             where T : IXunitSerializable
             => BasicSerializer.Deserialize<T>(value);
 
         /// <summary>
-        /// Allows the deserialization of integers passed to the feature test.
+        /// Allows the deserialization of types implementing <see cref="IConvertible"/>
+        /// passed to the feature test.
         /// </summary>
         /// <param name="value">The string value to deserialize.</param>
-        /// <returns>The <see cref="int"/> value.</returns>
-        public static int Deserialize(string value)
-            => Convert.ToInt32(value);
+        /// <returns>The <typeparamref name="T"/> value.</returns>
+        public static T Deserialize<T>(string value)
+            where T : IConvertible
+            => (T)Convert.ChangeType(value, typeof(T));
 
         /// <summary>
         /// Runs the given test within an environment
@@ -214,12 +216,13 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities
         /// where the given features.
         /// </summary>
         /// <param name="action">The test action to run.</param>
-        /// <param name="intrinsics">The intrinsics features.</param>
         /// <param name="serializable">The value to pass as a parameter to the test action.</param>
-        public static void RunWithHwIntrinsicsFeature(
+        /// <param name="intrinsics">The intrinsics features.</param>
+        public static void RunWithHwIntrinsicsFeature<T>(
             Action<string> action,
-            HwIntrinsics intrinsics,
-            int serializable)
+            T serializable,
+            HwIntrinsics intrinsics)
+            where T : IConvertible
         {
             if (!RemoteExecutor.IsSupported)
             {
diff --git a/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs b/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs
index 646000120f..4cbbefe686 100644
--- a/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs
@@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities.Tests
             static void AssertHwIntrinsicsFeatureDisabled(string serializable)
             {
                 Assert.NotNull(serializable);
-                Assert.NotNull(FeatureTestRunner.Deserialize<FakeSerializable>(serializable));
+                Assert.NotNull(FeatureTestRunner.DeserializeForXunit<FakeSerializable>(serializable));
 
 #if SUPPORTS_RUNTIME_INTRINSICS
                 Assert.False(Sse.IsSupported);
@@ -202,7 +202,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities.Tests
             static void AssertHwIntrinsicsFeatureDisabled(string serializable, string intrinsic)
             {
                 Assert.NotNull(serializable);
-                Assert.NotNull(FeatureTestRunner.Deserialize<FakeSerializable>(serializable));
+                Assert.NotNull(FeatureTestRunner.DeserializeForXunit<FakeSerializable>(serializable));
 
                 switch ((HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic))
                 {