From 893247bd882674a6cce48d505b1b34e68a3e27da Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 26 Oct 2020 17:05:50 +0000 Subject: [PATCH 1/9] Add 4 channel float shuffling. --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 80 +++++++++++ .../Common/Helpers/SimdUtils.Shuffle.cs | 131 ++++++++++++++++++ .../Color/Bulk/ShuffleFloat4Channel.cs | 68 +++++++++ .../ImageSharp.Benchmarks.csproj | 1 + .../Common/SimdUtilsTests.Shuffle.cs | 75 ++++++++++ .../ImageSharp.Tests/Common/SimdUtilsTests.cs | 14 +- .../Formats/Png/PngEncoderTests.cs | 2 +- .../FeatureTesting/FeatureTestRunner.cs | 21 +-- .../Tests/FeatureTestRunnerTests.cs | 4 +- 9 files changed, 377 insertions(+), 19 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs create mode 100644 tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 2d788992e..aea04737d 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -18,6 +18,86 @@ namespace SixLabors.ImageSharp public static ReadOnlySpan PermuteMaskEvenOdd8x32 => new byte[] { 0, 0, 0, 0, 2, 0, 0, 0, 4, 0, 0, 0, 6, 0, 0, 0, 1, 0, 0, 0, 3, 0, 0, 0, 5, 0, 0, 0, 7, 0, 0, 0 }; + /// + /// Shuffle single-precision (32-bit) floating-point elements in + /// using the control and store the results in . + /// + /// The source span of floats + /// The destination span of float + /// The byte control. 
+ [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle4ChannelReduce( + ref ReadOnlySpan source, + ref Span dest, + byte control) + { + if (Avx.IsSupported || Sse.IsSupported) + { + int remainder; + if (Avx.IsSupported) + { + remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); + } + else + { + remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + } + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + Shuffle4Channel( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount), + control); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void Shuffle4Channel( + ReadOnlySpan source, + Span dest, + byte control) + { + if (Avx.IsSupported) + { + int n = dest.Length / Vector256.Count; + + ref Vector256 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector256 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + for (int i = 0; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); + } + } + else + { + // Sse + int n = dest.Length / Vector128.Count; + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + for (int i = 0; i < n; i++) + { + Vector128 vs = Unsafe.Add(ref sourceBase, i); + Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); + } + } + } + /// /// Performs a multiplication and an addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs new file mode 100644 index 000000000..fe7cbb72a --- /dev/null +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -0,0 +1,131 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using System.Diagnostics; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + internal static partial class SimdUtils + { + /// + /// Shuffle single-precision (32-bit) floating-point elements in + /// using the control and store the results in . + /// + /// The source span of floats + /// The destination span of float + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle4Channel( + ReadOnlySpan source, + Span dest, + byte control) + { + VerifyShuffleSpanInput(source, dest); + + // TODO: There doesn't seem to be any APIs for + // System.Numerics that allow shuffling. +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control); +#endif + + // Deal with the remainder: + if (source.Length > 0) + { + ShuffleRemainder4Channel(source, dest, control); + } + } + + [MethodImpl(InliningOptions.ColdPath)] + public static void ShuffleRemainder4Channel( + ReadOnlySpan source, + Span dest, + byte control) + { + ref float sBase = ref MemoryMarshal.GetReference(source); + ref float dBase = ref MemoryMarshal.GetReference(dest); + Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); + + for (int i = 0; i < source.Length; i += 4) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); + Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + } + } + + [Conditional("DEBUG")] + private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) + { + DebugGuard.IsTrue( + source.Length == dest.Length, + nameof(source), + "Input spans must be of same length!"); + + DebugGuard.IsTrue( + source.Length % 4 == 0, + nameof(source), + "Input spans must be divisiable by 4!"); + } + + public static class Shuffle + { + public const byte WXYZ = (2 << 6) | 
(1 << 4) | (0 << 2) | 3; + public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0; + public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; + + public static ReadOnlySpan WXYZ_128 => MmShuffleByte128(2, 1, 0, 3); + + public static ReadOnlySpan XYZW_128 => MmShuffleByte128(3, 2, 1, 0); + + public static ReadOnlySpan ZYXW_128 => MmShuffleByte128(3, 0, 1, 2); + + public static ReadOnlySpan WXYZ_256 => MmShuffleByte256(2, 1, 0, 3); + + public static ReadOnlySpan XYZW_256 => MmShuffleByte256(3, 2, 1, 0); + + public static ReadOnlySpan ZYXW_256 => MmShuffleByte256(3, 0, 1, 2); + + private static byte[] MmShuffleByte128(int p3, int p2, int p1, int p0) + { + byte[] result = new byte[16]; + + for (int i = 0; i < result.Length; i += 4) + { + result[i] = (byte)(p0 + i); + result[i + 1] = (byte)(p1 + i); + result[i + 2] = (byte)(p2 + i); + result[i + 3] = (byte)(p3 + i); + } + + return result; + } + + private static byte[] MmShuffleByte256(int p3, int p2, int p1, int p0) + { + byte[] result = new byte[32]; + + for (int i = 0; i < result.Length; i += 4) + { + result[i] = (byte)(p0 + i); + result[i + 1] = (byte)(p1 + i); + result[i + 2] = (byte)(p2 + i); + result[i + 3] = (byte)(p3 + i); + } + + return result; + } + + public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0) + { + p3 = control >> 6 & 0x3; + p2 = control >> 4 & 0x3; + p1 = control >> 2 & 0x3; + p0 = control >> 0 & 0x3; + } + } + } +} diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs new file mode 100644 index 000000000..36b9591d9 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. 
+ +using System; +using BenchmarkDotNet.Attributes; +using SixLabors.ImageSharp.Tests; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class ShuffleFloat4Channel + { + private float[] source; + private float[] destination; + + [GlobalSetup] + public void Setup() + { + this.source = new Random(this.Count).GenerateRandomFloatArray(this.Count, 0, 256); + this.destination = new float[this.Count]; + } + + [Params(128, 256, 512, 1024, 2048)] + public int Count { get; set; } + + [Benchmark] + public void Shuffle4Channel() + { + SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ); + } + } + + // 2020-10-26 + // ########## + // + // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) + // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK = 5.0.100-rc.2.20479.15 + // + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // + // Runtime=.NET Core 3.1 + // + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |---------------- |-------------------------------------------------- |------ |------------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Channel | AVX | Empty | 128 | 14.49 ns | 0.244 ns | 0.217 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 87.74 ns | 0.524 ns | 0.490 ns | 6.06 | 0.09 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 23.65 
ns | 0.101 ns | 0.094 ns | 1.63 | 0.03 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 256 | 25.87 ns | 0.492 ns | 0.673 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 159.52 ns | 0.901 ns | 0.843 ns | 6.12 | 0.12 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 45.47 ns | 0.404 ns | 0.378 ns | 1.75 | 0.03 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 512 | 49.51 ns | 0.088 ns | 0.083 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 297.96 ns | 0.926 ns | 0.821 ns | 6.02 | 0.02 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 90.77 ns | 0.191 ns | 0.169 ns | 1.83 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 1024 | 113.09 ns | 1.913 ns | 3.090 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 604.58 ns | 1.464 ns | 1.298 ns | 5.29 | 0.18 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 179.44 ns | 0.208 ns | 0.184 ns | 1.57 | 0.05 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 2048 | 217.95 ns | 1.314 ns | 1.165 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 1,152.04 ns | 3.941 ns | 3.494 ns | 5.29 | 0.03 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 349.52 ns | 0.587 ns | 0.520 ns | 1.60 | 0.01 | - | - | - | - | +} diff --git a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj index eaab162ff..4784a219b 100644 --- 
a/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj +++ b/tests/ImageSharp.Benchmarks/ImageSharp.Benchmarks.csproj @@ -17,6 +17,7 @@ + diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs new file mode 100644 index 000000000..04aab18e4 --- /dev/null +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -0,0 +1,75 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using SixLabors.ImageSharp.Tests.TestUtilities; +using Xunit; + +namespace SixLabors.ImageSharp.Tests.Common +{ + public partial class SimdUtilsTests + { + public static readonly TheoryData ShuffleControls = + new TheoryData + { + SimdUtils.Shuffle.WXYZ, + SimdUtils.Shuffle.XYZW, + SimdUtils.Shuffle.ZYXW + }; + + [Theory] + [MemberData(nameof(ShuffleControls))] + public void BulkShuffleFloat4Channel(byte control) + { + static void RunTest(string serialized) + { + byte ctrl = FeatureTestRunner.Deserialize(serialized); + foreach (var item in ArraySizesDivisibleBy4) + { + foreach (var count in item) + { + TestShuffle( + (int)count, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl), + ctrl); + } + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + control, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); + } + + private static void TestShuffle( + int count, + Action, Memory> convert, + byte control) + { + float[] source = new Random(count).GenerateRandomFloatArray(count, 0, 256); + var result = new float[count]; + + float[] expected = new float[count]; + + SimdUtils.Shuffle.InverseMmShuffle( + control, + out int p3, + out int p2, + out int p1, + out int p0); + + for (int i = 0; i < expected.Length; i += 4) + { + expected[i] = source[p0 + i]; + expected[i + 1] = source[p1 + i]; + expected[i + 2] = source[p2 + i]; + expected[i + 3] = source[p3 + i]; + } + + convert(source, result); + + Assert.Equal(expected, 
result, new ApproximateFloatComparer(1e-5F)); + } + } +} diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs index 838db742a..bddadff4d 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.cs @@ -13,7 +13,7 @@ using Xunit.Abstractions; namespace SixLabors.ImageSharp.Tests.Common { - public class SimdUtilsTests + public partial class SimdUtilsTests { private ITestOutputHelper Output { get; } @@ -212,14 +212,14 @@ namespace SixLabors.ImageSharp.Tests.Common static void RunTest(string serialized) { TestImpl_BulkConvertByteToNormalizedFloat( - FeatureTestRunner.Deserialize(serialized), + FeatureTestRunner.Deserialize(serialized), (s, d) => SimdUtils.HwIntrinsics.ByteToNormalizedFloat(s.Span, d.Span)); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41, - count); + count, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE41); } #endif @@ -305,14 +305,14 @@ namespace SixLabors.ImageSharp.Tests.Common static void RunTest(string serialized) { TestImpl_BulkConvertNormalizedFloatToByteClampOverflows( - FeatureTestRunner.Deserialize(serialized), + FeatureTestRunner.Deserialize(serialized), (s, d) => SimdUtils.HwIntrinsics.NormalizedFloatToByteSaturate(s.Span, d.Span)); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2, - count); + count, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2); } #endif diff --git a/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs index 465bed8a1..b4670cb5d 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngEncoderTests.cs @@ -535,7 +535,7 @@ namespace SixLabors.ImageSharp.Tests.Formats.Png static void RunTest(string serialized) { 
TestImageProvider provider = - FeatureTestRunner.Deserialize>(serialized); + FeatureTestRunner.DeserializeForXunit>(serialized); foreach (PngInterlaceMode interlaceMode in InterlaceMode) { diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs index fdba9ce98..4720ea78a 100644 --- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs +++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs @@ -29,17 +29,19 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities /// The type to deserialize to. /// The string value to deserialize. /// The value. - public static T Deserialize(string value) + public static T DeserializeForXunit(string value) where T : IXunitSerializable => BasicSerializer.Deserialize(value); /// - /// Allows the deserialization of integers passed to the feature test. + /// Allows the deserialization of types implementing + /// passed to the feature test. /// /// The string value to deserialize. - /// The value. - public static int Deserialize(string value) - => Convert.ToInt32(value); + /// The value. + public static T Deserialize(string value) + where T : IConvertible + => (T)Convert.ChangeType(value, typeof(T)); /// /// Runs the given test within an environment @@ -214,12 +216,13 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities /// where the given features. /// /// The test action to run. - /// The intrinsics features. /// The value to pass as a parameter to the test action. - public static void RunWithHwIntrinsicsFeature( + /// The intrinsics features. 
+ public static void RunWithHwIntrinsicsFeature( Action action, - HwIntrinsics intrinsics, - int serializable) + T serializable, + HwIntrinsics intrinsics) + where T : IConvertible { if (!RemoteExecutor.IsSupported) { diff --git a/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs b/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs index 646000120..4cbbefe68 100644 --- a/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs +++ b/tests/ImageSharp.Tests/TestUtilities/Tests/FeatureTestRunnerTests.cs @@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities.Tests static void AssertHwIntrinsicsFeatureDisabled(string serializable) { Assert.NotNull(serializable); - Assert.NotNull(FeatureTestRunner.Deserialize(serializable)); + Assert.NotNull(FeatureTestRunner.DeserializeForXunit(serializable)); #if SUPPORTS_RUNTIME_INTRINSICS Assert.False(Sse.IsSupported); @@ -202,7 +202,7 @@ namespace SixLabors.ImageSharp.Tests.TestUtilities.Tests static void AssertHwIntrinsicsFeatureDisabled(string serializable, string intrinsic) { Assert.NotNull(serializable); - Assert.NotNull(FeatureTestRunner.Deserialize(serializable)); + Assert.NotNull(FeatureTestRunner.DeserializeForXunit(serializable)); switch ((HwIntrinsics)Enum.Parse(typeof(HwIntrinsics), intrinsic)) { From 99d0a3111d42eb975a7a253c3c0be8c35d1ba125 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 26 Oct 2020 20:04:32 +0000 Subject: [PATCH 2/9] Add 4 channel byte shuffling --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 122 +++++++++++++++++- .../Common/Helpers/SimdUtils.Shuffle.cs | 115 ++++++++++++----- .../Color/Bulk/ShuffleByte4Channel.cs | 68 ++++++++++ .../Common/SimdUtilsTests.Shuffle.cs | 65 +++++++++- 4 files changed, 330 insertions(+), 40 deletions(-) create mode 100644 tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs 
b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index aea04737d..899ab7130 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -22,8 +22,8 @@ namespace SixLabors.ImageSharp /// Shuffle single-precision (32-bit) floating-point elements in /// using the control and store the results in . /// - /// The source span of floats - /// The destination span of float + /// The source span of floats. + /// The destination span of floats. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] public static void Shuffle4ChannelReduce( @@ -58,6 +58,46 @@ namespace SixLabors.ImageSharp } } + /// + /// Shuffle 8-bit integers in a within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle4ChannelReduce( + ref ReadOnlySpan source, + ref Span dest, + byte control) + { + if (Avx2.IsSupported || Ssse3.IsSupported) + { + int remainder; + if (Avx.IsSupported) + { + remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); + } + else + { + remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + } + + int adjustedCount = source.Length - remainder; + + if (adjustedCount > 0) + { + Shuffle4Channel( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount), + control); + + source = source.Slice(adjustedCount); + dest = dest.Slice(adjustedCount); + } + } + } + [MethodImpl(InliningOptions.ShortMethod)] private static void Shuffle4Channel( ReadOnlySpan source, @@ -98,6 +138,84 @@ namespace SixLabors.ImageSharp } } + [MethodImpl(InliningOptions.ShortMethod)] + private static void Shuffle4Channel( + ReadOnlySpan source, + Span dest, + byte control) + { + if (Avx2.IsSupported) + { + int n = dest.Length / Vector256.Count; + + Vector256 vcm; + switch (control) + { + case Shuffle.WXYZ: + vcm = 
Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_256)); + break; + case Shuffle.XYZW: + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.XYZW_256)); + break; + case Shuffle.ZYXW: + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_256)); + break; + default: + Span bytes = stackalloc byte[Vector256.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + break; + } + + ref Vector256 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector256 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + for (int i = 0; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + } + } + else + { + // Ssse3 + int n = dest.Length / Vector128.Count; + + Vector128 vcm; + switch (control) + { + case Shuffle.WXYZ: + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_128)); + break; + case Shuffle.XYZW: + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.XYZW_128)); + break; + case Shuffle.ZYXW: + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_128)); + break; + default: + Span bytes = stackalloc byte[Vector128.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + break; + } + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + for (int i = 0; i < n; i++) + { + Vector128 vs = Unsafe.Add(ref sourceBase, i); + Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(vs, vcm); + } + } + } + /// /// Performs a multiplication and an addition of the . 
/// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index fe7cbb72a..76746e4d2 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -14,8 +14,8 @@ namespace SixLabors.ImageSharp /// Shuffle single-precision (32-bit) floating-point elements in /// using the control and store the results in . /// - /// The source span of floats - /// The destination span of float + /// The source span of floats. + /// The destination span of floats. /// The byte control. [MethodImpl(InliningOptions.ShortMethod)] public static void Shuffle4Channel( @@ -38,14 +38,43 @@ namespace SixLabors.ImageSharp } } + /// + /// Shuffle 8-bit integers in a within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle4Channel( + ReadOnlySpan source, + Span dest, + byte control) + { + VerifyShuffleSpanInput(source, dest); + + // TODO: There doesn't seem to be any APIs for + // System.Numerics that allow shuffling. 
+#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control); +#endif + + // Deal with the remainder: + if (source.Length > 0) + { + ShuffleRemainder4Channel(source, dest, control); + } + } + [MethodImpl(InliningOptions.ColdPath)] - public static void ShuffleRemainder4Channel( - ReadOnlySpan source, - Span dest, + public static void ShuffleRemainder4Channel( + ReadOnlySpan source, + Span dest, byte control) + where T : struct { - ref float sBase = ref MemoryMarshal.GetReference(source); - ref float dBase = ref MemoryMarshal.GetReference(dest); + ref T sBase = ref MemoryMarshal.GetReference(source); + ref T dBase = ref MemoryMarshal.GetReference(dest); Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); for (int i = 0; i < source.Length; i += 4) @@ -58,7 +87,8 @@ namespace SixLabors.ImageSharp } [Conditional("DEBUG")] - private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) + private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) + where T : struct { DebugGuard.IsTrue( source.Length == dest.Length, @@ -77,49 +107,64 @@ namespace SixLabors.ImageSharp public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0; public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; - public static ReadOnlySpan WXYZ_128 => MmShuffleByte128(2, 1, 0, 3); + public static ReadOnlySpan WXYZ_128 => MmShuffleSpan128(WXYZ); - public static ReadOnlySpan XYZW_128 => MmShuffleByte128(3, 2, 1, 0); + public static ReadOnlySpan XYZW_128 => MmShuffleSpan128(XYZW); - public static ReadOnlySpan ZYXW_128 => MmShuffleByte128(3, 0, 1, 2); + public static ReadOnlySpan ZYXW_128 => MmShuffleSpan128(ZYXW); - public static ReadOnlySpan WXYZ_256 => MmShuffleByte256(2, 1, 0, 3); + public static ReadOnlySpan WXYZ_256 => MmShuffleSpan256(WXYZ); - public static ReadOnlySpan XYZW_256 => MmShuffleByte256(3, 2, 1, 0); + public static ReadOnlySpan XYZW_256 => MmShuffleSpan256(XYZW); - public static 
ReadOnlySpan ZYXW_256 => MmShuffleByte256(3, 0, 1, 2); + public static ReadOnlySpan ZYXW_256 => MmShuffleSpan256(ZYXW); - private static byte[] MmShuffleByte128(int p3, int p2, int p1, int p0) + private static ReadOnlySpan MmShuffleSpan128(byte control) { - byte[] result = new byte[16]; - - for (int i = 0; i < result.Length; i += 4) - { - result[i] = (byte)(p0 + i); - result[i + 1] = (byte)(p1 + i); - result[i + 2] = (byte)(p2 + i); - result[i + 3] = (byte)(p3 + i); - } + Span buffer = new byte[16]; + MmShuffleSpan(ref buffer, control); + return buffer; + } - return result; + private static ReadOnlySpan MmShuffleSpan256(byte control) + { + Span buffer = new byte[32]; + MmShuffleSpan(ref buffer, control); + return buffer; } - private static byte[] MmShuffleByte256(int p3, int p2, int p1, int p0) + [MethodImpl(InliningOptions.ShortMethod)] + public static byte MmShuffle(int p3, int p2, int p1, int p0) + => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0); + + [MethodImpl(InliningOptions.ShortMethod)] + public static void MmShuffleSpan(ref Span span, byte control) { - byte[] result = new byte[32]; + InverseMmShuffle( + control, + out int p3, + out int p2, + out int p1, + out int p0); - for (int i = 0; i < result.Length; i += 4) + ref byte spanBase = ref MemoryMarshal.GetReference(span); + + for (int i = 0; i < span.Length; i += 4) { - result[i] = (byte)(p0 + i); - result[i + 1] = (byte)(p1 + i); - result[i + 2] = (byte)(p2 + i); - result[i + 3] = (byte)(p3 + i); + Unsafe.Add(ref spanBase, i) = (byte)(p0 + i); + Unsafe.Add(ref spanBase, i + 1) = (byte)(p1 + i); + Unsafe.Add(ref spanBase, i + 2) = (byte)(p2 + i); + Unsafe.Add(ref spanBase, i + 3) = (byte)(p3 + i); } - - return result; } - public static void InverseMmShuffle(byte control, out int p3, out int p2, out int p1, out int p0) + [MethodImpl(InliningOptions.ShortMethod)] + public static void InverseMmShuffle( + byte control, + out int p3, + out int p2, + out int p1, + out int p0) { p3 = control >> 6 & 0x3; p2 = 
control >> 4 & 0x3; diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs new file mode 100644 index 000000000..baef86099 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class ShuffleByte4Channel + { + private byte[] source; + private byte[] destination; + + [GlobalSetup] + public void Setup() + { + this.source = new byte[this.Count]; + new Random(this.Count).NextBytes(this.source); + this.destination = new byte[this.Count]; + } + + [Params(128, 256, 512, 1024, 2048)] + public int Count { get; set; } + + [Benchmark] + public void Shuffle4Channel() + { + SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ); + } + } + + // 2020-10-26 + // ########## + // + // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) + // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK = 5.0.100-rc.2.20479.15 + // + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // + // Runtime=.NET Core 3.1 + // + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |---------------- |-------------------------------------------------- |------ 
|----------:|----------:|----------:|------:|--------:|-------:|------:|------:|----------:| + // | Shuffle4Channel | AVX | Empty | 128 | 33.57 ns | 0.694 ns | 1.268 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.97 ns | 0.940 ns | 1.045 ns | 1.94 | 0.10 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 27.23 ns | 0.338 ns | 0.300 ns | 0.84 | 0.04 | 0.0095 | - | - | 40 B | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 256 | 34.57 ns | 0.295 ns | 0.276 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 124.62 ns | 0.257 ns | 0.228 ns | 3.60 | 0.03 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 32.22 ns | 0.106 ns | 0.099 ns | 0.93 | 0.01 | 0.0095 | - | - | 40 B | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 512 | 40.41 ns | 0.826 ns | 0.848 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 251.65 ns | 0.440 ns | 0.412 ns | 6.23 | 0.13 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 41.54 ns | 0.128 ns | 0.114 ns | 1.03 | 0.02 | 0.0095 | - | - | 40 B | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 1024 | 51.54 ns | 0.156 ns | 0.121 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 493.66 ns | 1.316 ns | 1.231 ns | 9.58 | 0.04 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 61.45 ns | 0.216 ns | 0.181 ns | 1.19 | 0.00 | 0.0095 | - | - | 40 B | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 2048 | 76.85 ns | 0.176 ns | 0.138 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | + // | 
Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 985.64 ns | 11.396 ns | 10.103 ns | 12.84 | 0.15 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 106.13 ns | 0.335 ns | 0.297 ns | 1.38 | 0.01 | 0.0095 | - | - | 40 B | +} diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 04aab18e4..e07bcf257 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -14,7 +14,10 @@ namespace SixLabors.ImageSharp.Tests.Common { SimdUtils.Shuffle.WXYZ, SimdUtils.Shuffle.XYZW, - SimdUtils.Shuffle.ZYXW + SimdUtils.Shuffle.ZYXW, + SimdUtils.Shuffle.MmShuffle(2, 1, 3, 0), + SimdUtils.Shuffle.MmShuffle(1, 1, 1, 1), + SimdUtils.Shuffle.MmShuffle(3, 3, 3, 3) }; [Theory] @@ -28,7 +31,7 @@ namespace SixLabors.ImageSharp.Tests.Common { foreach (var count in item) { - TestShuffle( + TestShuffleFloat4Channel( (int)count, (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl), ctrl); @@ -42,7 +45,32 @@ namespace SixLabors.ImageSharp.Tests.Common HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); } - private static void TestShuffle( + [Theory] + [MemberData(nameof(ShuffleControls))] + public void BulkShuffleByte4Channel(byte control) + { + static void RunTest(string serialized) + { + byte ctrl = FeatureTestRunner.Deserialize(serialized); + foreach (var item in ArraySizesDivisibleBy4) + { + foreach (var count in item) + { + TestShuffleByte4Channel( + (int)count, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl), + ctrl); + } + } + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + control, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); + } + + private static void TestShuffleFloat4Channel( int count, Action, Memory> convert, byte control) @@ -71,5 +99,36 @@ namespace 
SixLabors.ImageSharp.Tests.Common Assert.Equal(expected, result, new ApproximateFloatComparer(1e-5F)); } + + private static void TestShuffleByte4Channel( + int count, + Action, Memory> convert, + byte control) + { + byte[] source = new byte[count]; + new Random(count).NextBytes(source); + var result = new byte[count]; + + byte[] expected = new byte[count]; + + SimdUtils.Shuffle.InverseMmShuffle( + control, + out int p3, + out int p2, + out int p1, + out int p0); + + for (int i = 0; i < expected.Length; i += 4) + { + expected[i] = source[p0 + i]; + expected[i + 1] = source[p1 + i]; + expected[i + 2] = source[p2 + i]; + expected[i + 3] = source[p3 + i]; + } + + convert(source, result); + + Assert.Equal(expected, result); + } } } From 34963a7f7a40a1950376672e09deba5e794eeb7a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Mon, 26 Oct 2020 22:13:21 +0000 Subject: [PATCH 3/9] Don't use static spans for now. --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 45 ++++--------------- .../Common/Helpers/SimdUtils.Shuffle.cs | 26 ----------- .../Color/Bulk/ShuffleByte4Channel.cs | 42 ++++++++--------- 3 files changed, 30 insertions(+), 83 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 899ab7130..d68e16e23 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -148,24 +148,12 @@ namespace SixLabors.ImageSharp { int n = dest.Length / Vector256.Count; - Vector256 vcm; - switch (control) - { - case Shuffle.WXYZ: - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_256)); - break; - case Shuffle.XYZW: - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.XYZW_256)); - break; - case Shuffle.ZYXW: - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_256)); - break; - default: - Span bytes = stackalloc byte[Vector256.Count]; - Shuffle.MmShuffleSpan(ref bytes, control); - vcm = 
Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - break; - } + // I've chosen to do this for convenience while we determine what + // shuffle controls to add to the library. + // We can add static ROS instances if need be in the future. + Span bytes = stackalloc byte[Vector256.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + Vector256 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); @@ -183,24 +171,9 @@ namespace SixLabors.ImageSharp // Ssse3 int n = dest.Length / Vector128.Count; - Vector128 vcm; - switch (control) - { - case Shuffle.WXYZ: - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.WXYZ_128)); - break; - case Shuffle.XYZW: - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.XYZW_128)); - break; - case Shuffle.ZYXW: - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(Shuffle.ZYXW_128)); - break; - default: - Span bytes = stackalloc byte[Vector128.Count]; - Shuffle.MmShuffleSpan(ref bytes, control); - vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - break; - } + Span bytes = stackalloc byte[Vector128.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 76746e4d2..6b766b88d 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -107,32 +107,6 @@ namespace SixLabors.ImageSharp public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0; public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; - public static ReadOnlySpan WXYZ_128 => MmShuffleSpan128(WXYZ); - - public static ReadOnlySpan XYZW_128 => MmShuffleSpan128(XYZW); - - public static ReadOnlySpan ZYXW_128 => MmShuffleSpan128(ZYXW); - - public 
static ReadOnlySpan WXYZ_256 => MmShuffleSpan256(WXYZ); - - public static ReadOnlySpan XYZW_256 => MmShuffleSpan256(XYZW); - - public static ReadOnlySpan ZYXW_256 => MmShuffleSpan256(ZYXW); - - private static ReadOnlySpan MmShuffleSpan128(byte control) - { - Span buffer = new byte[16]; - MmShuffleSpan(ref buffer, control); - return buffer; - } - - private static ReadOnlySpan MmShuffleSpan256(byte control) - { - Span buffer = new byte[32]; - MmShuffleSpan(ref buffer, control); - return buffer; - } - [MethodImpl(InliningOptions.ShortMethod)] public static byte MmShuffle(int p3, int p2, int p1, int p0) => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0); diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs index baef86099..c45b103e3 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs @@ -44,25 +44,25 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------- |---------------- |-------------------------------------------------- |------ |----------:|----------:|----------:|------:|--------:|-------:|------:|------:|----------:| - // | Shuffle4Channel | AVX | Empty | 128 | 33.57 ns | 0.694 ns | 1.268 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.97 ns | 0.940 ns | 1.045 ns | 1.94 | 0.10 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 27.23 ns | 0.338 ns | 0.300 ns | 0.84 | 0.04 | 0.0095 | - | - | 40 B | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 256 | 34.57 ns | 0.295 ns | 0.276 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | - // | Shuffle4Channel | No 
HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 124.62 ns | 0.257 ns | 0.228 ns | 3.60 | 0.03 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 32.22 ns | 0.106 ns | 0.099 ns | 0.93 | 0.01 | 0.0095 | - | - | 40 B | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 512 | 40.41 ns | 0.826 ns | 0.848 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 251.65 ns | 0.440 ns | 0.412 ns | 6.23 | 0.13 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 41.54 ns | 0.128 ns | 0.114 ns | 1.03 | 0.02 | 0.0095 | - | - | 40 B | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 1024 | 51.54 ns | 0.156 ns | 0.121 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 493.66 ns | 1.316 ns | 1.231 ns | 9.58 | 0.04 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 61.45 ns | 0.216 ns | 0.181 ns | 1.19 | 0.00 | 0.0095 | - | - | 40 B | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 2048 | 76.85 ns | 0.176 ns | 0.138 ns | 1.00 | 0.00 | 0.0134 | - | - | 56 B | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 985.64 ns | 11.396 ns | 10.103 ns | 12.84 | 0.15 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 106.13 ns | 0.335 ns | 0.297 ns | 1.38 | 0.01 | 0.0095 | - | - | 40 B | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |---------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Channel | AVX | Empty | 128 | 20.51 ns | 0.270 ns | 
0.211 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.00 ns | 0.991 ns | 0.927 ns | 3.08 | 0.06 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 17.25 ns | 0.066 ns | 0.058 ns | 0.84 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 256 | 24.57 ns | 0.248 ns | 0.219 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 124.55 ns | 2.501 ns | 2.456 ns | 5.06 | 0.10 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 21.80 ns | 0.094 ns | 0.088 ns | 0.89 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 512 | 28.51 ns | 0.130 ns | 0.115 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 256.52 ns | 1.424 ns | 1.332 ns | 9.00 | 0.07 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 29.72 ns | 0.217 ns | 0.203 ns | 1.04 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 1024 | 36.40 ns | 0.357 ns | 0.334 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 492.71 ns | 1.498 ns | 1.251 ns | 13.52 | 0.12 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 44.71 ns | 0.264 ns | 0.234 ns | 1.23 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | AVX | Empty | 2048 | 59.38 ns | 0.180 ns | 0.159 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 975.05 ns | 2.043 ns | 1.811 ns | 16.42 | 0.05 | - | - | - | - | + // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 81.83 ns | 
0.212 ns | 0.198 ns | 1.38 | 0.01 | - | - | - | - | } From 84a1d1a28bbc5fff9da861b129ddbcccadb33b8b Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 27 Oct 2020 19:32:32 +0000 Subject: [PATCH 4/9] Add optimized fallback for existing shuffles. --- .../Common/Helpers/SimdUtils.Shuffle.cs | 133 +++++++++++++++++- .../Common/SimdUtilsTests.Shuffle.cs | 2 + 2 files changed, 129 insertions(+), 6 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 6b766b88d..4d2678320 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -2,6 +2,7 @@ // Licensed under the Apache License, Version 2.0. using System; +using System.Buffers.Binary; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -67,14 +68,53 @@ namespace SixLabors.ImageSharp } [MethodImpl(InliningOptions.ColdPath)] - public static void ShuffleRemainder4Channel( - ReadOnlySpan source, - Span dest, + public static void ShuffleRemainder4Channel( + ReadOnlySpan source, + Span dest, + byte control) + { + ref float sBase = ref MemoryMarshal.GetReference(source); + ref float dBase = ref MemoryMarshal.GetReference(dest); + Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); + + for (int i = 0; i < source.Length; i += 4) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); + Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + } + } + + [MethodImpl(InliningOptions.ColdPath)] + public static void ShuffleRemainder4Channel( + ReadOnlySpan source, + Span dest, byte control) - where T : struct { - ref T sBase = ref MemoryMarshal.GetReference(source); - ref T dBase = ref MemoryMarshal.GetReference(dest); +#if NETCOREAPP + // The JIT can 
detect and optimize rotation idioms ROTL (Rotate Left) + // and ROTR (Rotate Right) emitting efficient CPU instructions: + // https://github.com/dotnet/coreclr/pull/1830 + switch (control) + { + case Shuffle.WXYZ: + WXYZ(source, dest); + return; + case Shuffle.WZYX: + WZYX(source, dest); + return; + case Shuffle.YZWX: + YZWX(source, dest); + return; + case Shuffle.ZYXW: + ZYXW(source, dest); + return; + } +#endif + + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); for (int i = 0; i < source.Length; i += 4) @@ -86,6 +126,85 @@ namespace SixLabors.ImageSharp } } + [MethodImpl(InliningOptions.ShortMethod)] + private static void WXYZ(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // ROTL(8, packed) = [Z Y X W] + Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24); + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void ZYXW(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // tmp1 = [W 0 Y 0] + // tmp2 = [0 Z 0 X] + // tmp3=ROTL(16, tmp2) = [0 X 0 Z] + // tmp1 + tmp3 = [W X Y Z] + uint tmp1 = packed & 0xFF00FF00; + uint tmp2 = packed & 0x00FF00FF; + uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); + + Unsafe.Add(ref dBase, i) = tmp1 + tmp3; + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void 
WZYX(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // REVERSE(packedArgb) = [X Y Z W] + Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed); + } + } + + [MethodImpl(InliningOptions.ShortMethod)] + private static void YZWX(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // ROTR(8, packedArgb) = [Y Z W X] + Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24); + } + } + [Conditional("DEBUG")] private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) where T : struct @@ -104,7 +223,9 @@ namespace SixLabors.ImageSharp public static class Shuffle { public const byte WXYZ = (2 << 6) | (1 << 4) | (0 << 2) | 3; + public const byte WZYX = (0 << 6) | (1 << 4) | (2 << 2) | 3; public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0; + public const byte YZWX = (0 << 6) | (3 << 4) | (2 << 2) | 1; public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; [MethodImpl(InliningOptions.ShortMethod)] diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index e07bcf257..cdb6a86df 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -13,7 +13,9 @@ namespace SixLabors.ImageSharp.Tests.Common new TheoryData { SimdUtils.Shuffle.WXYZ, + SimdUtils.Shuffle.WZYX, SimdUtils.Shuffle.XYZW, + SimdUtils.Shuffle.YZWX, 
SimdUtils.Shuffle.ZYXW, SimdUtils.Shuffle.MmShuffle(2, 1, 3, 0), SimdUtils.Shuffle.MmShuffle(1, 1, 1, 1), From 956d1a3c77c2a898f67dac2cccc04f526aa17f40 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 28 Oct 2020 00:35:10 +0000 Subject: [PATCH 5/9] Unroll loops --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 105 +++++++++++++++--- .../Common/Helpers/SimdUtils.Shuffle.cs | 2 +- 2 files changed, 88 insertions(+), 19 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index d68e16e23..0ea17c770 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -106,34 +106,72 @@ namespace SixLabors.ImageSharp { if (Avx.IsSupported) { - int n = dest.Length / Vector256.Count; - ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector256.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) + { + ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i); + ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); + + vd0 = Avx.Permute(vs0, control); + Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control); + Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control); + Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control); + } + + if (m > 0) { - Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); + for (int i = u; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); + } } } else { // Sse - int n = dest.Length / Vector128.Count; - ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - 
for (int i = 0; i < n; i++) + int n = dest.Length / Vector128.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) { - Vector128 vs = Unsafe.Add(ref sourceBase, i); - Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); + ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i); + ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); + + vd0 = Sse.Shuffle(vs0, vs0, control); + + Vector128 vs1 = Unsafe.Add(ref vs0, 1); + Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control); + + Vector128 vs2 = Unsafe.Add(ref vs0, 2); + Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control); + + Vector128 vs3 = Unsafe.Add(ref vs0, 3); + Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control); + } + + if (m > 0) + { + for (int i = u; i < n; i++) + { + Vector128 vs = Unsafe.Add(ref sourceBase, i); + Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); + } } } } @@ -146,8 +184,6 @@ namespace SixLabors.ImageSharp { if (Avx2.IsSupported) { - int n = dest.Length / Vector256.Count; - // I've chosen to do this for convenience while we determine what // shuffle controls to add to the library. // We can add static ROS instances if need be in the future. 
@@ -161,16 +197,32 @@ namespace SixLabors.ImageSharp ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector256.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) { - Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); + ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i); + + vd0 = Avx2.Shuffle(vs0, vcm); + Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vcm); + Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vcm); + Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + } + + if (m > 0) + { + for (int i = u; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + } } } else { // Ssse3 - int n = dest.Length / Vector128.Count; - Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); @@ -181,10 +233,27 @@ namespace SixLabors.ImageSharp ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector128.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) { - Vector128 vs = Unsafe.Add(ref sourceBase, i); - Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(vs, vcm); + ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); + ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i); + + vd0 = Ssse3.Shuffle(vs0, vcm); + Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vcm); + Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vcm); + Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + } + + if (m > 0) + { + for (int i = u; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + } } } } diff 
--git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 4d2678320..59b625419 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -229,7 +229,7 @@ namespace SixLabors.ImageSharp public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; [MethodImpl(InliningOptions.ShortMethod)] - public static byte MmShuffle(int p3, int p2, int p1, int p0) + public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0) => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0); [MethodImpl(InliningOptions.ShortMethod)] From 28dc056d831adcec0d47a332bf638abf5f3ff1f3 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 28 Oct 2020 01:08:53 +0000 Subject: [PATCH 6/9] Fix coverage --- src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs | 2 +- tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 0ea17c770..367df03ec 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -74,7 +74,7 @@ namespace SixLabors.ImageSharp if (Avx2.IsSupported || Ssse3.IsSupported) { int remainder; - if (Avx.IsSupported) + if (Avx2.IsSupported) { remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index cdb6a86df..94298f94c 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -44,7 +44,7 @@ namespace SixLabors.ImageSharp.Tests.Common FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, control, - HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); + 
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } [Theory] From 0f950a1e508b1db74cd8f757e164722ac1e0796a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 28 Oct 2020 19:33:49 +0000 Subject: [PATCH 7/9] Implement new optimized 4 channel shuffle methods. --- .../Argb32.PixelOperations.Generated.cs | 64 ++++++------- .../Bgra32.PixelOperations.Generated.cs | 64 ++++++------- .../Rgba32.PixelOperations.Generated.cs | 64 ++++++------- .../Generated/_Common.ttinclude | 32 +++---- .../PixelFormats/Utils/PixelConverter.cs | 89 +++++++------------ .../PixelConversion_ConvertFromRgba32.cs | 56 ++++-------- ...ConverterTests.ReferenceImplementations.cs | 60 ++++++++----- .../PixelFormats/PixelConverterTests.cs | 67 +++++++------- 8 files changed, 216 insertions(+), 280 deletions(-) diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs index 0b1292b64..3f48d2acc 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Argb32.PixelOperations.Generated.cs @@ -53,66 +53,58 @@ namespace SixLabors.ImageSharp.PixelFormats Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale)); } /// - public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref 
MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToRgba32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToRgba32(source, dest); } /// - public override void FromRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToArgb32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToArgb32(source, dest); } /// - public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = 
PixelConverter.FromArgb32.ToBgra32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToBgra32(source, dest); } /// - public override void FromBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToArgb32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToArgb32(source, dest); } /// diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs index 5bdd10404..8cf2d5850 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Bgra32.PixelOperations.Generated.cs @@ -53,66 +53,58 @@ namespace SixLabors.ImageSharp.PixelFormats Vector4Converters.RgbaCompatible.ToVector4(configuration, this, sourcePixels, destVectors, modifiers.Remove(PixelConversionModifiers.Scale)); } /// - public override void ToRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { 
Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToRgba32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToRgba32(source, dest); } /// - public override void FromRgba32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromRgba32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToBgra32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToBgra32(source, dest); } /// - public override void ToArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref 
MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToArgb32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToArgb32(source, dest); } /// - public override void FromArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToBgra32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToBgra32(source, dest); } /// diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs index b05c62f1f..9a36ec29a 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/Rgba32.PixelOperations.Generated.cs @@ -42,66 +42,58 @@ namespace SixLabors.ImageSharp.PixelFormats } /// - public override void ToArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + 
public override void ToArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToArgb32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToArgb32(source, dest); } /// - public override void FromArgb32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromArgb32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromArgb32.ToRgba32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromArgb32.ToRgba32(source, dest); } /// - public override void ToBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void ToBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); 
Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromRgba32.ToBgra32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromRgba32.ToBgra32(source, dest); } /// - public override void FromBgra32(Configuration configuration, ReadOnlySpan sourcePixels, Span destinationPixels) + public override void FromBgra32( + Configuration configuration, + ReadOnlySpan sourcePixels, + Span destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.FromBgra32.ToRgba32(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast(sourcePixels); + Span dest = MemoryMarshal.Cast(destinationPixels); + PixelConverter.FromBgra32.ToRgba32(source, dest); } /// diff --git a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude index 5d56731ba..d8b5286cd 100644 --- a/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude +++ b/src/ImageSharp/PixelFormats/PixelImplementations/Generated/_Common.ttinclude @@ -88,35 +88,31 @@ using System.Runtime.InteropServices; { #> /// - public override void 
To<#=otherPixelType#>(Configuration configuration, ReadOnlySpan<<#=thisPixelType#>> sourcePixels, Span<<#=otherPixelType#>> destinationPixels) + public override void To<#=otherPixelType#>( + Configuration configuration, + ReadOnlySpan<<#=thisPixelType#>> sourcePixels, + Span<<#=otherPixelType#>> destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As<<#=thisPixelType#>,uint>(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As<<#=otherPixelType#>, uint>(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = PixelConverter.From<#=thisPixelType#>.To<#=otherPixelType#>(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast<<#=thisPixelType#>, byte>(sourcePixels); + Span dest = MemoryMarshal.Cast<<#=otherPixelType#>, byte>(destinationPixels); + PixelConverter.From<#=thisPixelType#>.To<#=otherPixelType#>(source, dest); } /// - public override void From<#=otherPixelType#>(Configuration configuration, ReadOnlySpan<<#=otherPixelType#>> sourcePixels, Span<<#=thisPixelType#>> destinationPixels) + public override void From<#=otherPixelType#>( + Configuration configuration, + ReadOnlySpan<<#=otherPixelType#>> sourcePixels, + Span<<#=thisPixelType#>> destinationPixels) { Guard.NotNull(configuration, nameof(configuration)); Guard.DestinationShouldNotBeTooShort(sourcePixels, destinationPixels, nameof(destinationPixels)); - ref uint sourceRef = ref Unsafe.As<<#=otherPixelType#>,uint>(ref MemoryMarshal.GetReference(sourcePixels)); - ref uint destRef = ref Unsafe.As<<#=thisPixelType#>, uint>(ref MemoryMarshal.GetReference(destinationPixels)); - - for (int i = 0; i < sourcePixels.Length; i++) - { - uint sp = Unsafe.Add(ref sourceRef, i); - Unsafe.Add(ref destRef, i) = 
PixelConverter.From<#=otherPixelType#>.To<#=thisPixelType#>(sp); - } + ReadOnlySpan source = MemoryMarshal.Cast<<#=otherPixelType#>, byte>(sourcePixels); + Span dest = MemoryMarshal.Cast<<#=thisPixelType#>, byte>(destinationPixels); + PixelConverter.From<#=otherPixelType#>.To<#=thisPixelType#>(source, dest); } <#+ } diff --git a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs index 814264084..bc24258c9 100644 --- a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs +++ b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs @@ -1,6 +1,7 @@ -// Copyright (c) Six Labors. +// Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. +using System; using System.Buffers.Binary; using System.Runtime.CompilerServices; @@ -21,88 +22,64 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils public static class FromRgba32 { /// - /// Converts a packed to . + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. /// [MethodImpl(InliningOptions.ShortMethod)] - public static uint ToArgb32(uint packedRgba) - { - // packedRgba = [aa bb gg rr] - // ROTL(8, packedRgba) = [bb gg rr aa] - return (packedRgba << 8) | (packedRgba >> 24); - } + public static void ToArgb32(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WXYZ); /// - /// Converts a packed to . + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. 
/// [MethodImpl(InliningOptions.ShortMethod)] - public static uint ToBgra32(uint packedRgba) - { - // packedRgba = [aa bb gg rr] - // tmp1 = [aa 00 gg 00] - // tmp2 = [00 bb 00 rr] - // tmp3=ROTL(16, tmp2) = [00 rr 00 bb] - // tmp1 + tmp3 = [aa rr gg bb] - uint tmp1 = packedRgba & 0xFF00FF00; - uint tmp2 = packedRgba & 0x00FF00FF; - uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); - return tmp1 + tmp3; - } + public static void ToBgra32(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.ZYXW); } public static class FromArgb32 { /// - /// Converts a packed to . + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. /// [MethodImpl(InliningOptions.ShortMethod)] - public static uint ToRgba32(uint packedArgb) - { - // packedArgb = [bb gg rr aa] - // ROTR(8, packedArgb) = [aa bb gg rr] - return (packedArgb >> 8) | (packedArgb << 24); - } + public static void ToRgba32(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.YZWX); /// - /// Converts a packed to . + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. /// [MethodImpl(InliningOptions.ShortMethod)] - public static uint ToBgra32(uint packedArgb) - { - // packedArgb = [bb gg rr aa] - // REVERSE(packedArgb) = [aa rr gg bb] - return BinaryPrimitives.ReverseEndianness(packedArgb); - } + public static void ToBgra32(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WZYX); } public static class FromBgra32 { /// - /// Converts a packed to . + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. 
/// [MethodImpl(InliningOptions.ShortMethod)] - public static uint ToArgb32(uint packedBgra) - { - // packedBgra = [aa rr gg bb] - // REVERSE(packedBgra) = [bb gg rr aa] - return BinaryPrimitives.ReverseEndianness(packedBgra); - } + public static void ToArgb32(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WZYX); /// - /// Converts a packed to . + /// Converts a representing a collection of + /// pixels to a representing + /// a collection of pixels. /// [MethodImpl(InliningOptions.ShortMethod)] - public static uint ToRgba32(uint packedBgra) - { - // packedRgba = [aa rr gg bb] - // tmp1 = [aa 00 gg 00] - // tmp2 = [00 rr 00 bb] - // tmp3=ROTL(16, tmp2) = [00 bb 00 rr] - // tmp1 + tmp3 = [aa bb gg rr] - uint tmp1 = packedBgra & 0xFF00FF00; - uint tmp2 = packedBgra & 0x00FF00FF; - uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); - return tmp1 + tmp3; - } + public static void ToRgba32(ReadOnlySpan source, Span dest) + => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.ZYXW); } } -} \ No newline at end of file +} diff --git a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs index 7d6c2efed..a933f890f 100644 --- a/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs +++ b/tests/ImageSharp.Benchmarks/General/PixelConversion/PixelConversion_ConvertFromRgba32.cs @@ -168,49 +168,27 @@ namespace SixLabors.ImageSharp.Benchmarks.General.PixelConversion [Benchmark] public void PixelConverter_Rgba32_ToArgb32() { - ref uint sBase = ref Unsafe.As(ref this.PermutedRunnerRgbaToArgb.Source[0]); - ref uint dBase = ref Unsafe.As(ref this.PermutedRunnerRgbaToArgb.Dest[0]); + Span source = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Source); + Span dest = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Dest); - for (int i = 0; i < this.Count; i++) - { - uint s = 
Unsafe.Add(ref sBase, i); - Unsafe.Add(ref dBase, i) = PixelConverter.FromRgba32.ToArgb32(s); - } - } - - [Benchmark] - public void PixelConverter_Rgba32_ToArgb32_CopyThenWorkOnSingleBuffer() - { - Span source = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Source); - Span dest = MemoryMarshal.Cast(this.PermutedRunnerRgbaToArgb.Dest); - source.CopyTo(dest); - - ref uint dBase = ref MemoryMarshal.GetReference(dest); - - for (int i = 0; i < this.Count; i++) - { - uint s = Unsafe.Add(ref dBase, i); - Unsafe.Add(ref dBase, i) = PixelConverter.FromRgba32.ToArgb32(s); - } + PixelConverter.FromRgba32.ToArgb32(source, dest); } /* RESULTS: - Method | Count | Mean | Error | StdDev | Scaled | ScaledSD | - ---------------------------------------------------------- |------ |-----------:|-----------:|-----------:|-------:|---------:| - ByRef | 256 | 328.7 ns | 6.6141 ns | 6.1868 ns | 1.00 | 0.00 | - ByVal | 256 | 322.0 ns | 4.3541 ns | 4.0728 ns | 0.98 | 0.02 | - FromBytes | 256 | 321.5 ns | 3.3499 ns | 3.1335 ns | 0.98 | 0.02 | - InlineShuffle | 256 | 330.7 ns | 4.2525 ns | 3.9778 ns | 1.01 | 0.02 | - PixelConverter_Rgba32_ToArgb32 | 256 | 167.4 ns | 0.6357 ns | 0.5309 ns | 0.51 | 0.01 | - PixelConverter_Rgba32_ToArgb32_CopyThenWorkOnSingleBuffer | 256 | 196.6 ns | 0.8929 ns | 0.7915 ns | 0.60 | 0.01 | - | | | | | | | - ByRef | 2048 | 2,534.4 ns | 8.2947 ns | 6.9265 ns | 1.00 | 0.00 | - ByVal | 2048 | 2,638.5 ns | 52.6843 ns | 70.3320 ns | 1.04 | 0.03 | - FromBytes | 2048 | 2,517.2 ns | 40.8055 ns | 38.1695 ns | 0.99 | 0.01 | - InlineShuffle | 2048 | 2,546.5 ns | 21.2506 ns | 19.8778 ns | 1.00 | 0.01 | - PixelConverter_Rgba32_ToArgb32 | 2048 | 1,265.7 ns | 5.1397 ns | 4.5562 ns | 0.50 | 0.00 | - PixelConverter_Rgba32_ToArgb32_CopyThenWorkOnSingleBuffer | 2048 | 1,410.3 ns | 11.1939 ns | 9.9231 ns | 0.56 | 0.00 | - */ + | Method | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | + |------------------------------- |------ 
|------------:|----------:|----------:|------------:|------:|--------:| + | ByRef | 256 | 288.84 ns | 19.601 ns | 52.319 ns | 268.10 ns | 1.00 | 0.00 | + | ByVal | 256 | 267.97 ns | 1.831 ns | 1.713 ns | 267.85 ns | 0.77 | 0.18 | + | FromBytes | 256 | 266.81 ns | 2.427 ns | 2.270 ns | 266.47 ns | 0.76 | 0.18 | + | InlineShuffle | 256 | 291.41 ns | 5.820 ns | 5.444 ns | 290.17 ns | 0.83 | 0.19 | + | PixelConverter_Rgba32_ToArgb32 | 256 | 38.62 ns | 0.431 ns | 0.403 ns | 38.68 ns | 0.11 | 0.03 | + | | | | | | | | | + | ByRef | 2048 | 2,197.69 ns | 15.826 ns | 14.804 ns | 2,197.25 ns | 1.00 | 0.00 | + | ByVal | 2048 | 2,226.81 ns | 44.266 ns | 62.054 ns | 2,197.17 ns | 1.03 | 0.04 | + | FromBytes | 2048 | 2,181.35 ns | 18.033 ns | 16.868 ns | 2,185.97 ns | 0.99 | 0.01 | + | InlineShuffle | 2048 | 2,233.10 ns | 27.673 ns | 24.531 ns | 2,229.78 ns | 1.02 | 0.01 | + | PixelConverter_Rgba32_ToArgb32 | 2048 | 139.90 ns | 2.152 ns | 3.825 ns | 138.70 ns | 0.06 | 0.00 | + */ } } diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.ReferenceImplementations.cs b/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.ReferenceImplementations.cs index 6fda9dbba..9d0d09a98 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.ReferenceImplementations.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.ReferenceImplementations.cs @@ -13,34 +13,49 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats { public static class ReferenceImplementations { - public static Rgba32 MakeRgba32(byte r, byte g, byte b, byte a) + public static byte[] MakeRgba32ByteArray(byte r, byte g, byte b, byte a) { - Rgba32 d = default; - d.R = r; - d.G = g; - d.B = b; - d.A = a; - return d; + var buffer = new byte[256]; + + for (int i = 0; i < buffer.Length; i += 4) + { + buffer[i] = r; + buffer[i + 1] = g; + buffer[i + 2] = b; + buffer[i + 3] = a; + } + + return buffer; } - public static Argb32 MakeArgb32(byte r, byte g, byte b, byte a) + public static byte[] 
MakeArgb32ByteArray(byte r, byte g, byte b, byte a) { - Argb32 d = default; - d.R = r; - d.G = g; - d.B = b; - d.A = a; - return d; + var buffer = new byte[256]; + + for (int i = 0; i < buffer.Length; i += 4) + { + buffer[i] = a; + buffer[i + 1] = r; + buffer[i + 2] = g; + buffer[i + 3] = b; + } + + return buffer; } - public static Bgra32 MakeBgra32(byte r, byte g, byte b, byte a) + public static byte[] MakeBgra32ByteArray(byte r, byte g, byte b, byte a) { - Bgra32 d = default; - d.R = r; - d.G = g; - d.B = b; - d.A = a; - return d; + var buffer = new byte[256]; + + for (int i = 0; i < buffer.Length; i += 4) + { + buffer[i] = b; + buffer[i + 1] = g; + buffer[i + 2] = r; + buffer[i + 3] = a; + } + + return buffer; } internal static void To( @@ -83,8 +98,7 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats if (typeof(TDestinationPixel) == typeof(L8)) { - ref L8 l8Ref = ref MemoryMarshal.GetReference( - MemoryMarshal.Cast(destinationPixels)); + ref L8 l8Ref = ref MemoryMarshal.GetReference(MemoryMarshal.Cast(destinationPixels)); for (int i = 0; i < count; i++) { ref TSourcePixel sp = ref Unsafe.Add(ref sourceRef, i); diff --git a/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs b/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs index 3de6804dc..6eed875f3 100644 --- a/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs +++ b/tests/ImageSharp.Tests/PixelFormats/PixelConverterTests.cs @@ -1,6 +1,7 @@ // Copyright (c) Six Labors. // Licensed under the Apache License, Version 2.0. 
+using System; using SixLabors.ImageSharp.PixelFormats; using SixLabors.ImageSharp.PixelFormats.Utils; @@ -33,30 +34,28 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats [MemberData(nameof(RgbaData))] public void ToArgb32(byte r, byte g, byte b, byte a) { - Rgba32 s = ReferenceImplementations.MakeRgba32(r, g, b, a); + byte[] source = ReferenceImplementations.MakeRgba32ByteArray(r, g, b, a); + var actual = new byte[source.Length]; - // Act: - uint actualPacked = PixelConverter.FromRgba32.ToArgb32(s.PackedValue); + PixelConverter.FromRgba32.ToArgb32(source, actual); - // Assert: - uint expectedPacked = ReferenceImplementations.MakeArgb32(r, g, b, a).PackedValue; + byte[] expected = ReferenceImplementations.MakeArgb32ByteArray(r, g, b, a); - Assert.Equal(expectedPacked, actualPacked); + Assert.Equal(expected, actual); } [Theory] [MemberData(nameof(RgbaData))] public void ToBgra32(byte r, byte g, byte b, byte a) { - Rgba32 s = ReferenceImplementations.MakeRgba32(r, g, b, a); + byte[] source = ReferenceImplementations.MakeRgba32ByteArray(r, g, b, a); + var actual = new byte[source.Length]; - // Act: - uint actualPacked = PixelConverter.FromRgba32.ToBgra32(s.PackedValue); + PixelConverter.FromRgba32.ToBgra32(source, actual); - // Assert: - uint expectedPacked = ReferenceImplementations.MakeBgra32(r, g, b, a).PackedValue; + byte[] expected = ReferenceImplementations.MakeBgra32ByteArray(r, g, b, a); - Assert.Equal(expectedPacked, actualPacked); + Assert.Equal(expected, actual); } } @@ -66,30 +65,28 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats [MemberData(nameof(RgbaData))] public void ToRgba32(byte r, byte g, byte b, byte a) { - Argb32 s = ReferenceImplementations.MakeArgb32(r, g, b, a); + byte[] source = ReferenceImplementations.MakeArgb32ByteArray(r, g, b, a); + var actual = new byte[source.Length]; - // Act: - uint actualPacked = PixelConverter.FromArgb32.ToRgba32(s.PackedValue); + PixelConverter.FromArgb32.ToRgba32(source, actual); - // Assert: - uint 
expectedPacked = ReferenceImplementations.MakeRgba32(r, g, b, a).PackedValue; + byte[] expected = ReferenceImplementations.MakeRgba32ByteArray(r, g, b, a); - Assert.Equal(expectedPacked, actualPacked); + Assert.Equal(expected, actual); } [Theory] [MemberData(nameof(RgbaData))] public void ToBgra32(byte r, byte g, byte b, byte a) { - Argb32 s = ReferenceImplementations.MakeArgb32(r, g, b, a); + byte[] source = ReferenceImplementations.MakeArgb32ByteArray(r, g, b, a); + var actual = new byte[source.Length]; - // Act: - uint actualPacked = PixelConverter.FromArgb32.ToBgra32(s.PackedValue); + PixelConverter.FromArgb32.ToBgra32(source, actual); - // Assert: - uint expectedPacked = ReferenceImplementations.MakeBgra32(r, g, b, a).PackedValue; + byte[] expected = ReferenceImplementations.MakeBgra32ByteArray(r, g, b, a); - Assert.Equal(expectedPacked, actualPacked); + Assert.Equal(expected, actual); } } @@ -99,30 +96,28 @@ namespace SixLabors.ImageSharp.Tests.PixelFormats [MemberData(nameof(RgbaData))] public void ToArgb32(byte r, byte g, byte b, byte a) { - Bgra32 s = ReferenceImplementations.MakeBgra32(r, g, b, a); + byte[] source = ReferenceImplementations.MakeBgra32ByteArray(r, g, b, a); + var actual = new byte[source.Length]; - // Act: - uint actualPacked = PixelConverter.FromBgra32.ToArgb32(s.PackedValue); + PixelConverter.FromBgra32.ToArgb32(source, actual); - // Assert: - uint expectedPacked = ReferenceImplementations.MakeArgb32(r, g, b, a).PackedValue; + byte[] expected = ReferenceImplementations.MakeArgb32ByteArray(r, g, b, a); - Assert.Equal(expectedPacked, actualPacked); + Assert.Equal(expected, actual); } [Theory] [MemberData(nameof(RgbaData))] public void ToRgba32(byte r, byte g, byte b, byte a) { - Bgra32 s = ReferenceImplementations.MakeBgra32(r, g, b, a); + byte[] source = ReferenceImplementations.MakeBgra32ByteArray(r, g, b, a); + var actual = new byte[source.Length]; - // Act: - uint actualPacked = PixelConverter.FromBgra32.ToRgba32(s.PackedValue); + 
PixelConverter.FromBgra32.ToRgba32(source, actual); - // Assert: - uint expectedPacked = ReferenceImplementations.MakeRgba32(r, g, b, a).PackedValue; + byte[] expected = ReferenceImplementations.MakeRgba32ByteArray(r, g, b, a); - Assert.Equal(expectedPacked, actualPacked); + Assert.Equal(expected, actual); } } } From aa20c09c4896738ba1df05ace362c2d08f64854d Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Wed, 28 Oct 2020 22:07:54 +0000 Subject: [PATCH 8/9] Update based on feedback --- .../Common/Helpers/IComponentShuffle.cs | 165 ++++++++++++++++++ .../Common/Helpers/SimdUtils.Shuffle.cs | 142 +-------------- .../PixelFormats/Utils/PixelConverter.cs | 13 +- .../Color/Bulk/ShuffleByte4Channel.cs | 2 +- .../Color/Bulk/ShuffleFloat4Channel.cs | 2 +- .../Config.HwIntrinsics.cs | 11 +- .../Common/SimdUtilsTests.Shuffle.cs | 93 ++++++---- 7 files changed, 245 insertions(+), 183 deletions(-) create mode 100644 src/ImageSharp/Common/Helpers/IComponentShuffle.cs diff --git a/src/ImageSharp/Common/Helpers/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/IComponentShuffle.cs new file mode 100644 index 000000000..e354a57b0 --- /dev/null +++ b/src/ImageSharp/Common/Helpers/IComponentShuffle.cs @@ -0,0 +1,165 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using System.Buffers.Binary; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace SixLabors.ImageSharp +{ + /// + /// Defines the contract for methods that allow the shuffling of pixel components. + /// Used for shuffling on platforms that do not support Hardware Intrinsics. + /// + internal interface IComponentShuffle + { + /// + /// Gets the shuffle control. + /// + byte Control { get; } + + /// + /// Shuffle 8-bit integers within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. 
+ void RunFallbackShuffle(ReadOnlySpan source, Span dest); + } + + internal readonly struct DefaultShuffle4 : IComponentShuffle + { + public DefaultShuffle4(byte p3, byte p2, byte p1, byte p0) + : this(SimdUtils.Shuffle.MmShuffle(p3, p2, p1, p0)) + { + } + + public DefaultShuffle4(byte control) => this.Control = control; + + public byte Control { get; } + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + SimdUtils.Shuffle.InverseMmShuffle( + this.Control, + out int p3, + out int p2, + out int p1, + out int p0); + + for (int i = 0; i < source.Length; i += 4) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); + Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + } + } + } + + internal readonly struct WXYZShuffle4 : IComponentShuffle + { + public byte Control => SimdUtils.Shuffle.MmShuffle(2, 1, 0, 3); + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + // The JIT can detect and optimize rotation idioms ROTL (Rotate Left) + // and ROTR (Rotate Right) emitting efficient CPU instructions: + // https://github.com/dotnet/coreclr/pull/1830 + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // ROTL(8, packed) = [Z Y X W] + Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24); + } + } + } + + internal readonly struct WZYXShuffle4 : IComponentShuffle + { + public byte Control => SimdUtils.Shuffle.MmShuffle(0, 1, 2, 3); + + 
[MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // REVERSE(packed) = [X Y Z W] + Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed); + } + } + } + + internal readonly struct YZWXShuffle4 : IComponentShuffle + { + public byte Control => SimdUtils.Shuffle.MmShuffle(0, 3, 2, 1); + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // ROTR(8, packed) = [X W Z Y] + Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24); + } + } + } + + internal readonly struct ZYXWShuffle4 : IComponentShuffle + { + public byte Control => SimdUtils.Shuffle.MmShuffle(3, 0, 1, 2); + + [MethodImpl(InliningOptions.ShortMethod)] + public void RunFallbackShuffle(ReadOnlySpan source, Span dest) + { + ReadOnlySpan s = MemoryMarshal.Cast(source); + Span d = MemoryMarshal.Cast(dest); + ref uint sBase = ref MemoryMarshal.GetReference(s); + ref uint dBase = ref MemoryMarshal.GetReference(d); + + for (int i = 0; i < s.Length; i++) + { + uint packed = Unsafe.Add(ref sBase, i); + + // packed = [W Z Y X] + // tmp1 = [W 0 Y 0] + // tmp2 = [0 Z 0 X] + // tmp3=ROTL(16, tmp2) = [0 X 0 Z] + // tmp1 + tmp3 = [W X Y Z] + uint tmp1 = packed & 0xFF00FF00; + uint tmp2 = packed & 0x00FF00FF; + uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); + + Unsafe.Add(ref dBase, i) =
tmp1 + tmp3; + } + } + } +} diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 59b625419..febb31c2f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Buffers.Binary; using System.Diagnostics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -40,34 +39,32 @@ namespace SixLabors.ImageSharp } /// - /// Shuffle 8-bit integers in a within 128-bit lanes in + /// Shuffle 8-bit integers within 128-bit lanes in /// using the control and store the results in . /// /// The source span of bytes. /// The destination span of bytes. - /// The byte control. + /// The type of shuffle to perform. [MethodImpl(InliningOptions.ShortMethod)] - public static void Shuffle4Channel( + public static void Shuffle4Channel( ReadOnlySpan source, Span dest, - byte control) + TShuffle shuffle) + where TShuffle : struct, IComponentShuffle { VerifyShuffleSpanInput(source, dest); - // TODO: There doesn't seem to be any APIs for - // System.Numerics that allow shuffling. 
#if SUPPORTS_RUNTIME_INTRINSICS - HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control); + HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, shuffle.Control); #endif // Deal with the remainder: if (source.Length > 0) { - ShuffleRemainder4Channel(source, dest, control); + shuffle.RunFallbackShuffle(source, dest); } } - [MethodImpl(InliningOptions.ColdPath)] public static void ShuffleRemainder4Channel( ReadOnlySpan source, Span dest, @@ -86,125 +83,6 @@ namespace SixLabors.ImageSharp } } - [MethodImpl(InliningOptions.ColdPath)] - public static void ShuffleRemainder4Channel( - ReadOnlySpan source, - Span dest, - byte control) - { -#if NETCOREAPP - // The JIT can detect and optimize rotation idioms ROTL (Rotate Left) - // and ROTR (Rotate Right) emitting efficient CPU instructions: - // https://github.com/dotnet/coreclr/pull/1830 - switch (control) - { - case Shuffle.WXYZ: - WXYZ(source, dest); - return; - case Shuffle.WZYX: - WZYX(source, dest); - return; - case Shuffle.YZWX: - YZWX(source, dest); - return; - case Shuffle.ZYXW: - ZYXW(source, dest); - return; - } -#endif - - ref byte sBase = ref MemoryMarshal.GetReference(source); - ref byte dBase = ref MemoryMarshal.GetReference(dest); - Shuffle.InverseMmShuffle(control, out int p3, out int p2, out int p1, out int p0); - - for (int i = 0; i < source.Length; i += 4) - { - Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); - Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static void WXYZ(ReadOnlySpan source, Span dest) - { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); - - for (int i = 0; i < s.Length; i++) - { - uint packed = Unsafe.Add(ref sBase, 
i); - - // packed = [W Z Y X] - // ROTL(8, packed) = [Z Y X W] - Unsafe.Add(ref dBase, i) = (packed << 8) | (packed >> 24); - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static void ZYXW(ReadOnlySpan source, Span dest) - { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); - - for (int i = 0; i < s.Length; i++) - { - uint packed = Unsafe.Add(ref sBase, i); - - // packed = [W Z Y X] - // tmp1 = [W 0 Y 0] - // tmp2 = [0 Z 0 X] - // tmp3=ROTL(16, tmp2) = [0 X 0 Z] - // tmp1 + tmp3 = [W X Y Z] - uint tmp1 = packed & 0xFF00FF00; - uint tmp2 = packed & 0x00FF00FF; - uint tmp3 = (tmp2 << 16) | (tmp2 >> 16); - - Unsafe.Add(ref dBase, i) = tmp1 + tmp3; - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static void WZYX(ReadOnlySpan source, Span dest) - { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); - - for (int i = 0; i < s.Length; i++) - { - uint packed = Unsafe.Add(ref sBase, i); - - // packed = [W Z Y X] - // REVERSE(packedArgb) = [X Y Z W] - Unsafe.Add(ref dBase, i) = BinaryPrimitives.ReverseEndianness(packed); - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static void YZWX(ReadOnlySpan source, Span dest) - { - ReadOnlySpan s = MemoryMarshal.Cast(source); - Span d = MemoryMarshal.Cast(dest); - ref uint sBase = ref MemoryMarshal.GetReference(s); - ref uint dBase = ref MemoryMarshal.GetReference(d); - - for (int i = 0; i < s.Length; i++) - { - uint packed = Unsafe.Add(ref sBase, i); - - // packed = [W Z Y X] - // ROTR(8, packedArgb) = [Y Z W X] - Unsafe.Add(ref dBase, i) = (packed >> 8) | (packed << 24); - } - } - [Conditional("DEBUG")] private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) where T : struct @@ -222,12 
+100,6 @@ namespace SixLabors.ImageSharp public static class Shuffle { - public const byte WXYZ = (2 << 6) | (1 << 4) | (0 << 2) | 3; - public const byte WZYX = (0 << 6) | (1 << 4) | (2 << 2) | 3; - public const byte XYZW = (3 << 6) | (2 << 4) | (1 << 2) | 0; - public const byte YZWX = (0 << 6) | (3 << 4) | (2 << 2) | 1; - public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; - [MethodImpl(InliningOptions.ShortMethod)] public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0) => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0); diff --git a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs index bc24258c9..ab9011a5c 100644 --- a/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs +++ b/src/ImageSharp/PixelFormats/Utils/PixelConverter.cs @@ -2,7 +2,6 @@ // Licensed under the Apache License, Version 2.0. using System; -using System.Buffers.Binary; using System.Runtime.CompilerServices; namespace SixLabors.ImageSharp.PixelFormats.Utils @@ -28,7 +27,7 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToArgb32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WXYZ); + => SimdUtils.Shuffle4Channel(source, dest, default); /// /// Converts a representing a collection of @@ -37,7 +36,7 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToBgra32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.ZYXW); + => SimdUtils.Shuffle4Channel(source, dest, default); } public static class FromArgb32 @@ -49,7 +48,7 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToRgba32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.YZWX); + => SimdUtils.Shuffle4Channel(source, dest, default); /// 
/// Converts a representing a collection of @@ -58,7 +57,7 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToBgra32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WZYX); + => SimdUtils.Shuffle4Channel(source, dest, default); } public static class FromBgra32 @@ -70,7 +69,7 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToArgb32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.WZYX); + => SimdUtils.Shuffle4Channel(source, dest, default); /// /// Converts a representing a collection of @@ -79,7 +78,7 @@ namespace SixLabors.ImageSharp.PixelFormats.Utils /// [MethodImpl(InliningOptions.ShortMethod)] public static void ToRgba32(ReadOnlySpan source, Span dest) - => SimdUtils.Shuffle4Channel(source, dest, SimdUtils.Shuffle.ZYXW); + => SimdUtils.Shuffle4Channel(source, dest, default); } } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs index c45b103e3..bd4a8d534 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs @@ -26,7 +26,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ); + SimdUtils.Shuffle4Channel(this.source, this.destination, default); } } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs index 36b9591d9..04c6dbf21 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs @@ -26,7 +26,7 @@ namespace 
SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4Channel(this.source, this.destination, SimdUtils.Shuffle.WXYZ); + SimdUtils.Shuffle4Channel(this.source, this.destination, default(WXYZShuffle4).Control); } } diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index e8a06bf24..eacd36799 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -58,6 +58,12 @@ namespace SixLabors.ImageSharp.Benchmarks { public HwIntrinsics_SSE_AVX() { + this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) + .WithEnvironmentVariables( + new EnvironmentVariable(EnableHWIntrinsic, Off), + new EnvironmentVariable(FeatureSIMD, Off)) + .WithId("No HwIntrinsics")); + #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { @@ -72,11 +78,6 @@ namespace SixLabors.ImageSharp.Benchmarks .WithId("SSE")); } #endif - this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithEnvironmentVariables( - new EnvironmentVariable(EnableHWIntrinsic, Off), - new EnvironmentVariable(FeatureSIMD, Off)) - .WithId("No HwIntrinsics")); } } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 94298f94c..06f61e617 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -9,66 +9,91 @@ namespace SixLabors.ImageSharp.Tests.Common { public partial class SimdUtilsTests { - public static readonly TheoryData ShuffleControls = - new TheoryData - { - SimdUtils.Shuffle.WXYZ, - SimdUtils.Shuffle.WZYX, - SimdUtils.Shuffle.XYZW, - SimdUtils.Shuffle.YZWX, - SimdUtils.Shuffle.ZYXW, - SimdUtils.Shuffle.MmShuffle(2, 1, 3, 0), - SimdUtils.Shuffle.MmShuffle(1, 1, 1, 1), - SimdUtils.Shuffle.MmShuffle(3, 3, 3, 3) - }; - [Theory] - [MemberData(nameof(ShuffleControls))] - public void 
BulkShuffleFloat4Channel(byte control) + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void BulkShuffleFloat4Channel(int count) { static void RunTest(string serialized) { - byte ctrl = FeatureTestRunner.Deserialize(serialized); - foreach (var item in ArraySizesDivisibleBy4) - { - foreach (var count in item) - { - TestShuffleFloat4Channel( - (int)count, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl), - ctrl); - } - } + // No need to test multiple shuffle controls as the + // pipeline is always the same. + int size = FeatureTestRunner.Deserialize(serialized); + byte control = default(WZYXShuffle4).Control; + + TestShuffleFloat4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, control), + control); } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - control, + count, HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } [Theory] - [MemberData(nameof(ShuffleControls))] - public void BulkShuffleByte4Channel(byte control) + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void BulkShuffleByte4Channel(int count) { static void RunTest(string serialized) { - byte ctrl = FeatureTestRunner.Deserialize(serialized); + int size = FeatureTestRunner.Deserialize(serialized); foreach (var item in ArraySizesDivisibleBy4) { + // These cannot be expressed as a theory as you cannot + // use RemoteExecutor within generic methods nor pass + // IComponentShuffle to the generic utils method. 
foreach (var count in item) { + WXYZShuffle4 wxyz = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wxyz), + wxyz.Control); + + WZYXShuffle4 wzyx = default; TestShuffleByte4Channel( - (int)count, - (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, ctrl), - ctrl); + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wzyx), + wzyx.Control); + + YZWXShuffle4 yzwx = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, yzwx), + yzwx.Control); + + ZYXWShuffle4 zyxw = default; + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, zyxw), + zyxw.Control); + + var xwyz = new DefaultShuffle4(2, 1, 3, 0); + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, xwyz), + xwyz.Control); + + var yyyy = new DefaultShuffle4(1, 1, 1, 1); + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, yyyy), + yyyy.Control); + + var wwww = new DefaultShuffle4(3, 3, 3, 3); + TestShuffleByte4Channel( + size, + (s, d) => SimdUtils.Shuffle4Channel(s.Span, d.Span, wwww), + wwww.Control); } } } FeatureTestRunner.RunWithHwIntrinsicsFeature( RunTest, - control, + count, HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableSSE); } From cdc1c0fce57544bae85a4f9766fcb3403976ed1a Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Thu, 29 Oct 2020 01:48:46 +0000 Subject: [PATCH 9/9] Fix benchmarks, cleanup. 
--- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 24 ++------ .../Common/Helpers/SimdUtils.Shuffle.cs | 2 - .../Color/Bulk/ShuffleByte4Channel.cs | 57 +++++++++--------- .../Color/Bulk/ShuffleFloat4Channel.cs | 60 +++++++++---------- .../Config.HwIntrinsics.cs | 6 +- 5 files changed, 67 insertions(+), 82 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 367df03ec..782328edd 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -33,15 +33,9 @@ namespace SixLabors.ImageSharp { if (Avx.IsSupported || Sse.IsSupported) { - int remainder; - if (Avx.IsSupported) - { - remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); - } - else - { - remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); - } + int remainder = Avx.IsSupported + ? ImageMaths.ModuloP2(source.Length, Vector256.Count) + : ImageMaths.ModuloP2(source.Length, Vector128.Count); int adjustedCount = source.Length - remainder; @@ -73,15 +67,9 @@ namespace SixLabors.ImageSharp { if (Avx2.IsSupported || Ssse3.IsSupported) { - int remainder; - if (Avx2.IsSupported) - { - remainder = ImageMaths.ModuloP2(source.Length, Vector256.Count); - } - else - { - remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); - } + int remainder = Avx2.IsSupported + ? 
ImageMaths.ModuloP2(source.Length, Vector256.Count) + : ImageMaths.ModuloP2(source.Length, Vector128.Count); int adjustedCount = source.Length - remainder; diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index febb31c2f..a4a40fb4f 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -25,8 +25,6 @@ namespace SixLabors.ImageSharp { VerifyShuffleSpanInput(source, dest); - // TODO: There doesn't seem to be any APIs for - // System.Numerics that allow shuffling. #if SUPPORTS_RUNTIME_INTRINSICS HwIntrinsics.Shuffle4ChannelReduce(ref source, ref dest, control); #endif diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs index bd4a8d534..749859eac 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleByte4Channel.cs @@ -30,39 +30,38 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk } } - // 2020-10-26 + // 2020-10-29 // ########## // // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) - // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores - // .NET Core SDK = 5.0.100-rc.2.20479.15 - // - // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT - // AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT - // No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT - // SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.403 + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 1. 
No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------- |---------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Shuffle4Channel | AVX | Empty | 128 | 20.51 ns | 0.270 ns | 0.211 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.00 ns | 0.991 ns | 0.927 ns | 3.08 | 0.06 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 17.25 ns | 0.066 ns | 0.058 ns | 0.84 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 256 | 24.57 ns | 0.248 ns | 0.219 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 124.55 ns | 2.501 ns | 2.456 ns | 5.06 | 0.10 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 21.80 ns | 0.094 ns | 0.088 ns | 0.89 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 512 | 28.51 ns | 0.130 ns | 0.115 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 256.52 ns | 1.424 ns | 1.332 ns | 9.00 | 0.07 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 29.72 ns | 0.217 ns | 0.203 ns | 1.04 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 1024 | 36.40 ns | 0.357 ns | 0.334 ns | 1.00 | 
0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 492.71 ns | 1.498 ns | 1.251 ns | 13.52 | 0.12 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 44.71 ns | 0.264 ns | 0.234 ns | 1.23 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 2048 | 59.38 ns | 0.180 ns | 0.159 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 975.05 ns | 2.043 ns | 1.811 ns | 16.42 | 0.05 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 81.83 ns | 0.212 ns | 0.198 ns | 1.38 | 0.01 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. 
No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - | + // | Shuffle4Channel | 3. 
SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - | } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs index 04c6dbf21..6f5b5001b 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs @@ -10,6 +10,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class ShuffleFloat4Channel { + private static readonly byte control = default(WXYZShuffle4).Control; private float[] source; private float[] destination; @@ -26,43 +27,42 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4Channel(this.source, this.destination, default(WXYZShuffle4).Control); + SimdUtils.Shuffle4Channel(this.source, this.destination, control); } } - // 2020-10-26 + // 2020-10-29 // ########## // // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) - // Intel Core i7-8650U CPU 1.90GHz(Kaby Lake R), 1 CPU, 8 logical and 4 physical cores - // .NET Core SDK = 5.0.100-rc.2.20479.15 - // - // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT - // AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT - // No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT - // SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.403 + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 2. 
AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------- |---------------- |-------------------------------------------------- |------ |------------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Shuffle4Channel | AVX | Empty | 128 | 14.49 ns | 0.244 ns | 0.217 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 87.74 ns | 0.524 ns | 0.490 ns | 6.06 | 0.09 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 128 | 23.65 ns | 0.101 ns | 0.094 ns | 1.63 | 0.03 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 256 | 25.87 ns | 0.492 ns | 0.673 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 159.52 ns | 0.901 ns | 0.843 ns | 6.12 | 0.12 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 256 | 45.47 ns | 0.404 ns | 0.378 ns | 1.75 | 0.03 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 512 | 49.51 ns | 0.088 ns | 0.083 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 297.96 ns | 0.926 ns | 0.821 ns | 6.02 | 0.02 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 512 | 90.77 ns | 0.191 ns | 0.169 ns | 1.83 | 0.00 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 1024 | 113.09 ns | 1.913 ns | 3.090 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | 
COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 604.58 ns | 1.464 ns | 1.298 ns | 5.29 | 0.18 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 1024 | 179.44 ns | 0.208 ns | 0.184 ns | 1.57 | 0.05 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | AVX | Empty | 2048 | 217.95 ns | 1.314 ns | 1.165 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 1,152.04 ns | 3.941 ns | 3.494 ns | 5.29 | 0.03 | - | - | - | - | - // | Shuffle4Channel | SSE | COMPlus_EnableAVX=0 | 2048 | 349.52 ns | 0.587 ns | 0.520 ns | 1.60 | 0.01 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |------------------- |-------------------------------------------------- |------ |-----------:|----------:|----------:|------:|------:|------:|------:|----------:| + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 63.647 ns | 0.5475 ns | 0.4853 ns | 1.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 128 | 9.818 ns | 0.1457 ns | 0.1292 ns | 0.15 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 15.267 ns | 0.1005 ns | 0.0940 ns | 0.24 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 125.586 ns | 1.9312 ns | 1.8064 ns | 1.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 256 | 15.878 ns | 0.1983 ns | 0.1758 ns | 0.13 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 29.170 ns | 0.2925 ns | 0.2442 ns | 0.23 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Channel | 1. 
No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 263.859 ns | 2.6660 ns | 2.3634 ns | 1.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 512 | 29.452 ns | 0.3334 ns | 0.3118 ns | 0.11 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 52.912 ns | 0.1932 ns | 0.1713 ns | 0.20 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 495.717 ns | 1.9850 ns | 1.8567 ns | 1.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 1024 | 53.757 ns | 0.3212 ns | 0.2847 ns | 0.11 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 107.815 ns | 1.6201 ns | 1.3528 ns | 0.22 | - | - | - | - | + // | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 980.134 ns | 3.7407 ns | 3.1237 ns | 1.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 2048 | 105.120 ns | 0.6140 ns | 0.5443 ns | 0.11 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 216.473 ns | 2.3268 ns | 2.0627 ns | 0.22 | - | - | - | - | } diff --git a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs index eacd36799..5ceb4c8a0 100644 --- a/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs +++ b/tests/ImageSharp.Benchmarks/Config.HwIntrinsics.cs @@ -62,20 +62,20 @@ namespace SixLabors.ImageSharp.Benchmarks .WithEnvironmentVariables( new EnvironmentVariable(EnableHWIntrinsic, Off), new EnvironmentVariable(FeatureSIMD, Off)) - .WithId("No HwIntrinsics")); + .WithId("1. No HwIntrinsics").AsBaseline()); #if SUPPORTS_RUNTIME_INTRINSICS if (Avx.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) - .WithId("AVX").AsBaseline()); + .WithId("2. 
AVX")); } if (Sse.IsSupported) { this.AddJob(Job.Default.WithRuntime(CoreRuntime.Core31) .WithEnvironmentVariables(new EnvironmentVariable(EnableAVX, Off)) - .WithId("SSE")); + .WithId("3. SSE")); } #endif }