diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 8a0b5460c..29b569a80 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -119,6 +119,39 @@ namespace SixLabors.ImageSharp } } + /// + /// Shuffles then slices 8-bit integers within 128-bit lanes in + /// using the control and store the results in . + /// + /// The source span of bytes. + /// The destination span of bytes. + /// The byte control. + [MethodImpl(InliningOptions.ShortMethod)] + public static unsafe void Shuffle4Slice3Reduce( + ref ReadOnlySpan source, + ref Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + int remainder = ImageMaths.ModuloP2(dest.Length, Vector128.Count); + + int adjustedCount = dest.Length - remainder; + int destSlice = (int)(adjustedCount * (3 / 4F)); + + if (adjustedCount > 0) + { + Shuffle4Slice3( + source.Slice(0, adjustedCount), + dest.Slice(0, adjustedCount), + control); + + source = source.Slice(adjustedCount); + dest = dest.Slice(destSlice); + } + } + } + [MethodImpl(InliningOptions.ShortMethod)] private static void Shuffle4( ReadOnlySpan source, @@ -316,6 +349,50 @@ namespace SixLabors.ImageSharp } } + [MethodImpl(InliningOptions.ShortMethod)] + private static unsafe void Shuffle4Slice3( + ReadOnlySpan source, + Span dest, + byte control) + { + if (Ssse3.IsSupported) + { + Vector128 sliceMask = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1).AsByte(); + + Span bytes = stackalloc byte[Vector128.Count]; + Shuffle.MmShuffleSpan(ref bytes, control); + Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + + // var control = MmShuffle(3, 0, 1, 2); + // Span bytes = stackalloc byte[Vector128.Count]; + // MmShuffleSpan(ref bytes, control); + // Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + // + // Vector128 s0 = Vector128.Create((byte)1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 1).Dump("s0"); + // Vector128 padded = Ssse3.Shuffle(s0, padMask).Dump("padded"); + // + // padded = Sse3.Or(Vector128.Create(0xff000000u).AsByte(), padded).Dump("0r"); + // + // var shuffled = Ssse3.Shuffle(padded, vcm).Dump("shuffled"); + // var d0 = Ssse3.Shuffle(shuffled, sliceMask).Dump("d0"); + fixed (byte* sBase = &source.GetPinnableReference()) + fixed (byte* dBase = &dest.GetPinnableReference()) + { + byte* s = sBase; + byte* d = dBase; + + for (int i = 0; i < source.Length; i += 16) + { + Vector128 vs0 = Ssse3.Shuffle(Sse2.LoadVector128(s), vcm); + Sse2.Store(d, Ssse3.Shuffle(vs0, sliceMask)); + + s += 16; + d += 12; + } + } + } + } + /// /// Performs a multiplication and an addition of the . /// diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 81d77d655..f3946361b 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -69,7 +69,7 @@ namespace SixLabors.ImageSharp Span dest, byte control) { - VerifyPadShuffleSpanInput(source, dest); + VerifyPad3Shuffle4SpanInput(source, dest); #if SUPPORTS_RUNTIME_INTRINSICS HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, control); @@ -82,6 +82,25 @@ namespace SixLabors.ImageSharp } } + [MethodImpl(InliningOptions.ShortMethod)] + public static void Shuffle4Slice3( + ReadOnlySpan source, + Span dest, + byte control) + { + VerifyShuffle4Slice3SpanInput(source, dest); + +#if SUPPORTS_RUNTIME_INTRINSICS + HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, control); +#endif + + // Deal with the remainder: + if (source.Length > 0) + { + Shuffle4Slice3Remainder(source, dest, control); + } + } + public static void Shuffle4Remainder( ReadOnlySpan source, Span dest, @@ -118,6 +137,23 @@ namespace SixLabors.ImageSharp } } + public static void Shuffle4Slice3Remainder( + ReadOnlySpan source, + Span dest, + byte control) + { + ref byte sBase = ref MemoryMarshal.GetReference(source); + ref byte dBase = ref MemoryMarshal.GetReference(dest); + Shuffle.InverseMmShuffle(control, out int _, out int p2, out int p1, out int p0); + + for (int i = 0, j = 0; i < dest.Length; i += 3, j += 4) + { + Unsafe.Add(ref dBase, i) = Unsafe.Add(ref sBase, p0 + j); + Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); + Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); + } + } + [Conditional("DEBUG")] private static void VerifyShuffleSpanInput(ReadOnlySpan source, Span dest) where T : struct @@ -130,21 +166,45 @@ namespace SixLabors.ImageSharp DebugGuard.IsTrue( source.Length % 4 == 0, nameof(source), - "Input spans must be divisiable by 4!"); + "Input spans must be divisable by 4!"); } [Conditional("DEBUG")] - private static void VerifyPadShuffleSpanInput(ReadOnlySpan source, Span dest) + private static void VerifyPad3Shuffle4SpanInput(ReadOnlySpan source, Span dest) { + DebugGuard.IsTrue( + source.Length % 3 == 0, + nameof(source), + "Input span must be divisable by 3!"); + + DebugGuard.IsTrue( + dest.Length % 4 == 0, + nameof(dest), + "Output span must be divisable by 4!"); + DebugGuard.IsTrue( source.Length == (int)(dest.Length * 3 / 4F), nameof(source), - "Input spans must be 3/4 the length of the output span!"); + "Input span must be 3/4 the length of the output span!"); + } + [Conditional("DEBUG")] + private static void VerifyShuffle4Slice3SpanInput(ReadOnlySpan source, Span dest) + { DebugGuard.IsTrue( - source.Length % 3 == 0, + source.Length % 4 == 0, + nameof(source), + "Input span must be divisable by 4!"); + + DebugGuard.IsTrue( + dest.Length % 3 == 0, + nameof(dest), + "Output span must be divisable by 3!"); + + DebugGuard.IsTrue( + source.Length == (int)(dest.Length * 4 / 3F), nameof(source), - "Input spans must be divisiable by 3!"); + "Output span must be 3/4 the length of the input span!"); } public static class Shuffle diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs index c529b2af1..8286fea0e 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs @@ -9,6 +9,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class Pad3Shuffle4Channel { + private static readonly byte Control = default(WXYZShuffle4).Control; private byte[] source; private byte[] destination; @@ -17,20 +18,20 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk { this.source = new byte[this.Count]; new Random(this.Count).NextBytes(this.source); - this.destination = new byte[this.Count]; + this.destination = new byte[(int)(this.Count * (4 / 3F))]; } [Params(96, 384, 768, 1536)] public int Count { get; set; } [Benchmark] - public void Shuffle4Channel() + public void Pad3Shuffle4() { - SimdUtils.Shuffle4(this.source, this.destination, default); + SimdUtils.Pad3Shuffle4(this.source, this.destination, Control); } } - // 2020-10-29 + // 2020-10-30 // ########## // // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) @@ -43,25 +44,21 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 17.39 ns | 0.187 ns | 0.175 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 128 | 21.72 ns | 0.299 ns | 0.279 ns | 1.25 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 18.10 ns | 0.346 ns | 0.289 ns | 1.04 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 35.51 ns | 0.711 ns | 0.790 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 256 | 23.90 ns | 0.508 ns | 0.820 ns | 0.69 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 20.40 ns | 0.133 ns | 0.111 ns | 0.57 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 73.39 ns | 0.310 ns | 0.259 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 512 | 26.10 ns | 0.418 ns | 0.391 ns | 0.36 | 0.01 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 27.59 ns | 0.556 ns | 0.571 ns | 0.38 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 150.64 ns | 2.903 ns | 2.716 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 1024 | 38.67 ns | 0.801 ns | 1.889 ns | 0.24 | 0.02 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 47.13 ns | 0.948 ns | 1.054 ns | 0.31 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 315.29 ns | 5.206 ns | 6.583 ns | 1.00 | 0.00 | - | - | - | - | - // | Shuffle4Channel | 2. AVX | Empty | 2048 | 57.37 ns | 1.152 ns | 1.078 ns | 0.18 | 0.01 | - | - | - | - | - // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 65.75 ns | 1.198 ns | 1.600 ns | 0.21 | 0.01 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 62.91 ns | 1.240 ns | 1.569 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 44.34 ns | 0.371 ns | 0.329 ns | 0.70 | 0.02 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 44.46 ns | 0.617 ns | 0.515 ns | 0.70 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 247.93 ns | 2.640 ns | 2.470 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 92.91 ns | 1.204 ns | 1.127 ns | 0.37 | 0.01 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 91.42 ns | 1.234 ns | 1.094 ns | 0.37 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 444.79 ns | 5.094 ns | 4.254 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 162.92 ns | 1.046 ns | 0.873 ns | 0.37 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 166.22 ns | 1.728 ns | 1.443 ns | 0.37 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 882.51 ns | 6.936 ns | 5.792 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 309.72 ns | 3.777 ns | 3.533 ns | 0.35 | 0.01 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 323.18 ns | 4.079 ns | 3.816 ns | 0.37 | 0.00 | - | - | - | - | } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs new file mode 100644 index 000000000..b64379959 --- /dev/null +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Shuffle4Slice3Channel.cs @@ -0,0 +1,68 @@ +// Copyright (c) Six Labors. +// Licensed under the Apache License, Version 2.0. + +using System; +using BenchmarkDotNet.Attributes; + +namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk +{ + [Config(typeof(Config.HwIntrinsics_SSE_AVX))] + public class Shuffle4Slice3Channel + { + private static readonly byte Control = default(WXYZShuffle4).Control; + private byte[] source; + private byte[] destination; + + [GlobalSetup] + public void Setup() + { + this.source = new byte[this.Count]; + new Random(this.Count).NextBytes(this.source); + this.destination = new byte[(int)(this.Count * (3 / 4F))]; + } + + [Params(128, 256, 512, 1024, 2048)] + public int Count { get; set; } + + [Benchmark] + public void Shuffle4Slice3() + { + SimdUtils.Shuffle4Slice3(this.source, this.destination, Control); + } + } + + // 2020-10-29 + // ########## + // + // BenchmarkDotNet=v0.12.1, OS=Windows 10.0.19041.572 (2004/?/20H1) + // Intel Core i7-8650U CPU 1.90GHz (Kaby Lake R), 1 CPU, 8 logical and 4 physical cores + // .NET Core SDK=3.1.403 + // [Host] : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 1. No HwIntrinsics : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 2. AVX : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // 3. SSE : .NET Core 3.1.9 (CoreCLR 4.700.20.47201, CoreFX 4.700.20.47203), X64 RyuJIT + // + // Runtime=.NET Core 3.1 + // + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Median | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |---------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|----------:|------:|--------:|------:|------:|------:|----------:| + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 128 | 50.09 ns | 1.018 ns | 1.460 ns | 49.16 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 128 | 35.28 ns | 0.106 ns | 0.089 ns | 35.30 ns | 0.69 | 0.02 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 128 | 35.13 ns | 0.247 ns | 0.231 ns | 35.22 ns | 0.69 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 256 | 101.48 ns | 0.875 ns | 0.819 ns | 101.60 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 256 | 53.25 ns | 0.518 ns | 0.433 ns | 53.21 ns | 0.52 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 256 | 57.21 ns | 0.508 ns | 0.451 ns | 57.38 ns | 0.56 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 512 | 202.53 ns | 0.884 ns | 0.827 ns | 202.40 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 512 | 82.55 ns | 0.418 ns | 0.391 ns | 82.59 ns | 0.41 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 512 | 82.89 ns | 1.057 ns | 0.989 ns | 82.48 ns | 0.41 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1024 | 398.79 ns | 7.807 ns | 6.921 ns | 395.67 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 1024 | 144.51 ns | 1.033 ns | 0.966 ns | 144.42 ns | 0.36 | 0.01 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 1024 | 143.77 ns | 0.820 ns | 0.684 ns | 143.62 ns | 0.36 | 0.01 | - | - | - | - | + // | | | | | | | | | | | | | | | + // | Shuffle4Channel | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 2048 | 798.44 ns | 4.447 ns | 3.472 ns | 799.39 ns | 1.00 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 2. AVX | Empty | 2048 | 277.12 ns | 1.723 ns | 1.612 ns | 276.93 ns | 0.35 | 0.00 | - | - | - | - | + // | Shuffle4Channel | 3. SSE | COMPlus_EnableAVX=0 | 2048 | 275.70 ns | 1.796 ns | 1.500 ns | 275.51 ns | 0.35 | 0.00 | - | - | - | - || +} diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs index 4a2512fea..86b1f766e 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/ShuffleFloat4Channel.cs @@ -10,7 +10,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Config(typeof(Config.HwIntrinsics_SSE_AVX))] public class ShuffleFloat4Channel { - private static readonly byte control = default(WXYZShuffle4).Control; + private static readonly byte Control = default(WXYZShuffle4).Control; private float[] source; private float[] destination; @@ -27,7 +27,7 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk [Benchmark] public void Shuffle4Channel() { - SimdUtils.Shuffle4(this.source, this.destination, control); + SimdUtils.Shuffle4(this.source, this.destination, Control); } } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index 1c456e5a2..f801cd28b 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -120,6 +120,29 @@ namespace SixLabors.ImageSharp.Tests.Common HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); } + [Theory] + [MemberData(nameof(ArraySizesDivisibleBy4))] + public void BulkShuffle4Slice3Channel(int count) + { + static void RunTest(string serialized) + { + // No need to test multiple shuffle controls as the + // pipeline is always the same. + int size = FeatureTestRunner.Deserialize(serialized); + byte control = default(WZYXShuffle4).Control; + + TestShuffle4Slice3Channel( + size, + (s, d) => SimdUtils.Shuffle4Slice3(s.Span, d.Span, control), + control); + } + + FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunTest, + count, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX | HwIntrinsics.DisableSSE); + } + private static void TestShuffleFloat4Channel( int count, Action, Memory> convert, @@ -210,6 +233,47 @@ namespace SixLabors.ImageSharp.Tests.Common convert(source, result); + for (int i = 0; i < expected.Length; i++) + { + Assert.Equal(expected[i], result[i]); + } + + Assert.Equal(expected, result); + } + + private static void TestShuffle4Slice3Channel( + int count, + Action, Memory> convert, + byte control) + { + byte[] source = new byte[count]; + new Random(count).NextBytes(source); + + var result = new byte[(int)(count * (3 / 4F))]; + + byte[] expected = new byte[result.Length]; + + SimdUtils.Shuffle.InverseMmShuffle( + control, + out int _, + out int p2, + out int p1, + out int p0); + + for (int i = 0, j = 0; i < expected.Length; i += 3, j += 4) + { + expected[i] = source[p0 + j]; + expected[i + 1] = source[p1 + j]; + expected[i + 2] = source[p2 + j]; + } + + convert(source, result); + + for (int i = 0; i < expected.Length; i++) + { + Assert.Equal(expected[i], result[i]); + } + Assert.Equal(expected, result); } }