diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index d6e45026b7..5083a3c03d 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -101,20 +101,20 @@ namespace SixLabors.ImageSharp { if (Ssse3.IsSupported) { - int remainder = ImageMaths.ModuloP2(source.Length, Vector128.Count); + int remainder = source.Length % (Vector128.Count * 3); - int adjustedCount = source.Length - remainder; - int sourceSlice = (int)(adjustedCount * (3 / 4F)); + int sourceCount = source.Length - remainder; + int destCount = (int)(sourceCount * (4 / 3D)); - if (adjustedCount > 0) + if (sourceCount > 0) { Pad3Shuffle4( - source.Slice(0, adjustedCount), - dest.Slice(0, adjustedCount), + source.Slice(0, sourceCount), + dest.Slice(0, destCount), control); - source = source.Slice(sourceSlice); - dest = dest.Slice(adjustedCount); + source = source.Slice(sourceCount); + dest = dest.Slice(destCount); } } } @@ -320,31 +320,37 @@ namespace SixLabors.ImageSharp { if (Ssse3.IsSupported) { - Vector128 wMask = Vector128.Create(0xff000000u).AsByte(); - Vector128 padMask = Vector128.Create(0, 1, 2, -1, 3, 4, 5, -1, 6, 7, 8, -1, 9, 10, 11, -1).AsByte(); + Vector128 vmask = Vector128.Create(0, 1, 2, 0x80, 3, 4, 5, 0x80, 6, 7, 8, 0x80, 9, 10, 11, 0x80).AsByte(); + Vector128 vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); - fixed (byte* sBase = source) - fixed (byte* dBase = dest) + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); + + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + int n = source.Length / Vector128.Count; + + for (int i = 0, j = 0; i < n; i += 3, j += 4) { - byte* s = sBase; - byte* d = dBase; + ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); + Vector128 v1 = Unsafe.Add(ref v0, 1); + Vector128 v2 = Unsafe.Add(ref v0, 2); + Vector128 v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); - // TODO: Consider unrolling and shuffling 4 at a time using Ssse3.AlignRight - // See https://stackoverflow.com/questions/2973708/fast-24-bit-array-32-bit-array-conversion - for (int i = 0; i < source.Length; i += 16) - { - Vector128 vs0 = Sse2.LoadVector128(s); - Vector128 val = Sse2.Or(wMask, Ssse3.Shuffle(vs0, padMask)); - val = Ssse3.Shuffle(val, vcm); - Sse2.Store(d, val); + v2 = Ssse3.AlignRight(v2, v1, 8); + v1 = Ssse3.AlignRight(v1, v0, 12); - s += 12; - d += 16; - } + ref Vector128 vd = ref Unsafe.Add(ref destBase, j); + + vd = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle); + Unsafe.Add(ref vd, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle); + Unsafe.Add(ref vd, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle); + Unsafe.Add(ref vd, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle); } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index f3946361b1..54ca2a73ec 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp "Output span must be divisable by 4!"); DebugGuard.IsTrue( - source.Length == (int)(dest.Length * 3 / 4F), + source.Length == (int)(dest.Length * 3 / 4D), nameof(source), "Input span must be 3/4 the length of the output span!"); } diff --git a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs index 8286fea0e5..9eb1e109be 100644 --- a/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs +++ b/tests/ImageSharp.Benchmarks/Color/Bulk/Pad3Shuffle4Channel.cs @@ -44,21 +44,21 @@ namespace SixLabors.ImageSharp.Benchmarks.ColorSpaces.Bulk // // Runtime=.NET Core 3.1 // - // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | - // |------------- |------------------- |-------------------------------------------------- |------ |----------:|---------:|---------:|------:|--------:|------:|------:|------:|----------:| - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 62.91 ns | 1.240 ns | 1.569 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 44.34 ns | 0.371 ns | 0.329 ns | 0.70 | 0.02 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 44.46 ns | 0.617 ns | 0.515 ns | 0.70 | 0.02 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 247.93 ns | 2.640 ns | 2.470 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 92.91 ns | 1.204 ns | 1.127 ns | 0.37 | 0.01 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 91.42 ns | 1.234 ns | 1.094 ns | 0.37 | 0.01 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 444.79 ns | 5.094 ns | 4.254 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 162.92 ns | 1.046 ns | 0.873 ns | 0.37 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 166.22 ns | 1.728 ns | 1.443 ns | 0.37 | 0.00 | - | - | - | - | - // | | | | | | | | | | | | | | - // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 882.51 ns | 6.936 ns | 5.792 ns | 1.00 | 0.00 | - | - | - | - | - // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 309.72 ns | 3.777 ns | 3.533 ns | 0.35 | 0.01 | - | - | - | - | - // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 323.18 ns | 4.079 ns | 3.816 ns | 0.37 | 0.00 | - | - | - | - | + // | Method | Job | EnvironmentVariables | Count | Mean | Error | StdDev | Ratio | RatioSD | Gen 0 | Gen 1 | Gen 2 | Allocated | + // |------------- |------------------- |-------------------------------------------------- |------ |----------:|----------:|---------:|------:|--------:|------:|------:|------:|----------:| + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 96 | 71.84 ns | 1.491 ns | 3.146 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 96 | 23.14 ns | 0.186 ns | 0.165 ns | 0.33 | 0.02 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 96 | 22.94 ns | 0.127 ns | 0.112 ns | 0.33 | 0.02 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 384 | 230.90 ns | 0.877 ns | 0.777 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 384 | 39.54 ns | 0.601 ns | 0.533 ns | 0.17 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 384 | 39.88 ns | 0.657 ns | 0.548 ns | 0.17 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 768 | 451.67 ns | 2.932 ns | 2.448 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 768 | 60.79 ns | 1.200 ns | 1.561 ns | 0.13 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 768 | 62.30 ns | 0.469 ns | 0.416 ns | 0.14 | 0.00 | - | - | - | - | + // | | | | | | | | | | | | | | + // | Pad3Shuffle4 | 1. No HwIntrinsics | COMPlus_EnableHWIntrinsic=0,COMPlus_FeatureSIMD=0 | 1536 | 896.11 ns | 10.358 ns | 9.689 ns | 1.00 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 2. AVX | Empty | 1536 | 109.72 ns | 2.149 ns | 2.300 ns | 0.12 | 0.00 | - | - | - | - | + // | Pad3Shuffle4 | 3. SSE | COMPlus_EnableAVX=0 | 1536 | 108.70 ns | 0.994 ns | 0.881 ns | 0.12 | 0.00 | - | - | - | - | } diff --git a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs index f801cd28b5..26f85dd76c 100644 --- a/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs +++ b/tests/ImageSharp.Tests/Common/SimdUtilsTests.Shuffle.cs @@ -212,7 +212,7 @@ namespace SixLabors.ImageSharp.Tests.Common byte[] source = new byte[count]; new Random(count).NextBytes(source); - var result = new byte[(int)(count * (4 / 3F))]; + var result = new byte[(int)(count * (4 / 3D))]; byte[] expected = new byte[result.Length]; @@ -249,7 +249,7 @@ namespace SixLabors.ImageSharp.Tests.Common byte[] source = new byte[count]; new Random(count).NextBytes(source); - var result = new byte[(int)(count * (3 / 4F))]; + var result = new byte[(int)(count * (3 / 4D))]; byte[] expected = new byte[result.Length];