From f462bfe7f02168002984dac415af46f780be6ff7 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Sat, 31 Oct 2020 21:35:23 +0000 Subject: [PATCH] Faster Shuffle4Slice3 --- .../Common/Helpers/SimdUtils.HwIntrinsics.cs | 88 ++++++++++++------- .../Common/Helpers/SimdUtils.Shuffle.cs | 4 +- 2 files changed, 56 insertions(+), 36 deletions(-) diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index 5083a3c03..abda6c4df 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -104,7 +104,7 @@ namespace SixLabors.ImageSharp int remainder = source.Length % (Vector128.Count * 3); int sourceCount = source.Length - remainder; - int destCount = (int)(sourceCount * (4 / 3D)); + int destCount = sourceCount * 4 / 3; if (sourceCount > 0) { @@ -134,20 +134,20 @@ namespace SixLabors.ImageSharp { if (Ssse3.IsSupported) { - int remainder = ImageMaths.ModuloP2(dest.Length, Vector128.Count); + int remainder = source.Length % (Vector128.Count * 4); - int adjustedCount = dest.Length - remainder; - int destSlice = (int)(adjustedCount * (3 / 4F)); + int sourceCount = source.Length - remainder; + int destCount = sourceCount * 3 / 4; - if (adjustedCount > 0) + if (sourceCount > 0) { Shuffle4Slice3( - source.Slice(0, adjustedCount), - dest.Slice(0, adjustedCount), + source.Slice(0, sourceCount), + dest.Slice(0, destCount), control); - source = source.Slice(adjustedCount); - dest = dest.Slice(destSlice); + source = source.Slice(sourceCount); + dest = dest.Slice(destCount); } } } @@ -243,7 +243,7 @@ namespace SixLabors.ImageSharp // We can add static ROS instances if need be in the future. Span bytes = stackalloc byte[Vector256.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector256 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector256 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); @@ -260,17 +260,17 @@ namespace SixLabors.ImageSharp ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i); - vd0 = Avx2.Shuffle(vs0, vcm); - Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vcm); - Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vcm); - Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + vd0 = Avx2.Shuffle(vs0, vshuffle); + Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle); + Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle); + Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle); } if (m > 0) { for (int i = u; i < n; i++) { - Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); } } } @@ -279,7 +279,7 @@ namespace SixLabors.ImageSharp // Ssse3 Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); @@ -296,17 +296,17 @@ namespace SixLabors.ImageSharp ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i); - vd0 = Ssse3.Shuffle(vs0, vcm); - Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vcm); - Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vcm); - Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + vd0 = Ssse3.Shuffle(vs0, vshuffle); + Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle); + Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle); + Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle); } if (m > 0) { for (int i = u; i < n; i++) { - Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); } } } @@ -363,26 +363,46 @@ namespace SixLabors.ImageSharp { if (Ssse3.IsSupported) { - Vector128 sliceMask = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, -1, -1, -1, -1).AsByte(); + Vector128 vmasko = Vector128.Create(0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14, 3, 7, 11, 15).AsByte(); + Vector128 vmaske = Ssse3.AlignRight(vmasko, vmasko, 12).AsByte(); Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); - Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); + + ref Vector128 sourceBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); - fixed (byte* sBase = source) - fixed (byte* dBase = dest) + ref Vector128 destBase = + ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); + + int n = source.Length / Vector128.Count; + + for (int i = 0, j = 0; i < n; i += 4, j += 3) { - byte* s = sBase; - byte* d = dBase; + ref Vector128 v0 = ref Unsafe.Add(ref sourceBase, i); + Vector128 v1 = Unsafe.Add(ref v0, 1); + Vector128 v2 = Unsafe.Add(ref v0, 2); + Vector128 v3 = Unsafe.Add(ref v0, 3); - for (int i = 0; i < source.Length; i += 16) - { - Vector128 vs0 = Ssse3.Shuffle(Sse2.LoadVector128(s), vcm); - Sse2.Store(d, Ssse3.Shuffle(vs0, sliceMask)); + v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vshuffle), vmaske); + v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vshuffle), vmasko); + v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vshuffle), vmaske); + v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko); - s += 16; - d += 12; - } + v0 = Ssse3.AlignRight(v1, v0, 4); + v3 = Ssse3.AlignRight(v3, v2, 12); + + v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); + v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); + + v1 = Ssse3.AlignRight(v2, v1, 8); + + ref Vector128 vd = ref Unsafe.Add(ref destBase, j); + + vd = v0; + Unsafe.Add(ref vd, 1) = v1; + Unsafe.Add(ref vd, 2) = v3; } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 54ca2a73e..61c1ce48e 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -183,7 +183,7 @@ namespace SixLabors.ImageSharp "Output span must be divisable by 4!"); DebugGuard.IsTrue( - source.Length == (int)(dest.Length * 3 / 4D), + source.Length == dest.Length * 3 / 4, nameof(source), "Input span must be 3/4 the length of the output span!"); } @@ -202,7 +202,7 @@ namespace SixLabors.ImageSharp "Output span must be divisable by 3!"); DebugGuard.IsTrue( - source.Length == (int)(dest.Length * 4 / 3F), + source.Length == dest.Length * 4 / 3, nameof(source), "Output span must be 3/4 the length of the input span!"); }