diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs index d68e16e23b..0ea17c7706 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs @@ -106,34 +106,72 @@ namespace SixLabors.ImageSharp { if (Avx.IsSupported) { - int n = dest.Length / Vector256.Count; - ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector256.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) + { + ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i); + ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); + + vd0 = Avx.Permute(vs0, control); + Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control); + Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control); + Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control); + } + + if (m > 0) { - Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); + for (int i = u; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); + } } } else { // Sse - int n = dest.Length / Vector128.Count; - ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source)); ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector128.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) { - Vector128 vs = Unsafe.Add(ref sourceBase, i); - Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); + ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i); + ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); + + vd0 = Sse.Shuffle(vs0, vs0, control); + + Vector128 vs1 = Unsafe.Add(ref vs0, 1); + Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control); + + Vector128 vs2 = Unsafe.Add(ref vs0, 2); + Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control); + + Vector128 vs3 = Unsafe.Add(ref vs0, 3); + Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control); + } + + if (m > 0) + { + for (int i = u; i < n; i++) + { + Vector128 vs = Unsafe.Add(ref sourceBase, i); + Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); + } } } } @@ -146,8 +184,6 @@ namespace SixLabors.ImageSharp { if (Avx2.IsSupported) { - int n = dest.Length / Vector256.Count; - // I've chosen to do this for convenience while we determine what // shuffle controls to add to the library. // We can add static ROS instances if need be in the future. @@ -161,16 +197,32 @@ namespace SixLabors.ImageSharp ref Vector256 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector256.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) { - Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i); + ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i); + + vd0 = Avx2.Shuffle(vs0, vcm); + Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vcm); + Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vcm); + Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + } + + if (m > 0) + { + for (int i = u; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + } } } else { // Ssse3 - int n = dest.Length / Vector128.Count; - Span bytes = stackalloc byte[Vector128.Count]; Shuffle.MmShuffleSpan(ref bytes, control); Vector128 vcm = Unsafe.As>(ref MemoryMarshal.GetReference(bytes)); @@ -181,10 +233,27 @@ namespace SixLabors.ImageSharp ref Vector128 destBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(dest)); - for (int i = 0; i < n; i++) + int n = dest.Length / Vector128.Count; + int m = ImageMaths.Modulo4(n); + int u = n - m; + + for (int i = 0; i < u; i += 4) { - Vector128 vs = Unsafe.Add(ref sourceBase, i); - Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(vs, vcm); + ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i); + ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i); + + vd0 = Ssse3.Shuffle(vs0, vcm); + Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vcm); + Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vcm); + Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vcm); + } + + if (m > 0) + { + for (int i = u; i < n; i++) + { + Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vcm); + } } } } diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index 4d2678320b..59b6254192 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -229,7 +229,7 @@ namespace SixLabors.ImageSharp public const byte ZYXW = (3 << 6) | (0 << 4) | (1 << 2) | 2; [MethodImpl(InliningOptions.ShortMethod)] - public static byte MmShuffle(int p3, int p2, int p1, int p0) + public static byte MmShuffle(byte p3, byte p2, byte p1, byte p0) => (byte)((p3 << 6) | (p2 << 4) | (p1 << 2) | p0); [MethodImpl(InliningOptions.ShortMethod)]