diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index 293997c4d..ca28a7aab 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -5,7 +5,6 @@ using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
namespace SixLabors.ImageSharp;
@@ -61,6 +60,12 @@ internal static class Numerics
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static nint Modulo4(nint x) => x & 3;
+ /// <summary>
+ /// Calculates <paramref name="x"/> % 4
+ /// </summary>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static nuint Modulo4(nuint x) => x & 3;
+
///
/// Calculates % 8
///
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
index 683ac518b..d4ab8c618 100644
--- a/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IComponentShuffle.cs
@@ -19,24 +19,26 @@ namespace SixLabors.ImageSharp;
internal interface IComponentShuffle
{
///
- /// Shuffles then slices 8-bit integers within 128-bit lanes in
- /// using the control and store the results in .
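+ /// Shuffles then slices 8-bit integers in <paramref name="source"/>
+ /// using the control and stores the results in <paramref name="destination"/>.
+ /// If successful, this method advances both <paramref name="source"/> and
+ /// <paramref name="destination"/> past the elements that were processed.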
///
/// The source span of bytes.
- /// The destination span of bytes.
- void ShuffleReduce(ref ReadOnlySpan source, ref Span dest);
+ /// The destination span of bytes.
+ void ShuffleReduce(ref ReadOnlySpan source, ref Span destination);
///
- /// Shuffle 8-bit integers within 128-bit lanes in
- /// using the control and store the results in .
+ /// Shuffle 8-bit integers in <paramref name="source"/>
+ /// using the control and store the results in <paramref name="destination"/>.
///
/// The source span of bytes.
- /// The destination span of bytes.
+ /// The destination span of bytes.
///
- /// Implementation can assume that source.Length is less or equal than dest.Length.
+ /// Implementation can assume that source.Length is less than or equal to destination.Length.
/// Loops should iterate using source.Length.
///
- void RunFallbackShuffle(ReadOnlySpan source, Span dest);
+ void Shuffle(ReadOnlySpan source, Span destination);
}
///
@@ -44,24 +46,21 @@ internal interface IShuffle4 : IComponentShuffle
{
}
-internal readonly struct DefaultShuffle4 : IShuffle4
+internal readonly struct DefaultShuffle4(byte control) : IShuffle4
{
- public DefaultShuffle4(byte control)
- => this.Control = control;
-
- public byte Control { get; }
+ public byte Control { get; } = control;
[MethodImpl(InliningOptions.ShortMethod)]
- public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Reduce(ref source, ref dest, this.Control);
+ public void ShuffleReduce(ref ReadOnlySpan source, ref Span destination)
+ => HwIntrinsics.Shuffle4Reduce(ref source, ref destination, this.Control);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span destination)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
- ref byte dBase = ref MemoryMarshal.GetReference(dest);
+ ref byte dBase = ref MemoryMarshal.GetReference(destination);
- Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);
+ SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);
for (nuint i = 0; i < (uint)source.Length; i += 4)
{
@@ -76,14 +75,14 @@ internal readonly struct DefaultShuffle4 : IShuffle4
internal readonly struct WXYZShuffle4 : IShuffle4
{
[MethodImpl(InliningOptions.ShortMethod)]
- public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Reduce(ref source, ref dest, Shuffle.MMShuffle2103);
+ public void ShuffleReduce(ref ReadOnlySpan source, ref Span destination)
+ => HwIntrinsics.Shuffle4Reduce(ref source, ref destination, SimdUtils.Shuffle.MMShuffle2103);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span destination)
{
ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
- ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(destination));
uint n = (uint)source.Length / 4;
for (nuint i = 0; i < n; i++)
@@ -100,14 +99,14 @@ internal readonly struct WXYZShuffle4 : IShuffle4
internal readonly struct WZYXShuffle4 : IShuffle4
{
[MethodImpl(InliningOptions.ShortMethod)]
- public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Reduce(ref source, ref dest, Shuffle.MMShuffle0123);
+ public void ShuffleReduce(ref ReadOnlySpan source, ref Span destination)
+ => HwIntrinsics.Shuffle4Reduce(ref source, ref destination, SimdUtils.Shuffle.MMShuffle0123);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span destination)
{
ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
- ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(destination));
uint n = (uint)source.Length / 4;
for (nuint i = 0; i < n; i++)
@@ -124,14 +123,14 @@ internal readonly struct WZYXShuffle4 : IShuffle4
internal readonly struct YZWXShuffle4 : IShuffle4
{
[MethodImpl(InliningOptions.ShortMethod)]
- public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Reduce(ref source, ref dest, Shuffle.MMShuffle0321);
+ public void ShuffleReduce(ref ReadOnlySpan source, ref Span destination)
+ => HwIntrinsics.Shuffle4Reduce(ref source, ref destination, SimdUtils.Shuffle.MMShuffle0321);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span destination)
{
ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
- ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(destination));
uint n = (uint)source.Length / 4;
for (nuint i = 0; i < n; i++)
@@ -148,14 +147,14 @@ internal readonly struct YZWXShuffle4 : IShuffle4
internal readonly struct ZYXWShuffle4 : IShuffle4
{
[MethodImpl(InliningOptions.ShortMethod)]
- public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Reduce(ref source, ref dest, Shuffle.MMShuffle3012);
+ public void ShuffleReduce(ref ReadOnlySpan source, ref Span destination)
+ => HwIntrinsics.Shuffle4Reduce(ref source, ref destination, SimdUtils.Shuffle.MMShuffle3012);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span destination)
{
ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
- ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(destination));
uint n = (uint)source.Length / 4;
for (nuint i = 0; i < n; i++)
@@ -179,14 +178,14 @@ internal readonly struct ZYXWShuffle4 : IShuffle4
internal readonly struct XWZYShuffle4 : IShuffle4
{
[MethodImpl(InliningOptions.ShortMethod)]
- public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Reduce(ref source, ref dest, Shuffle.MMShuffle1230);
+ public void ShuffleReduce(ref ReadOnlySpan source, ref Span destination)
+ => HwIntrinsics.Shuffle4Reduce(ref source, ref destination, SimdUtils.Shuffle.MMShuffle1230);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span destination)
{
ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
- ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
+ ref uint dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(destination));
uint n = (uint)source.Length / 4;
for (nuint i = 0; i < n; i++)
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
index 6cf6eef08..255448d61 100644
--- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
@@ -24,12 +24,12 @@ internal readonly struct DefaultPad3Shuffle4 : IPad3Shuffle4
=> HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, this.Control);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span dest)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(dest);
- Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);
+ SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);
Span temp = stackalloc byte[4];
ref byte t = ref MemoryMarshal.GetReference(temp);
@@ -52,10 +52,10 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
{
[MethodImpl(InliningOptions.ShortMethod)]
public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, Shuffle.MMShuffle3210);
+ => HwIntrinsics.Pad3Shuffle4Reduce(ref source, ref dest, SimdUtils.Shuffle.MMShuffle3210);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span dest)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(dest);
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
index 2cd586212..89faca243 100644
--- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
@@ -24,12 +24,12 @@ internal readonly struct DefaultShuffle3 : IShuffle3
=> HwIntrinsics.Shuffle3Reduce(ref source, ref dest, this.Control);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span dest)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(dest);
- Shuffle.InverseMMShuffle(this.Control, out _, out uint p2, out uint p1, out uint p0);
+ SimdUtils.Shuffle.InverseMMShuffle(this.Control, out _, out uint p2, out uint p1, out uint p0);
for (nuint i = 0; i < (uint)source.Length; i += 3)
{
diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
index 5e82973e3..30fda7a8e 100644
--- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
@@ -24,12 +24,12 @@ internal readonly struct DefaultShuffle4Slice3 : IShuffle4Slice3
=> HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, this.Control);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span dest)
{
ref byte sBase = ref MemoryMarshal.GetReference(source);
ref byte dBase = ref MemoryMarshal.GetReference(dest);
- Shuffle.InverseMMShuffle(this.Control, out _, out uint p2, out uint p1, out uint p0);
+ SimdUtils.Shuffle.InverseMMShuffle(this.Control, out _, out uint p2, out uint p1, out uint p0);
for (nuint i = 0, j = 0; i < (uint)dest.Length; i += 3, j += 4)
{
@@ -44,10 +44,10 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
{
[MethodImpl(InliningOptions.ShortMethod)]
public void ShuffleReduce(ref ReadOnlySpan source, ref Span dest)
- => HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, Shuffle.MMShuffle3210);
+ => HwIntrinsics.Shuffle4Slice3Reduce(ref source, ref dest, SimdUtils.Shuffle.MMShuffle3210);
[MethodImpl(InliningOptions.ShortMethod)]
- public void RunFallbackShuffle(ReadOnlySpan source, Span dest)
+ public void Shuffle(ReadOnlySpan source, Span dest)
{
ref uint sBase = ref Unsafe.As(ref MemoryMarshal.GetReference(source));
ref Byte3 dBase = ref Unsafe.As(ref MemoryMarshal.GetReference(dest));
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index ad079b52e..4732effd4 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -51,22 +51,32 @@ internal static partial class SimdUtils
///
/// Shuffle single-precision (32-bit) floating-point elements in
- /// using the control and store the results in .
+ /// using the control and store the results in .
///
/// The source span of floats.
- /// The destination span of floats.
+ /// The destination span of floats.
/// The byte control.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Reduce(
ref ReadOnlySpan source,
- ref Span dest,
+ ref Span destination,
[ConstantExpected] byte control)
{
- if (Avx.IsSupported || Sse.IsSupported)
+ if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated || Vector128.IsHardwareAccelerated)
{
- int remainder = Avx.IsSupported
- ? Numerics.ModuloP2(source.Length, Vector256.Count)
- : Numerics.ModuloP2(source.Length, Vector128.Count);
+ int remainder = 0;
+ if (Vector512.IsHardwareAccelerated)
+ {
+ remainder = Numerics.ModuloP2(source.Length, Vector512.Count);
+ }
+ else if (Vector256.IsHardwareAccelerated)
+ {
+ remainder = Numerics.ModuloP2(source.Length, Vector256.Count);
+ }
+ else if (Vector128.IsHardwareAccelerated)
+ {
+ remainder = Numerics.ModuloP2(source.Length, Vector128.Count);
+ }
int adjustedCount = source.Length - remainder;
@@ -74,17 +84,17 @@ internal static partial class SimdUtils
{
Shuffle4(
source[..adjustedCount],
- dest[..adjustedCount],
+ destination[..adjustedCount],
control);
source = source[adjustedCount..];
- dest = dest[adjustedCount..];
+ destination = destination[adjustedCount..];
}
}
}
///
- /// Shuffle 8-bit integers within 128-bit lanes in
+ /// Shuffle 8-bit integers in <paramref name="source"/>
/// using the control and store the results in .
///
/// The source span of bytes.
@@ -96,11 +106,21 @@ internal static partial class SimdUtils
ref Span dest,
byte control)
{
- if (Avx2.IsSupported || Ssse3.IsSupported)
+ if (Vector512.IsHardwareAccelerated || Vector256.IsHardwareAccelerated || Vector128.IsHardwareAccelerated)
{
- int remainder = Avx2.IsSupported
- ? Numerics.ModuloP2(source.Length, Vector256.Count)
- : Numerics.ModuloP2(source.Length, Vector128.Count);
+ int remainder = 0;
+ if (Vector512.IsHardwareAccelerated)
+ {
+ remainder = Numerics.ModuloP2(source.Length, Vector512.Count);
+ }
+ else if (Vector256.IsHardwareAccelerated)
+ {
+ remainder = Numerics.ModuloP2(source.Length, Vector256.Count);
+ }
+ else if (Vector128.IsHardwareAccelerated)
+ {
+ remainder = Numerics.ModuloP2(source.Length, Vector128.Count);
+ }
int adjustedCount = source.Length - remainder;
@@ -218,76 +238,102 @@ internal static partial class SimdUtils
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
ReadOnlySpan source,
- Span dest,
+ Span destination,
[ConstantExpected] byte control)
{
- if (Avx.IsSupported)
+ if (Vector512.IsHardwareAccelerated)
{
- ref Vector256 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ Span temp = stackalloc int[Vector512.Count];
+ Shuffle.MMShuffleSpan(ref temp, control);
+ Vector512 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
- ref Vector256 destBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+ ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
- nint n = (nint)dest.Vector256Count();
- nint m = Numerics.Modulo4(n);
- nint u = n - m;
+ nuint n = (uint)destination.Length / (uint)Vector512.Count;
+ nuint m = Numerics.Modulo4(n);
+ nuint u = n - m;
- for (nint i = 0; i < u; i += 4)
+ for (nuint i = 0; i < u; i += 4)
{
- ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
- ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i);
- vd0 = Avx.Permute(vs0, control);
- Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control);
- Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control);
- Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control);
+ vd0 = Vector512.Shuffle(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector512.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector512.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector512.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
- for (nint i = u; i < n; i++)
+ for (nuint i = u; i < n; i++)
{
- Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control);
+ Unsafe.Add(ref destinationBase, i) = Vector512.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
- else
+ else if (Vector256.IsHardwareAccelerated)
{
- // Sse
- ref Vector128 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ Span temp = stackalloc int[Vector256.Count];
+ Shuffle.MMShuffleSpan(ref temp, control);
+ Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
- ref Vector128 destBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+ ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
- nint n = (nint)((uint)dest.Length / (uint)Vector128.Count);
- nint m = Numerics.Modulo4(n);
- nint u = n - m;
+ nuint n = (uint)destination.Length / (uint)Vector256.Count;
+ nuint m = Numerics.Modulo4(n);
+ nuint u = n - m;
- for (nint i = 0; i < u; i += 4)
+ for (nuint i = 0; i < u; i += 4)
{
- ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
- ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i);
- vd0 = Sse.Shuffle(vs0, vs0, control);
+ vd0 = Vector256.Shuffle(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector256.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector256.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector256.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
+ }
- Vector128 vs1 = Unsafe.Add(ref vs0, 1);
- Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control);
+ if (m > 0)
+ {
+ for (nuint i = u; i < n; i++)
+ {
+ Unsafe.Add(ref destinationBase, i) = Vector256.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
+ }
+ }
+ }
+ else if (Vector128.IsHardwareAccelerated)
+ {
+ Span temp = stackalloc int[Vector128.Count];
+ Shuffle.MMShuffleSpan(ref temp, control);
+ Vector128 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
+
+ ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
- Vector128 vs2 = Unsafe.Add(ref vs0, 2);
- Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control);
+ nuint n = (uint)destination.Length / (uint)Vector128.Count;
+ nuint m = Numerics.Modulo4(n);
+ nuint u = n - m;
+
+ for (nuint i = 0; i < u; i += 4)
+ {
+ ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i);
- Vector128 vs3 = Unsafe.Add(ref vs0, 3);
- Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control);
+ vd0 = Vector128.Shuffle(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector128.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector128.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector128.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
- for (nint i = u; i < n; i++)
+ for (nuint i = u; i < n; i++)
{
- Vector128 vs = Unsafe.Add(ref sourceBase, i);
- Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control);
+ Unsafe.Add(ref destinationBase, i) = Vector128.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
@@ -296,80 +342,102 @@ internal static partial class SimdUtils
[MethodImpl(InliningOptions.ShortMethod)]
private static void Shuffle4(
ReadOnlySpan source,
- Span dest,
+ Span destination,
byte control)
{
- if (Avx2.IsSupported)
+ if (Vector512.IsHardwareAccelerated)
{
- // I've chosen to do this for convenience while we determine what
- // shuffle controls to add to the library.
- // We can add static ROS instances if need be in the future.
- Span bytes = stackalloc byte[Vector256.Count];
- Shuffle.MMShuffleSpan(ref bytes, control);
- Vector256 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
+ Span temp = stackalloc byte[Vector512.Count];
+ Shuffle.MMShuffleSpan(ref temp, control);
+ Vector512 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
- ref Vector256 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
- ref Vector256 destBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+ nuint n = (uint)destination.Length / (uint)Vector512.Count;
+ nuint m = Numerics.Modulo4(n);
+ nuint u = n - m;
+
+ for (nuint i = 0; i < u; i += 4)
+ {
+ ref Vector512 vs0 = ref Unsafe.Add(ref sourceBase, i);
+ ref Vector512 vd0 = ref Unsafe.Add(ref destinationBase, i);
+
+ vd0 = Vector512.Shuffle(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector512.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector512.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector512.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
+ }
+
+ if (m > 0)
+ {
+ for (nuint i = u; i < n; i++)
+ {
+ Unsafe.Add(ref destinationBase, i) = Vector512.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
+ }
+ }
+ }
+ else if (Vector256.IsHardwareAccelerated)
+ {
+ Span temp = stackalloc byte[Vector256.Count];
+ Shuffle.MMShuffleSpan(ref temp, control);
+ Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
- nint n = (nint)((uint)dest.Length / (uint)Vector256.Count);
- nint m = Numerics.Modulo4(n);
- nint u = n - m;
+ ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
- for (nint i = 0; i < u; i += 4)
+ nuint n = (uint)destination.Length / (uint)Vector256.Count;
+ nuint m = Numerics.Modulo4(n);
+ nuint u = n - m;
+
+ for (nuint i = 0; i < u; i += 4)
{
ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
- ref Vector256 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i);
- vd0 = Avx2.Shuffle(vs0, vshuffle);
- Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle);
- Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle);
- Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle);
+ vd0 = Vector256.Shuffle(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector256.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector256.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector256.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
- for (nint i = u; i < n; i++)
+ for (nuint i = u; i < n; i++)
{
- Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle);
+ Unsafe.Add(ref destinationBase, i) = Vector256.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
- else
+ else if (Vector128.IsHardwareAccelerated)
{
- // Ssse3
- Span bytes = stackalloc byte[Vector128.Count];
- Shuffle.MMShuffleSpan(ref bytes, control);
- Vector128 vshuffle = Unsafe.As>(ref MemoryMarshal.GetReference(bytes));
-
- ref Vector128 sourceBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ Span temp = stackalloc byte[Vector128.Count];
+ Shuffle.MMShuffleSpan(ref temp, control);
+ Vector128 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
- ref Vector128 destBase =
- ref Unsafe.As>(ref MemoryMarshal.GetReference(dest));
+ ref Vector128 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
+ ref Vector128 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
- nint n = (nint)((uint)dest.Length / (uint)Vector128.Count);
- nint m = Numerics.Modulo4(n);
- nint u = n - m;
+ nuint n = (uint)destination.Length / (uint)Vector128.Count;
+ nuint m = Numerics.Modulo4(n);
+ nuint u = n - m;
- for (nint i = 0; i < u; i += 4)
+ for (nuint i = 0; i < u; i += 4)
{
ref Vector128 vs0 = ref Unsafe.Add(ref sourceBase, i);
- ref Vector128 vd0 = ref Unsafe.Add(ref destBase, i);
+ ref Vector128 vd0 = ref Unsafe.Add(ref destinationBase, i);
- vd0 = Ssse3.Shuffle(vs0, vshuffle);
- Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle);
- Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle);
- Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle);
+ vd0 = Vector128.Shuffle(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector128.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector128.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector128.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
- for (nint i = u; i < n; i++)
+ for (nuint i = u; i < n; i++)
{
- Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle);
+ Unsafe.Add(ref destinationBase, i) = Vector128.Shuffle(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
index 83cd3d246..dbeb54a80 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@@ -12,140 +12,140 @@ internal static partial class SimdUtils
{
///
/// Shuffle single-precision (32-bit) floating-point elements in
- /// using the control and store the results in .
+ /// using the control and store the results in .
///
/// The source span of floats.
- /// The destination span of floats.
+ /// The destination span of floats.
/// The byte control.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4(
ReadOnlySpan source,
- Span dest,
+ Span destination,
[ConstantExpected] byte control)
{
- VerifyShuffle4SpanInput(source, dest);
+ VerifyShuffle4SpanInput(source, destination);
- HwIntrinsics.Shuffle4Reduce(ref source, ref dest, control);
+ HwIntrinsics.Shuffle4Reduce(ref source, ref destination, control);
// Deal with the remainder:
if (source.Length > 0)
{
- Shuffle4Remainder(source, dest, control);
+ Shuffle4Remainder(source, destination, control);
}
}
///
/// Shuffle 8-bit integers within 128-bit lanes in
- /// using the control and store the results in .
+ /// using the control and store the results in .
///
/// The type of shuffle struct.
/// The source span of bytes.
- /// The destination span of bytes.
+ /// The destination span of bytes.
/// The type of shuffle to perform.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4(
ReadOnlySpan source,
- Span dest,
+ Span destination,
TShuffle shuffle)
where TShuffle : struct, IShuffle4
{
- VerifyShuffle4SpanInput(source, dest);
+ VerifyShuffle4SpanInput(source, destination);
- shuffle.ShuffleReduce(ref source, ref dest);
+ shuffle.ShuffleReduce(ref source, ref destination);
// Deal with the remainder:
if (source.Length > 0)
{
- shuffle.RunFallbackShuffle(source, dest);
+ shuffle.Shuffle(source, destination);
}
}
///
/// Shuffle 8-bit integer triplets within 128-bit lanes in
- /// using the control and store the results in .
+ /// using the control and store the results in .
///
/// The type of shuffle struct.
/// The source span of bytes.
- /// The destination span of bytes.
+ /// The destination span of bytes.
/// The type of shuffle to perform.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle3(
ReadOnlySpan source,
- Span dest,
+ Span destination,
TShuffle shuffle)
where TShuffle : struct, IShuffle3
{
- // Source length should be smaller than dest length, and divisible by 3.
- VerifyShuffle3SpanInput(source, dest);
+ // Source length should be smaller than destination length, and divisible by 3.
+ VerifyShuffle3SpanInput(source, destination);
- shuffle.ShuffleReduce(ref source, ref dest);
+ shuffle.ShuffleReduce(ref source, ref destination);
// Deal with the remainder:
if (source.Length > 0)
{
- shuffle.RunFallbackShuffle(source, dest);
+ shuffle.Shuffle(source, destination);
}
}
///
/// Pads then shuffles 8-bit integers within 128-bit lanes in
- /// using the control and store the results in .
+ /// using the control and store the results in .
///
/// The type of shuffle struct.
/// The source span of bytes.
- /// The destination span of bytes.
+ /// The destination span of bytes.
/// The type of shuffle to perform.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Pad3Shuffle4(
ReadOnlySpan source,
- Span dest,
+ Span destination,
TShuffle shuffle)
where TShuffle : struct, IPad3Shuffle4
{
- VerifyPad3Shuffle4SpanInput(source, dest);
+ VerifyPad3Shuffle4SpanInput(source, destination);
- shuffle.ShuffleReduce(ref source, ref dest);
+ shuffle.ShuffleReduce(ref source, ref destination);
// Deal with the remainder:
if (source.Length > 0)
{
- shuffle.RunFallbackShuffle(source, dest);
+ shuffle.Shuffle(source, destination);
}
}
///
/// Shuffles then slices 8-bit integers within 128-bit lanes in
- /// using the control and store the results in .
+ /// using the control and store the results in .
///
/// The type of shuffle struct.
/// The source span of bytes.
- /// The destination span of bytes.
+ /// The destination span of bytes.
/// The type of shuffle to perform.
[MethodImpl(InliningOptions.ShortMethod)]
public static void Shuffle4Slice3(
ReadOnlySpan source,
- Span dest,
+ Span destination,
TShuffle shuffle)
where TShuffle : struct, IShuffle4Slice3
{
- VerifyShuffle4Slice3SpanInput(source, dest);
+ VerifyShuffle4Slice3SpanInput(source, destination);
- shuffle.ShuffleReduce(ref source, ref dest);
+ shuffle.ShuffleReduce(ref source, ref destination);
// Deal with the remainder:
if (source.Length > 0)
{
- shuffle.RunFallbackShuffle(source, dest);
+ shuffle.Shuffle(source, destination);
}
}
private static void Shuffle4Remainder(
ReadOnlySpan source,
- Span dest,
+ Span destination,
byte control)
{
ref float sBase = ref MemoryMarshal.GetReference(source);
- ref float dBase = ref MemoryMarshal.GetReference(dest);
+ ref float dBase = ref MemoryMarshal.GetReference(destination);
Shuffle.InverseMMShuffle(control, out uint p3, out uint p2, out uint p1, out uint p0);
for (nuint i = 0; i < (uint)source.Length; i += 4)
@@ -158,69 +158,69 @@ internal static partial class SimdUtils
}
[Conditional("DEBUG")]
- internal static void VerifyShuffle4SpanInput(ReadOnlySpan source, Span dest)
+ internal static void VerifyShuffle4SpanInput(ReadOnlySpan source, Span destination)
where T : struct
{
DebugGuard.IsTrue(
- source.Length == dest.Length,
+ source.Length == destination.Length,
nameof(source),
"Input spans must be of same length!");
DebugGuard.IsTrue(
source.Length % 4 == 0,
nameof(source),
- "Input spans must be divisable by 4!");
+ "Input spans must be divisible by 4!");
}
[Conditional("DEBUG")]
- private static void VerifyShuffle3SpanInput(ReadOnlySpan source, Span dest)
+ private static void VerifyShuffle3SpanInput(ReadOnlySpan