|
|
|
@ -8,6 +8,7 @@ using System.Runtime.InteropServices; |
|
|
|
using System.Runtime.Intrinsics; |
|
|
|
using System.Runtime.Intrinsics.Arm; |
|
|
|
using System.Runtime.Intrinsics.X86; |
|
|
|
using SixLabors.ImageSharp.Common.Helpers; |
|
|
|
using SixLabors.ImageSharp.PixelFormats; |
|
|
|
|
|
|
|
namespace SixLabors.ImageSharp; |
|
|
|
@ -51,22 +52,34 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Shuffle single-precision (32-bit) floating-point elements in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="dest"/>.
|
|
|
|
/// using the control and store the results in <paramref name="destination"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source span of floats.</param>
|
|
|
|
/// <param name="dest">The destination span of floats.</param>
|
|
|
|
/// <param name="destination">The destination span of floats.</param>
|
|
|
|
/// <param name="control">The byte control.</param>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public static void Shuffle4Reduce( |
|
|
|
ref ReadOnlySpan<float> source, |
|
|
|
ref Span<float> dest, |
|
|
|
ref Span<float> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Avx.IsSupported || Sse.IsSupported) |
|
|
|
if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) || |
|
|
|
(Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) || |
|
|
|
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat)) |
|
|
|
{ |
|
|
|
int remainder = Avx.IsSupported |
|
|
|
? Numerics.ModuloP2(source.Length, Vector256<float>.Count) |
|
|
|
: Numerics.ModuloP2(source.Length, Vector128<float>.Count); |
|
|
|
int remainder = 0; |
|
|
|
if (Vector512.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector512<float>.Count); |
|
|
|
} |
|
|
|
else if (Vector256.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector256<float>.Count); |
|
|
|
} |
|
|
|
else if (Vector128.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector128<float>.Count); |
|
|
|
} |
|
|
|
|
|
|
|
int adjustedCount = source.Length - remainder; |
|
|
|
|
|
|
|
@ -74,33 +87,45 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
Shuffle4( |
|
|
|
source[..adjustedCount], |
|
|
|
dest[..adjustedCount], |
|
|
|
destination[..adjustedCount], |
|
|
|
control); |
|
|
|
|
|
|
|
source = source[adjustedCount..]; |
|
|
|
dest = dest[adjustedCount..]; |
|
|
|
destination = destination[adjustedCount..]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Shuffle 8-bit integers within 128-bit lanes in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="dest"/>.
|
|
|
|
/// Shuffle 8-bit integers <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="destination"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source span of bytes.</param>
|
|
|
|
/// <param name="dest">The destination span of bytes.</param>
|
|
|
|
/// <param name="destination">The destination span of bytes.</param>
|
|
|
|
/// <param name="control">The byte control.</param>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public static void Shuffle4Reduce( |
|
|
|
ref ReadOnlySpan<byte> source, |
|
|
|
ref Span<byte> dest, |
|
|
|
byte control) |
|
|
|
ref Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Avx2.IsSupported || Ssse3.IsSupported) |
|
|
|
if ((Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) || |
|
|
|
(Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) || |
|
|
|
(Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte)) |
|
|
|
{ |
|
|
|
int remainder = Avx2.IsSupported |
|
|
|
? Numerics.ModuloP2(source.Length, Vector256<byte>.Count) |
|
|
|
: Numerics.ModuloP2(source.Length, Vector128<byte>.Count); |
|
|
|
int remainder = 0; |
|
|
|
if (Vector512.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector512<byte>.Count); |
|
|
|
} |
|
|
|
else if (Vector256.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector256<byte>.Count); |
|
|
|
} |
|
|
|
else if (Vector128.IsHardwareAccelerated) |
|
|
|
{ |
|
|
|
remainder = Numerics.ModuloP2(source.Length, Vector128<byte>.Count); |
|
|
|
} |
|
|
|
|
|
|
|
int adjustedCount = source.Length - remainder; |
|
|
|
|
|
|
|
@ -108,29 +133,29 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
Shuffle4( |
|
|
|
source[..adjustedCount], |
|
|
|
dest[..adjustedCount], |
|
|
|
destination[..adjustedCount], |
|
|
|
control); |
|
|
|
|
|
|
|
source = source[adjustedCount..]; |
|
|
|
dest = dest[adjustedCount..]; |
|
|
|
destination = destination[adjustedCount..]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Shuffles 8-bit integer triplets within 128-bit lanes in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="dest"/>.
|
|
|
|
/// Shuffles 8-bit integer triplets in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="destination"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source span of bytes.</param>
|
|
|
|
/// <param name="dest">The destination span of bytes.</param>
|
|
|
|
/// <param name="destination">The destination span of bytes.</param>
|
|
|
|
/// <param name="control">The byte control.</param>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public static void Shuffle3Reduce( |
|
|
|
ref ReadOnlySpan<byte> source, |
|
|
|
ref Span<byte> dest, |
|
|
|
byte control) |
|
|
|
ref Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Ssse3.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign) |
|
|
|
{ |
|
|
|
int remainder = source.Length % (Vector128<byte>.Count * 3); |
|
|
|
|
|
|
|
@ -140,77 +165,77 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
Shuffle3( |
|
|
|
source[..adjustedCount], |
|
|
|
dest[..adjustedCount], |
|
|
|
destination[..adjustedCount], |
|
|
|
control); |
|
|
|
|
|
|
|
source = source[adjustedCount..]; |
|
|
|
dest = dest[adjustedCount..]; |
|
|
|
destination = destination[adjustedCount..]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Pads then shuffles 8-bit integers within 128-bit lanes in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="dest"/>.
|
|
|
|
/// Pads then shuffles 8-bit integers in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="destination"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source span of bytes.</param>
|
|
|
|
/// <param name="dest">The destination span of bytes.</param>
|
|
|
|
/// <param name="destination">The destination span of bytes.</param>
|
|
|
|
/// <param name="control">The byte control.</param>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public static void Pad3Shuffle4Reduce( |
|
|
|
ref ReadOnlySpan<byte> source, |
|
|
|
ref Span<byte> dest, |
|
|
|
byte control) |
|
|
|
ref Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Ssse3.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) |
|
|
|
{ |
|
|
|
int remainder = source.Length % (Vector128<byte>.Count * 3); |
|
|
|
|
|
|
|
int sourceCount = source.Length - remainder; |
|
|
|
int destCount = (int)((uint)sourceCount * 4 / 3); |
|
|
|
int destinationCount = (int)((uint)sourceCount * 4 / 3); |
|
|
|
|
|
|
|
if (sourceCount > 0) |
|
|
|
{ |
|
|
|
Pad3Shuffle4( |
|
|
|
source[..sourceCount], |
|
|
|
dest[..destCount], |
|
|
|
destination[..destinationCount], |
|
|
|
control); |
|
|
|
|
|
|
|
source = source[sourceCount..]; |
|
|
|
dest = dest[destCount..]; |
|
|
|
destination = destination[destinationCount..]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
/// <summary>
|
|
|
|
/// Shuffles then slices 8-bit integers within 128-bit lanes in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="dest"/>.
|
|
|
|
/// Shuffles then slices 8-bit integers in <paramref name="source"/>
|
|
|
|
/// using the control and store the results in <paramref name="destination"/>.
|
|
|
|
/// </summary>
|
|
|
|
/// <param name="source">The source span of bytes.</param>
|
|
|
|
/// <param name="dest">The destination span of bytes.</param>
|
|
|
|
/// <param name="destination">The destination span of bytes.</param>
|
|
|
|
/// <param name="control">The byte control.</param>
|
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
public static void Shuffle4Slice3Reduce( |
|
|
|
ref ReadOnlySpan<byte> source, |
|
|
|
ref Span<byte> dest, |
|
|
|
byte control) |
|
|
|
ref Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Ssse3.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) |
|
|
|
{ |
|
|
|
int remainder = source.Length & ((Vector128<byte>.Count * 4) - 1); // bit-hack for modulo
|
|
|
|
|
|
|
|
int sourceCount = source.Length - remainder; |
|
|
|
int destCount = (int)((uint)sourceCount * 3 / 4); |
|
|
|
int destinationCount = (int)((uint)sourceCount * 3 / 4); |
|
|
|
|
|
|
|
if (sourceCount > 0) |
|
|
|
{ |
|
|
|
Shuffle4Slice3( |
|
|
|
source[..sourceCount], |
|
|
|
dest[..destCount], |
|
|
|
destination[..destinationCount], |
|
|
|
control); |
|
|
|
|
|
|
|
source = source[sourceCount..]; |
|
|
|
dest = dest[destCount..]; |
|
|
|
destination = destination[destinationCount..]; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -218,76 +243,90 @@ internal static partial class SimdUtils |
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static void Shuffle4( |
|
|
|
ReadOnlySpan<float> source, |
|
|
|
Span<float> dest, |
|
|
|
Span<float> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Avx.IsSupported) |
|
|
|
if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleFloat) |
|
|
|
{ |
|
|
|
ref Vector256<float> sourceBase = |
|
|
|
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector512<float> sourceBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector512<float> destinationBase = ref Unsafe.As<float, Vector512<float>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
ref Vector256<float> destBase = |
|
|
|
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
nuint n = (uint)destination.Length / (uint)Vector512<float>.Count; |
|
|
|
nuint m = Numerics.Modulo4(n); |
|
|
|
nuint u = n - m; |
|
|
|
|
|
|
|
nint n = (nint)dest.Vector256Count<float>(); |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
|
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
for (nuint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector256<float> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector512<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector512<float> vd0 = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
vd0 = Avx.Permute(vs0, control); |
|
|
|
Unsafe.Add(ref vd0, 1) = Avx.Permute(Unsafe.Add(ref vs0, 1), control); |
|
|
|
Unsafe.Add(ref vd0, 2) = Avx.Permute(Unsafe.Add(ref vs0, 2), control); |
|
|
|
Unsafe.Add(ref vd0, 3) = Avx.Permute(Unsafe.Add(ref vs0, 3), control); |
|
|
|
vd0 = Vector512Utilities.Shuffle(vs0, control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); |
|
|
|
} |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
for (nuint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleFloat) |
|
|
|
{ |
|
|
|
// Sse
|
|
|
|
ref Vector128<float> sourceBase = |
|
|
|
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector256<float> sourceBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector256<float> destinationBase = ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
ref Vector128<float> destBase = |
|
|
|
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
nuint n = (uint)destination.Length / (uint)Vector256<float>.Count; |
|
|
|
nuint m = Numerics.Modulo4(n); |
|
|
|
nuint u = n - m; |
|
|
|
|
|
|
|
nint n = (nint)((uint)dest.Length / (uint)Vector128<float>.Count); |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
|
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
for (nuint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector128<float> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector256<float> vd0 = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
vd0 = Vector256Utilities.Shuffle(vs0, control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); |
|
|
|
} |
|
|
|
|
|
|
|
vd0 = Sse.Shuffle(vs0, vs0, control); |
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (nuint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleFloat) |
|
|
|
{ |
|
|
|
ref Vector128<float> sourceBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector128<float> destinationBase = ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
Vector128<float> vs1 = Unsafe.Add(ref vs0, 1); |
|
|
|
Unsafe.Add(ref vd0, 1) = Sse.Shuffle(vs1, vs1, control); |
|
|
|
nuint n = (uint)destination.Length / (uint)Vector128<float>.Count; |
|
|
|
nuint m = Numerics.Modulo4(n); |
|
|
|
nuint u = n - m; |
|
|
|
|
|
|
|
Vector128<float> vs2 = Unsafe.Add(ref vs0, 2); |
|
|
|
Unsafe.Add(ref vd0, 2) = Sse.Shuffle(vs2, vs2, control); |
|
|
|
for (nuint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector128<float> vd0 = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
Vector128<float> vs3 = Unsafe.Add(ref vs0, 3); |
|
|
|
Unsafe.Add(ref vd0, 3) = Sse.Shuffle(vs3, vs3, control); |
|
|
|
vd0 = Vector128Utilities.Shuffle(vs0, control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), control); |
|
|
|
Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), control); |
|
|
|
} |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
for (nuint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Vector128<float> vs = Unsafe.Add(ref sourceBase, i); |
|
|
|
Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), control); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -296,80 +335,102 @@ internal static partial class SimdUtils |
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static void Shuffle4( |
|
|
|
ReadOnlySpan<byte> source, |
|
|
|
Span<byte> dest, |
|
|
|
byte control) |
|
|
|
Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Avx2.IsSupported) |
|
|
|
if (Vector512.IsHardwareAccelerated && Vector512Utilities.SupportsShuffleByte) |
|
|
|
{ |
|
|
|
// I've chosen to do this for convenience while we determine what
|
|
|
|
// shuffle controls to add to the library.
|
|
|
|
// We can add static ROS instances if need be in the future.
|
|
|
|
Span<byte> bytes = stackalloc byte[Vector256<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref bytes, control); |
|
|
|
Vector256<byte> vshuffle = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(bytes)); |
|
|
|
Span<byte> temp = stackalloc byte[Vector512<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref temp, control); |
|
|
|
Vector512<byte> mask = Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
ref Vector256<byte> sourceBase = |
|
|
|
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector512<byte> sourceBase = ref Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector512<byte> destinationBase = ref Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
ref Vector256<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
nuint n = (uint)destination.Length / (uint)Vector512<byte>.Count; |
|
|
|
nuint m = Numerics.Modulo4(n); |
|
|
|
nuint u = n - m; |
|
|
|
|
|
|
|
for (nuint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector512<byte> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector512<byte> vd0 = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
vd0 = Vector512Utilities.Shuffle(vs0, mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)1) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)2) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)3) = Vector512Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); |
|
|
|
} |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (nuint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector512Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else if (Vector256.IsHardwareAccelerated && Vector256Utilities.SupportsShuffleByte) |
|
|
|
{ |
|
|
|
Span<byte> temp = stackalloc byte[Vector256<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref temp, control); |
|
|
|
Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
ref Vector256<byte> sourceBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector256<byte> destinationBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
nint n = (nint)((uint)dest.Length / (uint)Vector256<byte>.Count); |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
nuint n = (uint)destination.Length / (uint)Vector256<byte>.Count; |
|
|
|
nuint m = Numerics.Modulo4(n); |
|
|
|
nuint u = n - m; |
|
|
|
|
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
for (nuint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
vd0 = Avx2.Shuffle(vs0, vshuffle); |
|
|
|
Unsafe.Add(ref vd0, 1) = Avx2.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle); |
|
|
|
Unsafe.Add(ref vd0, 2) = Avx2.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle); |
|
|
|
Unsafe.Add(ref vd0, 3) = Avx2.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle); |
|
|
|
vd0 = Vector256Utilities.Shuffle(vs0, mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)1) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)2) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)3) = Vector256Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); |
|
|
|
} |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
for (nuint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector256Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
else |
|
|
|
else if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte) |
|
|
|
{ |
|
|
|
// Ssse3
|
|
|
|
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref bytes, control); |
|
|
|
Vector128<byte> vshuffle = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes)); |
|
|
|
|
|
|
|
ref Vector128<byte> sourceBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
Span<byte> temp = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref temp, control); |
|
|
|
Vector128<byte> mask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
ref Vector128<byte> sourceBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector128<byte> destinationBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
nint n = (nint)((uint)dest.Length / (uint)Vector128<byte>.Count); |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
nuint n = (uint)destination.Length / (uint)Vector128<byte>.Count; |
|
|
|
nuint m = Numerics.Modulo4(n); |
|
|
|
nuint u = n - m; |
|
|
|
|
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
for (nuint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector128<byte> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector128<byte> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector128<byte> vd0 = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
vd0 = Ssse3.Shuffle(vs0, vshuffle); |
|
|
|
Unsafe.Add(ref vd0, 1) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 1), vshuffle); |
|
|
|
Unsafe.Add(ref vd0, 2) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 2), vshuffle); |
|
|
|
Unsafe.Add(ref vd0, 3) = Ssse3.Shuffle(Unsafe.Add(ref vs0, 3), vshuffle); |
|
|
|
vd0 = Vector128Utilities.Shuffle(vs0, mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)1) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)1), mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)2) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)2), mask); |
|
|
|
Unsafe.Add(ref vd0, (nuint)3) = Vector128Utilities.Shuffle(Unsafe.Add(ref vs0, (nuint)3), mask); |
|
|
|
} |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
for (nuint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); |
|
|
|
Unsafe.Add(ref destinationBase, i) = Vector128Utilities.Shuffle(Unsafe.Add(ref sourceBase, i), mask); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -378,24 +439,21 @@ internal static partial class SimdUtils |
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static void Shuffle3( |
|
|
|
ReadOnlySpan<byte> source, |
|
|
|
Span<byte> dest, |
|
|
|
byte control) |
|
|
|
Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Ssse3.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsRightAlign) |
|
|
|
{ |
|
|
|
Vector128<byte> vmask = ShuffleMaskPad4Nx16(); |
|
|
|
Vector128<byte> vmasko = ShuffleMaskSlice4Nx16(); |
|
|
|
Vector128<byte> vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); |
|
|
|
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16(); |
|
|
|
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); |
|
|
|
Vector128<byte> maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); |
|
|
|
|
|
|
|
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref bytes, control); |
|
|
|
Vector128<byte> vshuffle = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes)); |
|
|
|
Vector128<byte> mask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes)); |
|
|
|
|
|
|
|
ref Vector128<byte> sourceBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
|
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
ref Vector128<byte> sourceBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
ref Vector128<byte> destinationBase = ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
nuint n = source.Vector128Count<byte>(); |
|
|
|
|
|
|
|
@ -404,36 +462,36 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<byte> vs = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
|
|
|
|
Vector128<byte> v0 = vs; |
|
|
|
Vector128<byte> v1 = Unsafe.Add(ref vs, 1); |
|
|
|
Vector128<byte> v2 = Unsafe.Add(ref vs, 2); |
|
|
|
Vector128<byte> v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); |
|
|
|
Vector128<byte> v1 = Unsafe.Add(ref vs, (nuint)1); |
|
|
|
Vector128<byte> v2 = Unsafe.Add(ref vs, (nuint)2); |
|
|
|
Vector128<byte> v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); |
|
|
|
|
|
|
|
v2 = Ssse3.AlignRight(v2, v1, 8); |
|
|
|
v1 = Ssse3.AlignRight(v1, v0, 12); |
|
|
|
v2 = Vector128Utilities.AlignRight(v2, v1, 8); |
|
|
|
v1 = Vector128Utilities.AlignRight(v1, v0, 12); |
|
|
|
|
|
|
|
v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vmask), vshuffle); |
|
|
|
v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vmask), vshuffle); |
|
|
|
v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vmask), vshuffle); |
|
|
|
v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vmask), vshuffle); |
|
|
|
v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16), mask); |
|
|
|
v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16), mask); |
|
|
|
v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16), mask); |
|
|
|
v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16), mask); |
|
|
|
|
|
|
|
v0 = Ssse3.Shuffle(v0, vmaske); |
|
|
|
v1 = Ssse3.Shuffle(v1, vmasko); |
|
|
|
v2 = Ssse3.Shuffle(v2, vmaske); |
|
|
|
v3 = Ssse3.Shuffle(v3, vmasko); |
|
|
|
v0 = Vector128Utilities.Shuffle(v0, maskE); |
|
|
|
v1 = Vector128Utilities.Shuffle(v1, maskSlice4Nx16); |
|
|
|
v2 = Vector128Utilities.Shuffle(v2, maskE); |
|
|
|
v3 = Vector128Utilities.Shuffle(v3, maskSlice4Nx16); |
|
|
|
|
|
|
|
v0 = Ssse3.AlignRight(v1, v0, 4); |
|
|
|
v3 = Ssse3.AlignRight(v3, v2, 12); |
|
|
|
v0 = Vector128Utilities.AlignRight(v1, v0, 4); |
|
|
|
v3 = Vector128Utilities.AlignRight(v3, v2, 12); |
|
|
|
|
|
|
|
v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); |
|
|
|
v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); |
|
|
|
v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4); |
|
|
|
v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); |
|
|
|
|
|
|
|
v1 = Ssse3.AlignRight(v2, v1, 8); |
|
|
|
v1 = Vector128Utilities.AlignRight(v2, v1, 8); |
|
|
|
|
|
|
|
ref Vector128<byte> vd = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, i); |
|
|
|
|
|
|
|
vd = v0; |
|
|
|
Unsafe.Add(ref vd, 1) = v1; |
|
|
|
Unsafe.Add(ref vd, 2) = v3; |
|
|
|
Unsafe.Add(ref vd, (nuint)1) = v1; |
|
|
|
Unsafe.Add(ref vd, (nuint)2) = v3; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -441,23 +499,23 @@ internal static partial class SimdUtils |
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static void Pad3Shuffle4( |
|
|
|
ReadOnlySpan<byte> source, |
|
|
|
Span<byte> dest, |
|
|
|
byte control) |
|
|
|
Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Ssse3.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) |
|
|
|
{ |
|
|
|
Vector128<byte> vmask = ShuffleMaskPad4Nx16(); |
|
|
|
Vector128<byte> vfill = Vector128.Create(0xff000000ff000000ul).AsByte(); |
|
|
|
Vector128<byte> maskPad4Nx16 = ShuffleMaskPad4Nx16(); |
|
|
|
Vector128<byte> fill = Vector128.Create(0xff000000ff000000ul).AsByte(); |
|
|
|
|
|
|
|
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref bytes, control); |
|
|
|
Vector128<byte> vshuffle = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes)); |
|
|
|
Span<byte> temp = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref temp, control); |
|
|
|
Vector128<byte> mask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
ref Vector128<byte> sourceBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
|
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
ref Vector128<byte> destinationBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
nuint n = source.Vector128Count<byte>(); |
|
|
|
|
|
|
|
@ -466,17 +524,17 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<byte> v0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
Vector128<byte> v1 = Unsafe.Add(ref v0, 1); |
|
|
|
Vector128<byte> v2 = Unsafe.Add(ref v0, 2); |
|
|
|
Vector128<byte> v3 = Sse2.ShiftRightLogical128BitLane(v2, 4); |
|
|
|
Vector128<byte> v3 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); |
|
|
|
|
|
|
|
v2 = Ssse3.AlignRight(v2, v1, 8); |
|
|
|
v1 = Ssse3.AlignRight(v1, v0, 12); |
|
|
|
v2 = Vector128Utilities.AlignRight(v2, v1, 8); |
|
|
|
v1 = Vector128Utilities.AlignRight(v1, v0, 12); |
|
|
|
|
|
|
|
ref Vector128<byte> vd = ref Unsafe.Add(ref destBase, j); |
|
|
|
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j); |
|
|
|
|
|
|
|
vd = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v0, vmask), vfill), vshuffle); |
|
|
|
Unsafe.Add(ref vd, 1) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v1, vmask), vfill), vshuffle); |
|
|
|
Unsafe.Add(ref vd, 2) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v2, vmask), vfill), vshuffle); |
|
|
|
Unsafe.Add(ref vd, 3) = Ssse3.Shuffle(Sse2.Or(Ssse3.Shuffle(v3, vmask), vfill), vshuffle); |
|
|
|
vd = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, maskPad4Nx16) | fill, mask); |
|
|
|
Unsafe.Add(ref vd, 1) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, maskPad4Nx16) | fill, mask); |
|
|
|
Unsafe.Add(ref vd, 2) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, maskPad4Nx16) | fill, mask); |
|
|
|
Unsafe.Add(ref vd, 3) = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, maskPad4Nx16) | fill, mask); |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
@ -484,23 +542,23 @@ internal static partial class SimdUtils |
|
|
|
[MethodImpl(InliningOptions.ShortMethod)] |
|
|
|
private static void Shuffle4Slice3( |
|
|
|
ReadOnlySpan<byte> source, |
|
|
|
Span<byte> dest, |
|
|
|
byte control) |
|
|
|
Span<byte> destination, |
|
|
|
[ConstantExpected] byte control) |
|
|
|
{ |
|
|
|
if (Ssse3.IsSupported) |
|
|
|
if (Vector128.IsHardwareAccelerated && Vector128Utilities.SupportsShuffleByte && Vector128Utilities.SupportsShiftByte) |
|
|
|
{ |
|
|
|
Vector128<byte> vmasko = ShuffleMaskSlice4Nx16(); |
|
|
|
Vector128<byte> vmaske = Ssse3.AlignRight(vmasko, vmasko, 12); |
|
|
|
Vector128<byte> maskSlice4Nx16 = ShuffleMaskSlice4Nx16(); |
|
|
|
Vector128<byte> maskE = Vector128Utilities.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12); |
|
|
|
|
|
|
|
Span<byte> bytes = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref bytes, control); |
|
|
|
Vector128<byte> vshuffle = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(bytes)); |
|
|
|
Span<byte> temp = stackalloc byte[Vector128<byte>.Count]; |
|
|
|
Shuffle.MMShuffleSpan(ref temp, control); |
|
|
|
Vector128<byte> mask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(temp)); |
|
|
|
|
|
|
|
ref Vector128<byte> sourceBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
|
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
ref Vector128<byte> destinationBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
nuint n = source.Vector128Count<byte>(); |
|
|
|
|
|
|
|
@ -513,20 +571,20 @@ internal static partial class SimdUtils |
|
|
|
Vector128<byte> v2 = Unsafe.Add(ref vs, 2); |
|
|
|
Vector128<byte> v3 = Unsafe.Add(ref vs, 3); |
|
|
|
|
|
|
|
v0 = Ssse3.Shuffle(Ssse3.Shuffle(v0, vshuffle), vmaske); |
|
|
|
v1 = Ssse3.Shuffle(Ssse3.Shuffle(v1, vshuffle), vmasko); |
|
|
|
v2 = Ssse3.Shuffle(Ssse3.Shuffle(v2, vshuffle), vmaske); |
|
|
|
v3 = Ssse3.Shuffle(Ssse3.Shuffle(v3, vshuffle), vmasko); |
|
|
|
v0 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v0, mask), maskE); |
|
|
|
v1 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v1, mask), maskSlice4Nx16); |
|
|
|
v2 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v2, mask), maskE); |
|
|
|
v3 = Vector128Utilities.Shuffle(Vector128Utilities.Shuffle(v3, mask), maskSlice4Nx16); |
|
|
|
|
|
|
|
v0 = Ssse3.AlignRight(v1, v0, 4); |
|
|
|
v3 = Ssse3.AlignRight(v3, v2, 12); |
|
|
|
v0 = Vector128Utilities.AlignRight(v1, v0, 4); |
|
|
|
v3 = Vector128Utilities.AlignRight(v3, v2, 12); |
|
|
|
|
|
|
|
v1 = Sse2.ShiftLeftLogical128BitLane(v1, 4); |
|
|
|
v2 = Sse2.ShiftRightLogical128BitLane(v2, 4); |
|
|
|
v1 = Vector128Utilities.ShiftLeftBytesInVector(v1, 4); |
|
|
|
v2 = Vector128Utilities.ShiftRightBytesInVector(v2, 4); |
|
|
|
|
|
|
|
v1 = Ssse3.AlignRight(v2, v1, 8); |
|
|
|
v1 = Vector128Utilities.AlignRight(v2, v1, 8); |
|
|
|
|
|
|
|
ref Vector128<byte> vd = ref Unsafe.Add(ref destBase, j); |
|
|
|
ref Vector128<byte> vd = ref Unsafe.Add(ref destinationBase, j); |
|
|
|
|
|
|
|
vd = v0; |
|
|
|
Unsafe.Add(ref vd, 1) = v1; |
|
|
|
|