|
|
|
@ -221,11 +221,11 @@ internal static partial class SimdUtils |
|
|
|
ref Vector256<float> destBase = |
|
|
|
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = dest.Length / Vector256<float>.Count; |
|
|
|
int m = Numerics.Modulo4(n); |
|
|
|
int u = n - m; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector256<float>.Count; |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
|
|
|
|
for (int i = 0; i < u; i += 4) |
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector256<float> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector256<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
@ -238,7 +238,7 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (int i = u; i < n; i++) |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destBase, i) = Avx.Permute(Unsafe.Add(ref sourceBase, i), control); |
|
|
|
} |
|
|
|
@ -253,11 +253,11 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<float> destBase = |
|
|
|
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = dest.Length / Vector128<float>.Count; |
|
|
|
int m = Numerics.Modulo4(n); |
|
|
|
int u = n - m; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector128<float>.Count; |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
|
|
|
|
for (int i = 0; i < u; i += 4) |
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector128<float> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
ref Vector128<float> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
@ -276,7 +276,7 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (int i = u; i < n; i++) |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Vector128<float> vs = Unsafe.Add(ref sourceBase, i); |
|
|
|
Unsafe.Add(ref destBase, i) = Sse.Shuffle(vs, vs, control); |
|
|
|
@ -306,11 +306,11 @@ internal static partial class SimdUtils |
|
|
|
ref Vector256<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = dest.Length / Vector256<byte>.Count; |
|
|
|
int m = Numerics.Modulo4(n); |
|
|
|
int u = n - m; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector256<byte>.Count; |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
|
|
|
|
for (int i = 0; i < u; i += 4) |
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector256<byte> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
@ -323,7 +323,7 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (int i = u; i < n; i++) |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destBase, i) = Avx2.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); |
|
|
|
} |
|
|
|
@ -342,11 +342,11 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = dest.Length / Vector128<byte>.Count; |
|
|
|
int m = Numerics.Modulo4(n); |
|
|
|
int u = n - m; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector128<byte>.Count; |
|
|
|
nint m = Numerics.Modulo4(n); |
|
|
|
nint u = n - m; |
|
|
|
|
|
|
|
for (int i = 0; i < u; i += 4) |
|
|
|
for (nint i = 0; i < u; i += 4) |
|
|
|
{ |
|
|
|
ref Vector128<byte> vs0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
ref Vector128<byte> vd0 = ref Unsafe.Add(ref destBase, i); |
|
|
|
@ -359,7 +359,7 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
if (m > 0) |
|
|
|
{ |
|
|
|
for (int i = u; i < n; i++) |
|
|
|
for (nint i = u; i < n; i++) |
|
|
|
{ |
|
|
|
Unsafe.Add(ref destBase, i) = Ssse3.Shuffle(Unsafe.Add(ref sourceBase, i), vshuffle); |
|
|
|
} |
|
|
|
@ -391,9 +391,9 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = source.Length / Vector128<byte>.Count; |
|
|
|
nint n = (nint)(uint)source.Length / Vector128<byte>.Count; |
|
|
|
|
|
|
|
for (int i = 0; i < n; i += 3) |
|
|
|
for (nint i = 0; i < n; i += 3) |
|
|
|
{ |
|
|
|
ref Vector128<byte> vs = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
|
|
|
|
@ -454,9 +454,9 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = source.Length / Vector128<byte>.Count; |
|
|
|
nint n = (nint)(uint)source.Length / Vector128<byte>.Count; |
|
|
|
|
|
|
|
for (int i = 0, j = 0; i < n; i += 3, j += 4) |
|
|
|
for (nint i = 0, j = 0; i < n; i += 3, j += 4) |
|
|
|
{ |
|
|
|
ref Vector128<byte> v0 = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
Vector128<byte> v1 = Unsafe.Add(ref v0, 1); |
|
|
|
@ -498,9 +498,9 @@ internal static partial class SimdUtils |
|
|
|
ref Vector128<byte> destBase = |
|
|
|
ref Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
int n = source.Length / Vector128<byte>.Count; |
|
|
|
nint n = (nint)(uint)source.Length / Vector128<byte>.Count; |
|
|
|
|
|
|
|
for (int i = 0, j = 0; i < n; i += 4, j += 3) |
|
|
|
for (nint i = 0, j = 0; i < n; i += 4, j += 3) |
|
|
|
{ |
|
|
|
ref Vector128<byte> vs = ref Unsafe.Add(ref sourceBase, i); |
|
|
|
|
|
|
|
@ -650,16 +650,16 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
VerifySpanInput(source, dest, Vector256<byte>.Count); |
|
|
|
|
|
|
|
int n = dest.Length / Vector256<byte>.Count; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector256<byte>.Count; |
|
|
|
|
|
|
|
ref Vector256<float> destBase = |
|
|
|
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
|
|
|
|
var scale = Vector256.Create(1 / (float)byte.MaxValue); |
|
|
|
|
|
|
|
for (int i = 0; i < n; i++) |
|
|
|
for (nint i = 0; i < n; i++) |
|
|
|
{ |
|
|
|
int si = Vector256<byte>.Count * i; |
|
|
|
nint si = Vector256<byte>.Count * i; |
|
|
|
Vector256<int> i0 = Avx2.ConvertToVector256Int32(sourceBase + si); |
|
|
|
Vector256<int> i1 = Avx2.ConvertToVector256Int32(sourceBase + si + Vector256<int>.Count); |
|
|
|
Vector256<int> i2 = Avx2.ConvertToVector256Int32(sourceBase + si + (Vector256<int>.Count * 2)); |
|
|
|
@ -683,7 +683,7 @@ internal static partial class SimdUtils |
|
|
|
// Sse
|
|
|
|
VerifySpanInput(source, dest, Vector128<byte>.Count); |
|
|
|
|
|
|
|
int n = dest.Length / Vector128<byte>.Count; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector128<byte>.Count; |
|
|
|
|
|
|
|
ref Vector128<float> destBase = |
|
|
|
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(dest)); |
|
|
|
@ -691,9 +691,9 @@ internal static partial class SimdUtils |
|
|
|
var scale = Vector128.Create(1 / (float)byte.MaxValue); |
|
|
|
Vector128<byte> zero = Vector128<byte>.Zero; |
|
|
|
|
|
|
|
for (int i = 0; i < n; i++) |
|
|
|
for (nint i = 0; i < n; i++) |
|
|
|
{ |
|
|
|
int si = Vector128<byte>.Count * i; |
|
|
|
nint si = Vector128<byte>.Count * i; |
|
|
|
|
|
|
|
Vector128<int> i0, i1, i2, i3; |
|
|
|
if (Sse41.IsSupported) |
|
|
|
@ -782,7 +782,7 @@ internal static partial class SimdUtils |
|
|
|
{ |
|
|
|
VerifySpanInput(source, dest, Vector256<byte>.Count); |
|
|
|
|
|
|
|
int n = dest.Length / Vector256<byte>.Count; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector256<byte>.Count; |
|
|
|
|
|
|
|
ref Vector256<float> sourceBase = |
|
|
|
ref Unsafe.As<float, Vector256<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
@ -794,7 +794,7 @@ internal static partial class SimdUtils |
|
|
|
ref byte maskBase = ref MemoryMarshal.GetReference(PermuteMaskDeinterleave8x32); |
|
|
|
Vector256<int> mask = Unsafe.As<byte, Vector256<int>>(ref maskBase); |
|
|
|
|
|
|
|
for (int i = 0; i < n; i++) |
|
|
|
for (nint i = 0; i < n; i++) |
|
|
|
{ |
|
|
|
ref Vector256<float> s = ref Unsafe.Add(ref sourceBase, i * 4); |
|
|
|
|
|
|
|
@ -821,7 +821,7 @@ internal static partial class SimdUtils |
|
|
|
// Sse
|
|
|
|
VerifySpanInput(source, dest, Vector128<byte>.Count); |
|
|
|
|
|
|
|
int n = dest.Length / Vector128<byte>.Count; |
|
|
|
nint n = (nint)(uint)dest.Length / Vector128<byte>.Count; |
|
|
|
|
|
|
|
ref Vector128<float> sourceBase = |
|
|
|
ref Unsafe.As<float, Vector128<float>>(ref MemoryMarshal.GetReference(source)); |
|
|
|
@ -831,7 +831,7 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
var scale = Vector128.Create((float)byte.MaxValue); |
|
|
|
|
|
|
|
for (int i = 0; i < n; i++) |
|
|
|
for (nint i = 0; i < n; i++) |
|
|
|
{ |
|
|
|
ref Vector128<float> s = ref Unsafe.Add(ref sourceBase, i * 4); |
|
|
|
|
|
|
|
@ -864,7 +864,7 @@ internal static partial class SimdUtils |
|
|
|
ref Vector256<byte> bBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(blueChannel)); |
|
|
|
ref byte dBase = ref Unsafe.As<Rgb24, byte>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
int count = redChannel.Length / Vector256<byte>.Count; |
|
|
|
nint count = (nint)(uint)redChannel.Length / Vector256<byte>.Count; |
|
|
|
|
|
|
|
ref byte control1Bytes = ref MemoryMarshal.GetReference(PermuteMaskEvenOdd8x32); |
|
|
|
Vector256<uint> control1 = Unsafe.As<byte, Vector256<uint>>(ref control1Bytes); |
|
|
|
@ -875,7 +875,7 @@ internal static partial class SimdUtils |
|
|
|
|
|
|
|
Vector256<byte> shuffleAlpha = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(ShuffleMaskShiftAlpha)); |
|
|
|
|
|
|
|
for (int i = 0; i < count; i++) |
|
|
|
for (nint i = 0; i < count; i++) |
|
|
|
{ |
|
|
|
Vector256<byte> r0 = Unsafe.Add(ref rBase, i); |
|
|
|
Vector256<byte> g0 = Unsafe.Add(ref gBase, i); |
|
|
|
@ -918,7 +918,7 @@ internal static partial class SimdUtils |
|
|
|
Unsafe.As<byte, Vector256<byte>>(ref d4) = rgb4; |
|
|
|
} |
|
|
|
|
|
|
|
int slice = count * Vector256<byte>.Count; |
|
|
|
int slice = (int)count * Vector256<byte>.Count; |
|
|
|
redChannel = redChannel[slice..]; |
|
|
|
greenChannel = greenChannel[slice..]; |
|
|
|
blueChannel = blueChannel[slice..]; |
|
|
|
@ -936,12 +936,12 @@ internal static partial class SimdUtils |
|
|
|
ref Vector256<byte> bBase = ref Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(blueChannel)); |
|
|
|
ref Vector256<byte> dBase = ref Unsafe.As<Rgba32, Vector256<byte>>(ref MemoryMarshal.GetReference(destination)); |
|
|
|
|
|
|
|
int count = redChannel.Length / Vector256<byte>.Count; |
|
|
|
nint count = (nint)(uint)redChannel.Length / Vector256<byte>.Count; |
|
|
|
ref byte control1Bytes = ref MemoryMarshal.GetReference(PermuteMaskEvenOdd8x32); |
|
|
|
Vector256<uint> control1 = Unsafe.As<byte, Vector256<uint>>(ref control1Bytes); |
|
|
|
var a = Vector256.Create((byte)255); |
|
|
|
|
|
|
|
for (int i = 0; i < count; i++) |
|
|
|
for (nint i = 0; i < count; i++) |
|
|
|
{ |
|
|
|
Vector256<byte> r0 = Unsafe.Add(ref rBase, i); |
|
|
|
Vector256<byte> g0 = Unsafe.Add(ref gBase, i); |
|
|
|
@ -970,7 +970,7 @@ internal static partial class SimdUtils |
|
|
|
Unsafe.Add(ref d0, 3) = rgb4; |
|
|
|
} |
|
|
|
|
|
|
|
int slice = count * Vector256<byte>.Count; |
|
|
|
int slice = (int)count * Vector256<byte>.Count; |
|
|
|
redChannel = redChannel[slice..]; |
|
|
|
greenChannel = greenChannel[slice..]; |
|
|
|
blueChannel = blueChannel[slice..]; |
|
|
|
|