diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index 50eeb8e0a7..760296c9d3 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -608,6 +608,44 @@ internal static class Vector128_
return Vector128.Narrow(prodLo, prodHi);
}
/// <summary>
/// Multiplies the packed 16-bit unsigned integers in <paramref name="left"/> and <paramref name="right"/>,
/// producing intermediate unsigned 32-bit integers, and stores the high 16 bits of the intermediate
/// integers in the result.
/// </summary>
/// <param name="left">The first vector containing packed 16-bit unsigned integers to multiply.</param>
/// <param name="right">The second vector containing packed 16-bit unsigned integers to multiply.</param>
/// <returns>
/// A vector containing the high 16 bits of the products of the packed 16-bit unsigned integers
/// from <paramref name="left"/> and <paramref name="right"/>.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<ushort> MultiplyHigh(Vector128<ushort> left, Vector128<ushort> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.MultiplyHigh(left, right);
    }

    // Widen each half of the ushort vectors into two uint vectors.
    (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
    (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);

    // Elementwise multiply: each uint lane now holds the full 32-bit product.
    Vector128<uint> prodLo = leftLo * rightLo;
    Vector128<uint> prodHi = leftHi * rightHi;

    // Logical shift right by 16 bits to extract the high word
    // (lanes are unsigned, so >> on uint is a logical shift).
    prodLo >>= 16;
    prodHi >>= 16;

    // Narrow the two uint vectors back into one ushort vector.
    return Vector128.Narrow(prodLo, prodHi);
}
+
///
/// Unpack and interleave 64-bit integers from the high half of and
/// and store the results in the result.
@@ -927,7 +965,7 @@ internal static class Vector128_
/// The second vector containing packed signed 16-bit integers to subtract.
///
///
- /// A vector containing the results of subtracting packed unsigned 16-bit integers
+ /// A vector containing the results of subtracting packed signed 16-bit integers
///
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128 SubtractSaturate(Vector128 left, Vector128 right)
@@ -967,7 +1005,57 @@ internal static class Vector128_
}
/// <summary>
/// Subtracts packed unsigned 16-bit integers in <paramref name="right"/> from packed unsigned 16-bit
/// integers in <paramref name="left"/> using saturation, and stores the results.
/// </summary>
/// <param name="left">The first vector containing packed unsigned 16-bit integers to subtract from.</param>
/// <param name="right">The second vector containing packed unsigned 16-bit integers to subtract.</param>
/// <returns>
/// A vector containing the results of subtracting packed unsigned 16-bit integers with saturation.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<ushort> SubtractSaturate(Vector128<ushort> left, Vector128<ushort> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.SubtractSaturate(left, right);
    }

    if (AdvSimd.IsSupported)
    {
        return AdvSimd.SubtractSaturate(left, right);
    }

    if (PackedSimd.IsSupported)
    {
        return PackedSimd.SubtractSaturate(left, right);
    }

    // Saturating unsigned subtraction is max(left - right, 0), which is equivalent to
    // left - min(left, right) and can never underflow.
    // (Widening to uint and subtracting directly is wrong: for left < right the uint
    // difference wraps to a huge value and a [0, 65535] clamp would then saturate to
    // ushort.MaxValue instead of zero.)
    return left - Vector128.Min(left, right);
}
+
+ ///
+ /// Add packed unsigned 8-bit integers in to packed unsigned 8-bit integers
/// in using saturation, and store the results.
///
///
@@ -1015,6 +1103,55 @@ internal static class Vector128_
return Vector128.Narrow(sumLo, sumHi);
}
/// <summary>
/// Adds packed unsigned 16-bit integers in <paramref name="left"/> to packed unsigned 16-bit integers
/// in <paramref name="right"/> using saturation, and stores the results.
/// </summary>
/// <param name="left">The first vector containing packed unsigned 16-bit integers to add to.</param>
/// <param name="right">The second vector containing packed unsigned 16-bit integers to add.</param>
/// <returns>
/// A vector containing the results of adding packed unsigned 16-bit integers with saturation.
/// </returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<ushort> AddSaturate(Vector128<ushort> left, Vector128<ushort> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.AddSaturate(left, right);
    }

    if (AdvSimd.IsSupported)
    {
        return AdvSimd.AddSaturate(left, right);
    }

    if (PackedSimd.IsSupported)
    {
        return PackedSimd.AddSaturate(left, right);
    }

    // Widen inputs to 32-bit unsigned: each uint lane can hold the full sum
    // (at most 2 * 65535), so the addition itself cannot overflow.
    (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
    (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);

    // Clamp to the unsigned 16-bit range; no lower bound is needed because
    // the unsigned sums are never negative.
    Vector128<uint> max = Vector128.Create((uint)ushort.MaxValue);
    Vector128<uint> sumLo = Vector128.Min(leftLo + rightLo, max);
    Vector128<uint> sumHi = Vector128.Min(leftHi + rightHi, max);

    // Narrow back to 16-bit unsigned.
    return Vector128.Narrow(sumLo, sumHi);
}
+
///
/// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers
/// in using saturation, and store the results.
diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
index 40146c6af8..d5f91b7c88 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@@ -5,7 +5,7 @@ using System.Buffers;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@@ -29,9 +29,9 @@ internal static class YuvConversion
// ([3*a + b + 9*c + 3*d, a + 3*b + 3*c + 9*d] + [8, 8]) / 16
public static void UpSample(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer)
{
- if (Sse41.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
- UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+ UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
}
else
{
@@ -107,7 +107,7 @@ internal static class YuvConversion
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
- private static void UpSampleSse41(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer)
+ private static void UpSampleVector128(Span topY, Span bottomY, Span topU, Span topV, Span curU, Span curV, Span topDst, Span bottomDst, int len, byte[] uvBuffer)
{
const int xStep = 3;
Array.Clear(uvBuffer);
@@ -138,18 +138,18 @@ internal static class YuvConversion
{
for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
{
- UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
- UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
- ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+ UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+ UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+ ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
}
}
else
{
for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
{
- UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
- UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
- ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+ UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+ UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+ ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
}
}
@@ -161,18 +161,18 @@ internal static class YuvConversion
Span tmpBottomDst = tmpTopDst[(4 * 32)..];
Span tmpTop = tmpBottomDst[(4 * 32)..];
Span tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
- UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
- UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
+ UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
+ UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);
topY[pos..len].CopyTo(tmpTop);
if (!bottomY.IsEmpty)
{
bottomY[pos..len].CopyTo(tmpBottom);
- ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+ ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
}
else
{
- ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+ ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
}
tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@@ -184,7 +184,7 @@ internal static class YuvConversion
}
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
- private static void UpSample32Pixels(ref byte r1, ref byte r2, Span output)
+ private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span output)
{
// Load inputs.
Vector128 a = Unsafe.As>(ref r1);
@@ -192,28 +192,28 @@ internal static class YuvConversion
Vector128 c = Unsafe.As>(ref r2);
Vector128 d = Unsafe.As>(ref Unsafe.Add(ref r2, 1));
- Vector128 s = Sse2.Average(a, d); // s = (a + d + 1) / 2
- Vector128 t = Sse2.Average(b, c); // t = (b + c + 1) / 2
- Vector128 st = Sse2.Xor(s, t); // st = s^t
+ Vector128 s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
+ Vector128 t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
+ Vector128 st = s ^ t; // st = s^t
- Vector128 ad = Sse2.Xor(a, d); // ad = a^d
- Vector128 bc = Sse2.Xor(b, c); // bc = b^c
+ Vector128 ad = a ^ d; // ad = a^d
+ Vector128 bc = b ^ c; // bc = b^c
- Vector128 t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
- Vector128 t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
- Vector128 t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
- Vector128 t4 = Sse2.Average(s, t);
- Vector128 k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+ Vector128 t1 = ad | bc; // (a^d) | (b^c)
+ Vector128 t2 = t1 | st; // (a^d) | (b^c) | (s^t)
+ Vector128 t3 = t2 & Vector128.Create((byte)1); // (a^d) | (b^c) | (s^t) & 1
+ Vector128 t4 = Vector128_.Average(s, t);
+ Vector128 k = t4 - t3; // k = (a + b + c + d) / 4
- Vector128 diag1 = GetM(k, st, bc, t);
- Vector128 diag2 = GetM(k, st, ad, s);
+ Vector128 diag1 = GetMVector128(k, st, bc, t);
+ Vector128 diag2 = GetMVector128(k, st, ad, s);
// Pack the alternate pixels.
- PackAndStore(a, b, diag1, diag2, output); // store top.
- PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
+ PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
+ PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
}
- private static void UpSampleLastBlock(Span tb, Span bb, int numPixels, Span output)
+ private static void UpSampleLastBlockVector128(Span tb, Span bb, int numPixels, Span output)
{
Span r1 = stackalloc byte[17];
Span r2 = stackalloc byte[17];
@@ -230,27 +230,27 @@ internal static class YuvConversion
ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
- UpSample32Pixels(ref r1Ref, ref r2Ref, output);
+ UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
}
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
- private static Vector128 GetM(Vector128 k, Vector128 st, Vector128 ij, Vector128 input)
+ private static Vector128 GetMVector128(Vector128 k, Vector128 st, Vector128 ij, Vector128 input)
{
- Vector128 tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
- Vector128 tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
- Vector128 tmp2 = Sse2.Xor(k, input); // (k^in)
- Vector128 tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
- Vector128 tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
+ Vector128 tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
+ Vector128 tmp1 = ij & st; // (ij) & (s^t)
+ Vector128 tmp2 = k ^ input; // (k^in)
+ Vector128 tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
+ Vector128 tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction
- return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
+ return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
}
- private static void PackAndStore(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output)
+ private static void PackAndStoreVector128(Vector128 a, Vector128 b, Vector128 da, Vector128 db, Span output)
{
- Vector128 ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
- Vector128 tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
- Vector128 t1 = Sse2.UnpackLow(ta, tb);
- Vector128 t2 = Sse2.UnpackHigh(ta, tb);
+ Vector128 ta = Vector128_.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
+ Vector128 tb = Vector128_.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
+ Vector128 t1 = Vector128_.UnpackLow(ta, tb);
+ Vector128 t2 = Vector128_.UnpackHigh(ta, tb);
ref byte output0Ref = ref MemoryMarshal.GetReference(output);
ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@@ -562,41 +562,42 @@ internal static class YuvConversion
}
[MethodImpl(InliningOptions.ShortMethod)]
- private static void ConvertYuvToBgrSse41(Span topY, Span topDst, Span ru, Span rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
+ private static void ConvertYuvToBgrVector128(Span topY, Span topDst, Span ru, Span rv, int curX, int step)
+ => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
[MethodImpl(InliningOptions.ShortMethod)]
- private static void ConvertYuvToBgrWithBottomYSse41(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step)
+ private static void ConvertYuvToBgrWithBottomYVector128(Span topY, Span bottomY, Span topDst, Span bottomDst, Span ru, Span rv, int curX, int step)
{
- YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
- YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
+ YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
+ YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
}
- private static void YuvToBgrSse41(Span y, Span u, Span v, Span dst)
+ private static void YuvToBgrVector128(Span y, Span u, Span v, Span dst)
{
ref byte yRef = ref MemoryMarshal.GetReference(y);
ref byte uRef = ref MemoryMarshal.GetReference(u);
ref byte vRef = ref MemoryMarshal.GetReference(v);
- ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0);
- ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1);
- ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2);
- ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3);
+ ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128 r0, out Vector128 g0, out Vector128 b0);
+ ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128 r1, out Vector128 g1, out Vector128 b1);
+ ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128 r2, out Vector128 g2, out Vector128 b2);
+ ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128 r3, out Vector128 g3, out Vector128 b3);
// Cast to 8b and store as BBBBGGGGRRRR.
- Vector128 bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
- Vector128 bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
- Vector128 bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
- Vector128 bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
- Vector128 bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
- Vector128 bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
+ Vector128 bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
+ Vector128 bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
+ Vector128 bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
+ Vector128 bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
+ Vector128 bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
+ Vector128 bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);
// Pack as BGRBGRBGRBGR.
- PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
+ PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
- private static void PlanarTo24bSse41(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb)
+ private static void PlanarTo24bVector128(Vector128 input0, Vector128 input1, Vector128 input2, Vector128 input3, Vector128 input4, Vector128 input5, Span rgb)
{
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
@@ -612,7 +613,7 @@ internal static class YuvConversion
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
// Process R.
- ChannelMixing(
+ ChannelMixingVector128(
input0,
input1,
Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0
@@ -627,7 +628,7 @@ internal static class YuvConversion
// Process G.
// Same as before, just shifted to the left by one and including the right padding.
- ChannelMixing(
+ ChannelMixingVector128(
input2,
input3,
Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3
@@ -641,7 +642,7 @@ internal static class YuvConversion
out Vector128 g5);
// Process B.
- ChannelMixing(
+ ChannelMixingVector128(
input4,
input5,
Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6
@@ -655,24 +656,24 @@ internal static class YuvConversion
out Vector128 b5);
// OR the different channels.
- Vector128 rg0 = Sse2.Or(r0, g0);
- Vector128 rg1 = Sse2.Or(r1, g1);
- Vector128 rg2 = Sse2.Or(r2, g2);
- Vector128 rg3 = Sse2.Or(r3, g3);
- Vector128 rg4 = Sse2.Or(r4, g4);
- Vector128 rg5 = Sse2.Or(r5, g5);
+ Vector128 rg0 = r0 | g0;
+ Vector128 rg1 = r1 | g1;
+ Vector128 rg2 = r2 | g2;
+ Vector128 rg3 = r3 | g3;
+ Vector128 rg4 = r4 | g4;
+ Vector128 rg5 = r5 | g5;
ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
- Unsafe.As>(ref outputRef) = Sse2.Or(rg0, b0);
- Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
- Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
- Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
- Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
- Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+ Unsafe.As>(ref outputRef) = rg0 | b0;
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
+ Unsafe.As>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
}
// Shuffles the input buffer as A0 0 0 A1 0 0 A2
- private static void ChannelMixing(
+ private static void ChannelMixingVector128(
Vector128 input0,
Vector128 input1,
Vector128 shuffle0,
@@ -685,53 +686,53 @@ internal static class YuvConversion
out Vector128 output4,
out Vector128 output5)
{
- output0 = Ssse3.Shuffle(input0, shuffle0);
- output1 = Ssse3.Shuffle(input0, shuffle1);
- output2 = Ssse3.Shuffle(input0, shuffle2);
- output3 = Ssse3.Shuffle(input1, shuffle0);
- output4 = Ssse3.Shuffle(input1, shuffle1);
- output5 = Ssse3.Shuffle(input1, shuffle2);
+ output0 = Vector128_.ShuffleNative(input0, shuffle0);
+ output1 = Vector128_.ShuffleNative(input0, shuffle1);
+ output2 = Vector128_.ShuffleNative(input0, shuffle2);
+ output3 = Vector128_.ShuffleNative(input1, shuffle0);
+ output4 = Vector128_.ShuffleNative(input1, shuffle1);
+ output5 = Vector128_.ShuffleNative(input1, shuffle2);
}
// Convert 32 samples of YUV444 to B/G/R
- private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128 r, out Vector128 g, out Vector128 b)
+ private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128