diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index 50eeb8e0a7..760296c9d3 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -608,6 +608,44 @@ internal static class Vector128_
         return Vector128.Narrow(prodLo, prodHi);
     }
 
+    /// <summary>
+    /// Multiply the packed 16-bit unsigned integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate unsigned 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 16-bit unsigned integers to multiply.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 16-bit unsigned integers to multiply.
+    /// </param>
+    /// <returns>
+    /// A vector containing the high 16 bits of the products of the packed 16-bit unsigned integers
+    /// from <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<ushort> MultiplyHigh(Vector128<ushort> left, Vector128<ushort> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.MultiplyHigh(left, right);
+        }
+
+        // Widen each half of the ushort vectors into two uint vectors
+        (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
+        (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);
+
+        // Elementwise multiply: each uint lane now holds the full 32-bit product
+        Vector128<uint> prodLo = leftLo * rightLo;
+        Vector128<uint> prodHi = leftHi * rightHi;
+
+        // Logical shift right by 16 bits (the lanes are unsigned) to extract the high word
+        prodLo >>= 16;
+        prodHi >>= 16;
+
+        // Narrow the two uint vectors back into one ushort vector
+        return Vector128.Narrow(prodLo, prodHi);
+    }
+
     /// <summary>
     /// Unpack and interleave 64-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
     /// and store the results in the result.
@@ -927,7 +965,7 @@ internal static class Vector128_
     /// The second vector containing packed signed 16-bit integers to subtract.
     /// </param>
     /// <returns>
-    /// A vector containing the results of subtracting packed unsigned 16-bit integers
+    /// A vector containing the results of subtracting packed signed 16-bit integers
     /// </returns>
     [MethodImpl(MethodImplOptions.AggressiveInlining)]
     public static Vector128<short> SubtractSaturate(Vector128<short> left, Vector128<short> right)
@@ -967,7 +1005,43 @@ internal static class Vector128_
     }
 
     /// <summary>
-    /// Add packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
+    /// Subtract packed unsigned 16-bit integers in <paramref name="right"/> from packed unsigned 16-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of subtracting packed unsigned 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<ushort> SubtractSaturate(Vector128<ushort> left, Vector128<ushort> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.SubtractSaturate(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.SubtractSaturate(left, right);
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.SubtractSaturate(left, right);
+        }
+
+        // Software fallback. Unsigned lanes cannot go negative, so a plain subtract would wrap
+        // on underflow; lifting left to at least right first pins those lanes at zero instead.
+        return Vector128.Max(left, right) - right;
+    }
+
+    /// <summary>
+    /// Add packed unsigned 8-bit integers in <paramref name="right"/> to packed unsigned 8-bit integers
     /// in <paramref name="left"/> using saturation, and store the results.
     /// </summary>
     /// <param name="left">
@@ -1015,6 +1089,55 @@ internal static class Vector128_
         return Vector128.Narrow(sumLo, sumHi);
     }
 
+    /// <summary>
+    /// Add packed unsigned 16-bit integers in <paramref name="right"/> to packed unsigned 16-bit integers
+    /// in <paramref name="left"/> using saturation, and store the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 16-bit integers to add to.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 16-bit integers to add.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of adding packed unsigned 16-bit integers
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<ushort> AddSaturate(Vector128<ushort> left, Vector128<ushort> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.AddSaturate(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.AddSaturate(left, right);
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.AddSaturate(left, right);
+        }
+
+        // Widen inputs to 32-bit unsigned so the sum cannot overflow
+        (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
+        (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);
+
+        // Add
+        Vector128<uint> sumLo = leftLo + rightLo;
+        Vector128<uint> sumHi = leftHi + rightHi;
+
+        // Clamp to unsigned 16-bit range
+        Vector128<uint> max = Vector128.Create((uint)ushort.MaxValue);
+
+        sumLo = Clamp(sumLo, Vector128<uint>.Zero, max);
+        sumHi = Clamp(sumHi, Vector128<uint>.Zero, max);
+
+        // Narrow back to 16 bit unsigned.
+        return Vector128.Narrow(sumLo, sumHi);
+    }
+
     /// <summary>
     /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
     /// in <paramref name="left"/> using saturation, and store the results.
diff --git a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
index 40146c6af8..d5f91b7c88 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/YuvConversion.cs
@@ -5,7 +5,7 @@ using System.Buffers;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Memory;
 using SixLabors.ImageSharp.PixelFormats;
 
@@ -29,9 +29,9 @@ internal static class YuvConversion
     // ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
     public static void UpSample(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
     {
-        if (Sse41.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
-            UpSampleSse41(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
+            UpSampleVector128(topY, bottomY, topU, topV, curU, curV, topDst, bottomDst, len, uvBuffer);
         }
         else
         {
@@ -107,7 +107,7 @@ internal static class YuvConversion
     //
     // Then m can be written as
     // m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
-    private static void UpSampleSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
+    private static void UpSampleVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topU, Span<byte> topV, Span<byte> curU, Span<byte> curV, Span<byte> topDst, Span<byte> bottomDst, int len, byte[] uvBuffer)
     {
         const int xStep = 3;
         Array.Clear(uvBuffer);
@@ -138,18 +138,18 @@ internal static class YuvConversion
         {
             for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
             {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrWithBottomYSse41(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrWithBottomYVector128(topY, bottomY, topDst, bottomDst, ru, rv, pos, xStep);
             }
         }
         else
         {
             for (pos = 1, uvPos = 0; pos + 32 + 1 <= len; pos += 32, uvPos += 16)
             {
-                UpSample32Pixels(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
-                UpSample32Pixels(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
-                ConvertYuvToBgrSse41(topY, topDst, ru, rv, pos, xStep);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topURef, (uint)uvPos), ref Unsafe.Add(ref curURef, (uint)uvPos), ru);
+                UpSample32PixelsVector128(ref Unsafe.Add(ref topVRef, (uint)uvPos), ref Unsafe.Add(ref curVRef, (uint)uvPos), rv);
+                ConvertYuvToBgrVector128(topY, topDst, ru, rv, pos, xStep);
             }
         }
 
@@ -161,18 +161,18 @@ internal static class YuvConversion
             Span<byte> tmpBottomDst = tmpTopDst[(4 * 32)..];
             Span<byte> tmpTop = tmpBottomDst[(4 * 32)..];
             Span<byte> tmpBottom = bottomY.IsEmpty ? null : tmpTop[32..];
-            UpSampleLastBlock(topU[uvPos..], curU[uvPos..], leftOver, ru);
-            UpSampleLastBlock(topV[uvPos..], curV[uvPos..], leftOver, rv);
+            UpSampleLastBlockVector128(topU[uvPos..], curU[uvPos..], leftOver, ru);
+            UpSampleLastBlockVector128(topV[uvPos..], curV[uvPos..], leftOver, rv);
 
             topY[pos..len].CopyTo(tmpTop);
             if (!bottomY.IsEmpty)
             {
                 bottomY[pos..len].CopyTo(tmpBottom);
-                ConvertYuvToBgrWithBottomYSse41(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrWithBottomYVector128(tmpTop, tmpBottom, tmpTopDst, tmpBottomDst, ru, rv, 0, xStep);
             }
             else
             {
-                ConvertYuvToBgrSse41(tmpTop, tmpTopDst, ru, rv, 0, xStep);
+                ConvertYuvToBgrVector128(tmpTop, tmpTopDst, ru, rv, 0, xStep);
             }
 
             tmpTopDst[..((len - pos) * xStep)].CopyTo(topDst[(pos * xStep)..]);
@@ -184,7 +184,7 @@ internal static class YuvConversion
     }
 
     // Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
-    private static void UpSample32Pixels(ref byte r1, ref byte r2, Span<byte> output)
+    private static void UpSample32PixelsVector128(ref byte r1, ref byte r2, Span<byte> output)
     {
         // Load inputs.
         Vector128<byte> a = Unsafe.As<byte, Vector128<byte>>(ref r1);
@@ -192,28 +192,28 @@ internal static class YuvConversion
         Vector128<byte> c = Unsafe.As<byte, Vector128<byte>>(ref r2);
         Vector128<byte> d = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref r2, 1));
 
-        Vector128<byte> s = Sse2.Average(a, d); // s = (a + d + 1) / 2
-        Vector128<byte> t = Sse2.Average(b, c); // t = (b + c + 1) / 2
-        Vector128<byte> st = Sse2.Xor(s, t); // st = s^t
+        Vector128<byte> s = Vector128_.Average(a, d); // s = (a + d + 1) / 2
+        Vector128<byte> t = Vector128_.Average(b, c); // t = (b + c + 1) / 2
+        Vector128<byte> st = s ^ t; // st = s^t
 
-        Vector128<byte> ad = Sse2.Xor(a, d); // ad = a^d
-        Vector128<byte> bc = Sse2.Xor(b, c); // bc = b^c
+        Vector128<byte> ad = a ^ d; // ad = a^d
+        Vector128<byte> bc = b ^ c; // bc = b^c
 
-        Vector128<byte> t1 = Sse2.Or(ad, bc); // (a^d) | (b^c)
-        Vector128<byte> t2 = Sse2.Or(t1, st); // (a^d) | (b^c) | (s^t)
-        Vector128<byte> t3 = Sse2.And(t2, Vector128.Create((byte)1)); // (a^d) | (b^c) | (s^t) & 1
-        Vector128<byte> t4 = Sse2.Average(s, t);
-        Vector128<byte> k = Sse2.Subtract(t4, t3); // k = (a + b + c + d) / 4
+        Vector128<byte> t1 = ad | bc; // (a^d) | (b^c)
+        Vector128<byte> t2 = t1 | st; // (a^d) | (b^c) | (s^t)
+        Vector128<byte> t3 = t2 & Vector128.Create((byte)1); // ((a^d) | (b^c) | (s^t)) & 1
+        Vector128<byte> t4 = Vector128_.Average(s, t); // (s + t + 1) / 2
+        Vector128<byte> k = t4 - t3; // k = (a + b + c + d) / 4
 
-        Vector128<byte> diag1 = GetM(k, st, bc, t);
-        Vector128<byte> diag2 = GetM(k, st, ad, s);
+        Vector128<byte> diag1 = GetMVector128(k, st, bc, t);
+        Vector128<byte> diag2 = GetMVector128(k, st, ad, s);
 
         // Pack the alternate pixels.
-        PackAndStore(a, b, diag1, diag2, output); // store top.
-        PackAndStore(c, d, diag2, diag1, output[(2 * 32)..]);
+        PackAndStoreVector128(a, b, diag1, diag2, output); // store top.
+        PackAndStoreVector128(c, d, diag2, diag1, output[(2 * 32)..]);
     }
 
-    private static void UpSampleLastBlock(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
+    private static void UpSampleLastBlockVector128(Span<byte> tb, Span<byte> bb, int numPixels, Span<byte> output)
     {
         Span<byte> r1 = stackalloc byte[17];
         Span<byte> r2 = stackalloc byte[17];
@@ -230,27 +230,27 @@ internal static class YuvConversion
         ref byte r1Ref = ref MemoryMarshal.GetReference(r1);
         ref byte r2Ref = ref MemoryMarshal.GetReference(r2);
 
-        UpSample32Pixels(ref r1Ref, ref r2Ref, output);
+        UpSample32PixelsVector128(ref r1Ref, ref r2Ref, output);
     }
 
     // Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
-    private static Vector128<byte> GetM(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
+    private static Vector128<byte> GetMVector128(Vector128<byte> k, Vector128<byte> st, Vector128<byte> ij, Vector128<byte> input)
     {
-        Vector128<byte> tmp0 = Sse2.Average(k, input); // (k + in + 1) / 2
-        Vector128<byte> tmp1 = Sse2.And(ij, st); // (ij) & (s^t)
-        Vector128<byte> tmp2 = Sse2.Xor(k, input); // (k^in)
-        Vector128<byte> tmp3 = Sse2.Or(tmp1, tmp2); // ((ij) & (s^t)) | (k^in)
-        Vector128<byte> tmp4 = Sse2.And(tmp3, Vector128.Create((byte)1)); // & 1 -> lsb_correction
+        Vector128<byte> tmp0 = Vector128_.Average(k, input); // (k + in + 1) / 2
+        Vector128<byte> tmp1 = ij & st; // (ij) & (s^t)
+        Vector128<byte> tmp2 = k ^ input; // (k^in)
+        Vector128<byte> tmp3 = tmp1 | tmp2; // ((ij) & (s^t)) | (k^in)
+        Vector128<byte> tmp4 = tmp3 & Vector128.Create((byte)1); // & 1 -> lsb_correction
 
-        return Sse2.Subtract(tmp0, tmp4); // (k + in + 1) / 2 - lsb_correction
+        return tmp0 - tmp4; // (k + in + 1) / 2 - lsb_correction
     }
 
-    private static void PackAndStore(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
+    private static void PackAndStoreVector128(Vector128<byte> a, Vector128<byte> b, Vector128<byte> da, Vector128<byte> db, Span<byte> output)
     {
-        Vector128<byte> ta = Sse2.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
-        Vector128<byte> tb = Sse2.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
-        Vector128<byte> t1 = Sse2.UnpackLow(ta, tb);
-        Vector128<byte> t2 = Sse2.UnpackHigh(ta, tb);
+        Vector128<byte> ta = Vector128_.Average(a, da); // (9a + 3b + 3c + d + 8) / 16
+        Vector128<byte> tb = Vector128_.Average(b, db); // (3a + 9b + c + 3d + 8) / 16
+        Vector128<byte> t1 = Vector128_.UnpackLow(ta, tb);
+        Vector128<byte> t2 = Vector128_.UnpackHigh(ta, tb);
 
         ref byte output0Ref = ref MemoryMarshal.GetReference(output);
         ref byte output1Ref = ref Unsafe.Add(ref output0Ref, 16);
@@ -562,41 +562,42 @@ internal static class YuvConversion
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrSse41(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step) => YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
+    private static void ConvertYuvToBgrVector128(Span<byte> topY, Span<byte> topDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+        => YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static void ConvertYuvToBgrWithBottomYSse41(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
+    private static void ConvertYuvToBgrWithBottomYVector128(Span<byte> topY, Span<byte> bottomY, Span<byte> topDst, Span<byte> bottomDst, Span<byte> ru, Span<byte> rv, int curX, int step)
     {
-        YuvToBgrSse41(topY[curX..], ru, rv, topDst[(curX * step)..]);
-        YuvToBgrSse41(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
+        YuvToBgrVector128(topY[curX..], ru, rv, topDst[(curX * step)..]);
+        YuvToBgrVector128(bottomY[curX..], ru[64..], rv[64..], bottomDst[(curX * step)..]);
     }
 
-    private static void YuvToBgrSse41(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
+    private static void YuvToBgrVector128(Span<byte> y, Span<byte> u, Span<byte> v, Span<byte> dst)
     {
         ref byte yRef = ref MemoryMarshal.GetReference(y);
         ref byte uRef = ref MemoryMarshal.GetReference(u);
         ref byte vRef = ref MemoryMarshal.GetReference(v);
-        ConvertYuv444ToBgrSse41(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
-        ConvertYuv444ToBgrSse41(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
+        ConvertYuv444ToBgrVector128(ref yRef, ref uRef, ref vRef, out Vector128<short> r0, out Vector128<short> g0, out Vector128<short> b0);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 8), ref Unsafe.Add(ref uRef, 8), ref Unsafe.Add(ref vRef, 8), out Vector128<short> r1, out Vector128<short> g1, out Vector128<short> b1);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 16), ref Unsafe.Add(ref uRef, 16), ref Unsafe.Add(ref vRef, 16), out Vector128<short> r2, out Vector128<short> g2, out Vector128<short> b2);
+        ConvertYuv444ToBgrVector128(ref Unsafe.Add(ref yRef, 24), ref Unsafe.Add(ref uRef, 24), ref Unsafe.Add(ref vRef, 24), out Vector128<short> r3, out Vector128<short> g3, out Vector128<short> b3);
 
         // Cast to 8b and store as BBBBGGGGRRRR.
-        Vector128<byte> bgr0 = Sse2.PackUnsignedSaturate(b0, b1);
-        Vector128<byte> bgr1 = Sse2.PackUnsignedSaturate(b2, b3);
-        Vector128<byte> bgr2 = Sse2.PackUnsignedSaturate(g0, g1);
-        Vector128<byte> bgr3 = Sse2.PackUnsignedSaturate(g2, g3);
-        Vector128<byte> bgr4 = Sse2.PackUnsignedSaturate(r0, r1);
-        Vector128<byte> bgr5 = Sse2.PackUnsignedSaturate(r2, r3);
+        Vector128<byte> bgr0 = Vector128_.PackUnsignedSaturate(b0, b1);
+        Vector128<byte> bgr1 = Vector128_.PackUnsignedSaturate(b2, b3);
+        Vector128<byte> bgr2 = Vector128_.PackUnsignedSaturate(g0, g1);
+        Vector128<byte> bgr3 = Vector128_.PackUnsignedSaturate(g2, g3);
+        Vector128<byte> bgr4 = Vector128_.PackUnsignedSaturate(r0, r1);
+        Vector128<byte> bgr5 = Vector128_.PackUnsignedSaturate(r2, r3);
 
         // Pack as BGRBGRBGRBGR.
-        PlanarTo24bSse41(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
+        PlanarTo24bVector128(bgr0, bgr1, bgr2, bgr3, bgr4, bgr5, dst);
     }
 
     // Pack the planar buffers
     // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
     // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-    private static void PlanarTo24bSse41(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
+    private static void PlanarTo24bVector128(Vector128<byte> input0, Vector128<byte> input1, Vector128<byte> input2, Vector128<byte> input3, Vector128<byte> input4, Vector128<byte> input5, Span<byte> rgb)
     {
         // The input is 6 registers of sixteen 8b but for the sake of explanation,
         // let's take 6 registers of four 8b values.
@@ -612,7 +613,7 @@ internal static class YuvConversion
         // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
 
         // Process R.
-        ChannelMixing(
+        ChannelMixingVector128(
             input0,
             input1,
             Vector128.Create(0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255, 5), // PlanarTo24Shuffle0
@@ -627,7 +628,7 @@ internal static class YuvConversion
 
         // Process G.
         // Same as before, just shifted to the left by one and including the right padding.
-        ChannelMixing(
+        ChannelMixingVector128(
             input2,
             input3,
             Vector128.Create(255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255, 255), // PlanarTo24Shuffle3
@@ -641,7 +642,7 @@ internal static class YuvConversion
             out Vector128<byte> g5);
 
         // Process B.
-        ChannelMixing(
+        ChannelMixingVector128(
             input4,
             input5,
             Vector128.Create(255, 255, 0, 255, 255, 1, 255, 255, 2, 255, 255, 3, 255, 255, 4, 255), // PlanarTo24Shuffle6
@@ -655,24 +656,24 @@ internal static class YuvConversion
             out Vector128<byte> b5);
 
         // OR the different channels.
-        Vector128<byte> rg0 = Sse2.Or(r0, g0);
-        Vector128<byte> rg1 = Sse2.Or(r1, g1);
-        Vector128<byte> rg2 = Sse2.Or(r2, g2);
-        Vector128<byte> rg3 = Sse2.Or(r3, g3);
-        Vector128<byte> rg4 = Sse2.Or(r4, g4);
-        Vector128<byte> rg5 = Sse2.Or(r5, g5);
+        Vector128<byte> rg0 = r0 | g0;
+        Vector128<byte> rg1 = r1 | g1;
+        Vector128<byte> rg2 = r2 | g2;
+        Vector128<byte> rg3 = r3 | g3;
+        Vector128<byte> rg4 = r4 | g4;
+        Vector128<byte> rg5 = r5 | g5;
 
         ref byte outputRef = ref MemoryMarshal.GetReference(rgb);
-        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = Sse2.Or(rg0, b0);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = Sse2.Or(rg1, b1);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = Sse2.Or(rg2, b2);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = Sse2.Or(rg3, b3);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = Sse2.Or(rg4, b4);
-        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = Sse2.Or(rg5, b5);
+        Unsafe.As<byte, Vector128<byte>>(ref outputRef) = rg0 | b0;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 16)) = rg1 | b1;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 32)) = rg2 | b2;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 48)) = rg3 | b3;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 64)) = rg4 | b4;
+        Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref outputRef, 80)) = rg5 | b5;
     }
 
     // Shuffles the input buffer as A0 0 0 A1 0 0 A2
-    private static void ChannelMixing(
+    private static void ChannelMixingVector128(
         Vector128<byte> input0,
         Vector128<byte> input1,
         Vector128<byte> shuffle0,
@@ -685,53 +686,53 @@ internal static class YuvConversion
         out Vector128<byte> output4,
         out Vector128<byte> output5)
     {
-        output0 = Ssse3.Shuffle(input0, shuffle0);
-        output1 = Ssse3.Shuffle(input0, shuffle1);
-        output2 = Ssse3.Shuffle(input0, shuffle2);
-        output3 = Ssse3.Shuffle(input1, shuffle0);
-        output4 = Ssse3.Shuffle(input1, shuffle1);
-        output5 = Ssse3.Shuffle(input1, shuffle2);
+        output0 = Vector128_.ShuffleNative(input0, shuffle0);
+        output1 = Vector128_.ShuffleNative(input0, shuffle1);
+        output2 = Vector128_.ShuffleNative(input0, shuffle2);
+        output3 = Vector128_.ShuffleNative(input1, shuffle0);
+        output4 = Vector128_.ShuffleNative(input1, shuffle1);
+        output5 = Vector128_.ShuffleNative(input1, shuffle2);
     }
 
     // Convert 32 samples of YUV444 to B/G/R
-    private static void ConvertYuv444ToBgrSse41(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
+    private static void ConvertYuv444ToBgrVector128(ref byte y, ref byte u, ref byte v, out Vector128<short> r, out Vector128<short> g, out Vector128<short> b)
     {
         // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
         Vector128<byte> y0 = Unsafe.As<byte, Vector128<byte>>(ref y);
         Vector128<byte> u0 = Unsafe.As<byte, Vector128<byte>>(ref u);
         Vector128<byte> v0 = Unsafe.As<byte, Vector128<byte>>(ref v);
-        y0 = Sse2.UnpackLow(Vector128<byte>.Zero, y0);
-        u0 = Sse2.UnpackLow(Vector128<byte>.Zero, u0);
-        v0 = Sse2.UnpackLow(Vector128<byte>.Zero, v0);
+        y0 = Vector128_.UnpackLow(Vector128<byte>.Zero, y0);
+        u0 = Vector128_.UnpackLow(Vector128<byte>.Zero, u0);
+        v0 = Vector128_.UnpackLow(Vector128<byte>.Zero, v0);
 
         // These constants are 14b fixed-point version of ITU-R BT.601 constants.
         // R = (19077 * y + 26149 * v - 14234) >> 6
         // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
         // B = (19077 * y + 33050 * u - 17685) >> 6
-        var k19077 = Vector128.Create((ushort)19077);
-        var k26149 = Vector128.Create((ushort)26149);
-        var k14234 = Vector128.Create((ushort)14234);
+        Vector128<ushort> k19077 = Vector128.Create((ushort)19077);
+        Vector128<ushort> k26149 = Vector128.Create((ushort)26149);
+        Vector128<ushort> k14234 = Vector128.Create((ushort)14234);
 
-        Vector128<ushort> y1 = Sse2.MultiplyHigh(y0.AsUInt16(), k19077);
-        Vector128<ushort> r0 = Sse2.MultiplyHigh(v0.AsUInt16(), k26149);
-        Vector128<ushort> g0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
-        Vector128<ushort> g1 = Sse2.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
+        Vector128<ushort> y1 = Vector128_.MultiplyHigh(y0.AsUInt16(), k19077);
+        Vector128<ushort> r0 = Vector128_.MultiplyHigh(v0.AsUInt16(), k26149);
+        Vector128<ushort> g0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create((ushort)6419));
+        Vector128<ushort> g1 = Vector128_.MultiplyHigh(v0.AsUInt16(), Vector128.Create((ushort)13320));
 
-        Vector128<ushort> r1 = Sse2.Subtract(y1.AsUInt16(), k14234);
-        Vector128<ushort> r2 = Sse2.Add(r1, r0);
+        Vector128<ushort> r1 = y1.AsUInt16() - k14234;
+        Vector128<ushort> r2 = r1 + r0;
 
-        Vector128<ushort> g2 = Sse2.Add(y1.AsUInt16(), Vector128.Create((ushort)8708));
-        Vector128<ushort> g3 = Sse2.Add(g0, g1);
-        Vector128<ushort> g4 = Sse2.Subtract(g2, g3);
+        Vector128<ushort> g2 = y1.AsUInt16() + Vector128.Create((ushort)8708);
+        Vector128<ushort> g3 = g0 + g1;
+        Vector128<ushort> g4 = g2 - g3;
 
-        Vector128<ushort> b0 = Sse2.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
-        Vector128<ushort> b1 = Sse2.AddSaturate(b0, y1);
-        Vector128<ushort> b2 = Sse2.SubtractSaturate(b1, Vector128.Create((ushort)17685));
+        Vector128<ushort> b0 = Vector128_.MultiplyHigh(u0.AsUInt16(), Vector128.Create(26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129, 26, 129).AsUInt16());
+        Vector128<ushort> b1 = Vector128_.AddSaturate(b0, y1);
+        Vector128<ushort> b2 = Vector128_.SubtractSaturate(b1, Vector128.Create((ushort)17685));
 
         // Use logical shift for B2, which can be larger than 32767.
-        r = Sse2.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
-        g = Sse2.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
-        b = Sse2.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
+        r = Vector128.ShiftRightArithmetic(r2.AsInt16(), 6); // range: [-14234, 30815]
+        g = Vector128.ShiftRightArithmetic(g4.AsInt16(), 6); // range: [-10953, 27710]
+        b = Vector128.ShiftRightLogical(b2.AsInt16(), 6); // range: [0, 34238]
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]