From 217450eb00a76bd07ebbd004f3c677888790d085 Mon Sep 17 00:00:00 2001 From: James Jackson-South Date: Tue, 3 Jun 2025 18:40:47 +1000 Subject: [PATCH] Complete LossyUtils port --- .../Formats/Webp/Lossy/LossyUtils.cs | 248 +++++------------- .../Formats/Webp/Lossy/Vp8Encoding.cs | 8 +- 2 files changed, 75 insertions(+), 181 deletions(-) diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs index b8c4c9c31..c65861c4b 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs @@ -5,8 +5,6 @@ using System.Buffers.Binary; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; using System.Runtime.Intrinsics; -using System.Runtime.Intrinsics.Arm; -using System.Runtime.Intrinsics.X86; using SixLabors.ImageSharp.Common.Helpers; // ReSharper disable InconsistentNaming @@ -18,7 +16,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse16x16(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { return Vp8_Sse16xN_Vector256(a, b, 4); } @@ -28,11 +26,6 @@ internal static class LossyUtils return Vp8_16xN_Vector128(a, b, 8); } - if (AdvSimd.IsSupported) - { - return Vp8_Sse16x16_Neon(a, b); - } - return Vp8_SseNxN(a, b, 16, 16); } @@ -50,11 +43,6 @@ internal static class LossyUtils return Vp8_16xN_Vector128(a, b, 4); } - if (AdvSimd.IsSupported) - { - return Vp8_Sse16x8_Neon(a, b); - } - return Vp8_SseNxN(a, b, 16, 8); } @@ -62,7 +50,7 @@ internal static class LossyUtils [MethodImpl(InliningOptions.ShortMethod)] public static int Vp8_Sse4x4(Span a, Span b) { - if (Avx2.IsSupported) + if (Vector256.IsHardwareAccelerated) { // Load values. ref byte aRef = ref MemoryMarshal.GetReference(a); @@ -123,19 +111,14 @@ internal static class LossyUtils // subtract, square and accumulate. Vector128 d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16()); - Vector128 d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); - Vector128 e0 = Sse2.MultiplyAddAdjacent(d0, d0); - Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1); - Vector128 sum = Sse2.Add(e0, e1); + Vector128 d1 = Vector128_.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16()); + Vector128 e0 = Vector128_.MultiplyAddAdjacent(d0, d0); + Vector128 e1 = Vector128_.MultiplyAddAdjacent(d1, d1); + Vector128 sum = e0 + e1; return ReduceSumVector128(sum); } - if (AdvSimd.IsSupported) - { - return Vp8_Sse4x4_Neon(a, b); - } - return Vp8_SseNxN(a, b, 4, 4); } @@ -216,95 +199,6 @@ internal static class LossyUtils return ReduceSumVector256(sum); } - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe int Vp8_Sse16x16_Neon(Span a, Span b) - { - Vector128 sum = Vector128.Zero; - fixed (byte* aRef = &MemoryMarshal.GetReference(a)) - { - fixed (byte* bRef = &MemoryMarshal.GetReference(b)) - { - for (int y = 0; y < 16; y++) - { - sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); - } - } - } - - return (int)Vector128.Sum(sum); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe int Vp8_Sse16x8_Neon(Span a, Span b) - { - Vector128 sum = Vector128.Zero; - fixed (byte* aRef = &MemoryMarshal.GetReference(a)) - { - fixed (byte* bRef = &MemoryMarshal.GetReference(b)) - { - for (int y = 0; y < 8; y++) - { - sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum); - } - } - } - - return (int)Vector128.Sum(sum); - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static int Vp8_Sse4x4_Neon(Span a, Span b) - { - Vector128 a0 = Load4x4Neon(a).AsByte(); - Vector128 b0 = Load4x4Neon(b).AsByte(); - Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); - Vector64 absDiffLower = absDiff.GetLower().AsByte(); - Vector64 absDiffUpper = absDiff.GetUpper().AsByte(); - Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); - Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); - - // pair-wise adds and widen. - Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); - Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - - Vector128 sum = AdvSimd.Add(sum1, sum2); - - return (int)Vector128.Sum(sum); - } - - // Load all 4x4 pixels into a single Vector128 - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe Vector128 Load4x4Neon(Span src) - { - fixed (byte* srcRef = &MemoryMarshal.GetReference(src)) - { - Vector128 output = Vector128.Zero; - output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef); - output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps)); - output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2))); - output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3))); - return output; - } - } - - [MethodImpl(InliningOptions.ShortMethod)] - private static unsafe Vector128 AccumulateSSE16Neon(byte* a, byte* b, Vector128 sum) - { - Vector128 a0 = AdvSimd.LoadVector128(a); - Vector128 b0 = AdvSimd.LoadVector128(b); - - Vector128 absDiff = AdvSimd.AbsoluteDifference(a0, b0); - Vector64 absDiffLower = absDiff.GetLower(); - Vector64 absDiffUpper = absDiff.GetUpper(); - Vector128 prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower); - Vector128 prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper); - - // pair-wise adds and widen. - Vector128 sum1 = AdvSimd.AddPairwiseWidening(prod1); - Vector128 sum2 = AdvSimd.AddPairwiseWidening(prod2); - return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2)); - } - [MethodImpl(InliningOptions.ShortMethod)] private static Vector128 SubtractAndAccumulateVector128(Vector128 a, Vector128 b) { @@ -330,7 +224,7 @@ internal static class LossyUtils // Take abs(a-b) in 8b. Vector256 ab = Vector256_.SubtractSaturate(a, b); Vector256 ba = Vector256_.SubtractSaturate(b, a); - Vector256 absAb = Avx2.Or(ab, ba); + Vector256 absAb = ab | ba; // Zero-extend to 16b. Vector256 c0 = Vector256_.UnpackLow(absAb, Vector256.Zero); @@ -948,7 +842,7 @@ internal static class LossyUtils // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); + Vp8Transpose_2_4x4_16bVector128(b0, b1, b2, b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 @@ -995,7 +889,7 @@ internal static class LossyUtils // Transpose two 4x4 16b matrices horizontally stored in registers. [MethodImpl(InliningOptions.ShortMethod)] - public static void Vp8Transpose_2_4x4_16b(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) + public static void Vp8Transpose_2_4x4_16bVector128(Vector128 b0, Vector128 b1, Vector128 b2, Vector128 b3, out Vector128 output0, out Vector128 output1, out Vector128 output2, out Vector128 output3) { // Transpose the two 4x4. // a00 a01 a02 a03 b00 b01 b02 b03 @@ -1110,7 +1004,7 @@ internal static class LossyUtils Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. @@ -1143,7 +1037,7 @@ internal static class LossyUtils Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'dst' and store. // Load the reference(s). @@ -1189,7 +1083,7 @@ internal static class LossyUtils public static void TransformOne(Span src, Span dst, Span scratch) { - if (Sse2.IsSupported) + if (Vector128.IsHardwareAccelerated) { // Load and concatenate the transform coefficients. ref short srcRef = ref MemoryMarshal.GetReference(src); @@ -1205,102 +1099,102 @@ internal static class LossyUtils // Vertical pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16()); - Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16()); + Vector128 a = in0.AsInt16() + in2.AsInt16(); + Vector128 b = in0.AsInt16() - in2.AsInt16(); Vector128 k1 = Vector128.Create((short)20091); Vector128 k2 = Vector128.Create((short)-30068); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 - Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2); - Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1); - Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16()); - Vector128 c4 = Sse2.Subtract(c1, c2); - Vector128 c = Sse2.Add(c3.AsInt16(), c4); + Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2); + Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1); + Vector128 c3 = in1.AsInt16() - in3.AsInt16(); + Vector128 c4 = c1 - c2; + Vector128 c = c3.AsInt16() + c4; // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 - Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1); - Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2); - Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16()); - Vector128 d4 = Sse2.Add(d1, d2); - Vector128 d = Sse2.Add(d3, d4); + Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1); + Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2); + Vector128 d3 = in1.AsInt16() + in3.AsInt16(); + Vector128 d4 = d1 + d2; + Vector128 d = d3 + d4; // Second pass. - Vector128 tmp0 = Sse2.Add(a.AsInt16(), d); - Vector128 tmp1 = Sse2.Add(b.AsInt16(), c); - Vector128 tmp2 = Sse2.Subtract(b.AsInt16(), c); - Vector128 tmp3 = Sse2.Subtract(a.AsInt16(), d); + Vector128 tmp0 = a.AsInt16() + d; + Vector128 tmp1 = b.AsInt16() + c; + Vector128 tmp2 = b.AsInt16() - c; + Vector128 tmp3 = a.AsInt16() - d; // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. - Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4)); - a = Sse2.Add(dc, t2.AsInt16()); - b = Sse2.Subtract(dc, t2.AsInt16()); + Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); + a = dc + t2.AsInt16(); + b = dc - t2.AsInt16(); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 - c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2); - c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1); - c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16()); - c4 = Sse2.Subtract(c1, c2); - c = Sse2.Add(c3, c4); + c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2); + c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1); + c3 = t1.AsInt16() - t3.AsInt16(); + c4 = c1 - c2; + c = c3 + c4; // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 - d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1); - d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2); - d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16()); - d4 = Sse2.Add(d1, d2); - d = Sse2.Add(d3, d4); + d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1); + d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2); + d3 = t1.AsInt16() + t3.AsInt16(); + d4 = d1 + d2; + d = d3 + d4; // Second pass. - tmp0 = Sse2.Add(a, d); - tmp1 = Sse2.Add(b, c); - tmp2 = Sse2.Subtract(b, c); - tmp3 = Sse2.Subtract(a, d); - Vector128 shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3); - Vector128 shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3); - Vector128 shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3); - Vector128 shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3); + tmp0 = a + d; + tmp1 = b + c; + tmp2 = b - c; + tmp3 = a - d; + Vector128 shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3); + Vector128 shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3); + Vector128 shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3); + Vector128 shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3); // Transpose the two 4x4. - Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'dst' and store. // Load the reference(s). // Load four bytes/pixels per line. ref byte dstRef = ref MemoryMarshal.GetReference(dst); - Vector128 dst0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref dstRef)).AsByte(); - Vector128 dst1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte(); - Vector128 dst2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte(); - Vector128 dst3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte(); + Vector128 dst0 = Vector128.CreateScalar(Unsafe.As(ref dstRef)).AsByte(); + Vector128 dst1 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte(); + Vector128 dst2 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte(); + Vector128 dst3 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte(); // Convert to 16b. - dst0 = Sse2.UnpackLow(dst0, Vector128.Zero); - dst1 = Sse2.UnpackLow(dst1, Vector128.Zero); - dst2 = Sse2.UnpackLow(dst2, Vector128.Zero); - dst3 = Sse2.UnpackLow(dst3, Vector128.Zero); + dst0 = Vector128_.UnpackLow(dst0, Vector128.Zero); + dst1 = Vector128_.UnpackLow(dst1, Vector128.Zero); + dst2 = Vector128_.UnpackLow(dst2, Vector128.Zero); + dst3 = Vector128_.UnpackLow(dst3, Vector128.Zero); // Add the inverse transform(s). - dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte(); - dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte(); - dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte(); - dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte(); + dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte(); + dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte(); + dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte(); + dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte(); // Unsigned saturate to 8b. - dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); - dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); - dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); - dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); + dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16()); + dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16()); + dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16()); + dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16()); // Store the results. // Store four bytes/pixels per line. ref byte outputRef = ref MemoryMarshal.GetReference(dst); - int output0 = Sse2.ConvertToInt32(dst0.AsInt32()); - int output1 = Sse2.ConvertToInt32(dst1.AsInt32()); - int output2 = Sse2.ConvertToInt32(dst2.AsInt32()); - int output3 = Sse2.ConvertToInt32(dst3.AsInt32()); + int output0 = dst0.AsInt32().ToScalar(); + int output1 = dst1.AsInt32().ToScalar(); + int output2 = dst2.AsInt32().ToScalar(); + int output3 = dst3.AsInt32().ToScalar(); Unsafe.As(ref outputRef) = output0; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1; Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2; diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs index c645816d4..fd8d48dd0 100644 --- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs +++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs @@ -131,14 +131,14 @@ internal static unsafe class Vp8Encoding InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s). @@ -210,14 +210,14 @@ internal static unsafe class Vp8Encoding InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3); // Horizontal pass and subsequent transpose. // First pass, c and d calculations are longer because of the "trick" multiplications. InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3); // Transpose the two 4x4. - LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); + LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3); // Add inverse transform to 'ref' and store. // Load the reference(s).