From 217450eb00a76bd07ebbd004f3c677888790d085 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Tue, 3 Jun 2025 18:40:47 +1000
Subject: [PATCH] Complete LossyUtils port

---
 .../Formats/Webp/Lossy/LossyUtils.cs          | 248 +++++-------------
 .../Formats/Webp/Lossy/Vp8Encoding.cs         |   8 +-
 2 files changed, 75 insertions(+), 181 deletions(-)
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index b8c4c9c31..c65861c4b 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -5,8 +5,6 @@ using System.Buffers.Binary;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.Arm;
-using System.Runtime.Intrinsics.X86;
 using SixLabors.ImageSharp.Common.Helpers;
 
 // ReSharper disable InconsistentNaming
@@ -18,7 +16,7 @@ internal static class LossyUtils
     [MethodImpl(InliningOptions.ShortMethod)]
     public static int Vp8_Sse16x16(Span<byte> a, Span<byte> b)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
             return Vp8_Sse16xN_Vector256(a, b, 4);
         }
@@ -28,11 +26,6 @@ internal static class LossyUtils
             return Vp8_16xN_Vector128(a, b, 8);
         }
 
-        if (AdvSimd.IsSupported)
-        {
-            return Vp8_Sse16x16_Neon(a, b);
-        }
-
         return Vp8_SseNxN(a, b, 16, 16);
     }
 
@@ -50,11 +43,6 @@ internal static class LossyUtils
             return Vp8_16xN_Vector128(a, b, 4);
         }
 
-        if (AdvSimd.IsSupported)
-        {
-            return Vp8_Sse16x8_Neon(a, b);
-        }
-
         return Vp8_SseNxN(a, b, 16, 8);
     }
 
@@ -62,7 +50,7 @@ internal static class LossyUtils
     [MethodImpl(InliningOptions.ShortMethod)]
     public static int Vp8_Sse4x4(Span<byte> a, Span<byte> b)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
             // Load values.
             ref byte aRef = ref MemoryMarshal.GetReference(a);
@@ -123,19 +111,14 @@ internal static class LossyUtils
 
             // subtract, square and accumulate.
             Vector128<short> d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
-            Vector128<short> d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
-            Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
-            Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
-            Vector128<int> sum = Sse2.Add(e0, e1);
+            Vector128<short> d1 = Vector128_.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
+            Vector128<int> e0 = Vector128_.MultiplyAddAdjacent(d0, d0);
+            Vector128<int> e1 = Vector128_.MultiplyAddAdjacent(d1, d1);
+            Vector128<int> sum = e0 + e1;
 
             return ReduceSumVector128(sum);
         }
 
-        if (AdvSimd.IsSupported)
-        {
-            return Vp8_Sse4x4_Neon(a, b);
-        }
-
         return Vp8_SseNxN(a, b, 4, 4);
     }
 
@@ -216,95 +199,6 @@ internal static class LossyUtils
         return ReduceSumVector256(sum);
     }
 
-    [MethodImpl(InliningOptions.ShortMethod)]
-    private static unsafe int Vp8_Sse16x16_Neon(Span<byte> a, Span<byte> b)
-    {
-        Vector128<uint> sum = Vector128<uint>.Zero;
-        fixed (byte* aRef = &MemoryMarshal.GetReference(a))
-        {
-            fixed (byte* bRef = &MemoryMarshal.GetReference(b))
-            {
-                for (int y = 0; y < 16; y++)
-                {
-                    sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
-                }
-            }
-        }
-
-        return (int)Vector128.Sum(sum);
-    }
-
-    [MethodImpl(InliningOptions.ShortMethod)]
-    private static unsafe int Vp8_Sse16x8_Neon(Span<byte> a, Span<byte> b)
-    {
-        Vector128<uint> sum = Vector128<uint>.Zero;
-        fixed (byte* aRef = &MemoryMarshal.GetReference(a))
-        {
-            fixed (byte* bRef = &MemoryMarshal.GetReference(b))
-            {
-                for (int y = 0; y < 8; y++)
-                {
-                    sum = AccumulateSSE16Neon(aRef + (y * WebpConstants.Bps), bRef + (y * WebpConstants.Bps), sum);
-                }
-            }
-        }
-
-        return (int)Vector128.Sum(sum);
-    }
-
-    [MethodImpl(InliningOptions.ShortMethod)]
-    private static int Vp8_Sse4x4_Neon(Span<byte> a, Span<byte> b)
-    {
-        Vector128<byte> a0 = Load4x4Neon(a).AsByte();
-        Vector128<byte> b0 = Load4x4Neon(b).AsByte();
-        Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
-        Vector64<byte> absDiffLower = absDiff.GetLower().AsByte();
-        Vector64<byte> absDiffUpper = absDiff.GetUpper().AsByte();
-        Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower);
-        Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);
-
-        // pair-wise adds and widen.
-        Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
-        Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
-
-        Vector128<uint> sum = AdvSimd.Add(sum1, sum2);
-
-        return (int)Vector128.Sum(sum);
-    }
-
-    // Load all 4x4 pixels into a single Vector128<uint>
-    [MethodImpl(InliningOptions.ShortMethod)]
-    private static unsafe Vector128<uint> Load4x4Neon(Span<byte> src)
-    {
-        fixed (byte* srcRef = &MemoryMarshal.GetReference(src))
-        {
-            Vector128<uint> output = Vector128<uint>.Zero;
-            output = AdvSimd.LoadAndInsertScalar(output, 0, (uint*)srcRef);
-            output = AdvSimd.LoadAndInsertScalar(output, 1, (uint*)(srcRef + WebpConstants.Bps));
-            output = AdvSimd.LoadAndInsertScalar(output, 2, (uint*)(srcRef + (WebpConstants.Bps * 2)));
-            output = AdvSimd.LoadAndInsertScalar(output, 3, (uint*)(srcRef + (WebpConstants.Bps * 3)));
-            return output;
-        }
-    }
-
-    [MethodImpl(InliningOptions.ShortMethod)]
-    private static unsafe Vector128<uint> AccumulateSSE16Neon(byte* a, byte* b, Vector128<uint> sum)
-    {
-        Vector128<byte> a0 = AdvSimd.LoadVector128(a);
-        Vector128<byte> b0 = AdvSimd.LoadVector128(b);
-
-        Vector128<byte> absDiff = AdvSimd.AbsoluteDifference(a0, b0);
-        Vector64<byte> absDiffLower = absDiff.GetLower();
-        Vector64<byte> absDiffUpper = absDiff.GetUpper();
-        Vector128<ushort> prod1 = AdvSimd.MultiplyWideningLower(absDiffLower, absDiffLower);
-        Vector128<ushort> prod2 = AdvSimd.MultiplyWideningLower(absDiffUpper, absDiffUpper);
-
-        // pair-wise adds and widen.
-        Vector128<uint> sum1 = AdvSimd.AddPairwiseWidening(prod1);
-        Vector128<uint> sum2 = AdvSimd.AddPairwiseWidening(prod2);
-        return AdvSimd.Add(sum, AdvSimd.Add(sum1, sum2));
-    }
-
     [MethodImpl(InliningOptions.ShortMethod)]
     private static Vector128<int> SubtractAndAccumulateVector128(Vector128<byte> a, Vector128<byte> b)
     {
@@ -330,7 +224,7 @@ internal static class LossyUtils
         // Take abs(a-b) in 8b.
         Vector256<byte> ab = Vector256_.SubtractSaturate(a, b);
         Vector256<byte> ba = Vector256_.SubtractSaturate(b, a);
-        Vector256<byte> absAb = Avx2.Or(ab, ba);
+        Vector256<byte> absAb = ab | ba;
 
         // Zero-extend to 16b.
         Vector256<byte> c0 = Vector256_.UnpackLow(absAb, Vector256<byte>.Zero);
@@ -948,7 +842,7 @@ internal static class LossyUtils
         // a20 a21 a22 a23   b20 b21 b22 b23
         // a30 a31 a32 a33   b30 b31 b32 b33
         // Transpose the two 4x4.
-        Vp8Transpose_2_4x4_16b(b0, b1, b2, b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3);
+        Vp8Transpose_2_4x4_16bVector128(b0, b1, b2, b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3);
 
         // a00 a10 a20 a30   b00 b10 b20 b30
         // a01 a11 a21 a31   b01 b11 b21 b31
@@ -995,7 +889,7 @@ internal static class LossyUtils
 
     // Transpose two 4x4 16b matrices horizontally stored in registers.
     [MethodImpl(InliningOptions.ShortMethod)]
-    public static void Vp8Transpose_2_4x4_16b(Vector128<short> b0, Vector128<short> b1, Vector128<short> b2, Vector128<short> b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3)
+    public static void Vp8Transpose_2_4x4_16bVector128(Vector128<short> b0, Vector128<short> b1, Vector128<short> b2, Vector128<short> b3, out Vector128<long> output0, out Vector128<long> output1, out Vector128<long> output2, out Vector128<long> output3)
     {
         // Transpose the two 4x4.
         // a00 a01 a02 a03   b00 b01 b02 b03
@@ -1110,7 +1004,7 @@ internal static class LossyUtils
             Vector128<short> tmp3 = a.AsInt16() - d;
 
             // Transpose the two 4x4.
-            Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
             // Horizontal pass and subsequent transpose.
             // First pass, c and d calculations are longer because of the "trick" multiplications.
@@ -1143,7 +1037,7 @@ internal static class LossyUtils
             Vector128<short> shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
 
             // Transpose the two 4x4.
-            Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
 
             // Add inverse transform to 'dst' and store.
             // Load the reference(s).
@@ -1189,7 +1083,7 @@ internal static class LossyUtils
 
     public static void TransformOne(Span<short> src, Span<byte> dst, Span<int> scratch)
     {
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
             // Load and concatenate the transform coefficients.
             ref short srcRef = ref MemoryMarshal.GetReference(src);
@@ -1205,102 +1099,102 @@ internal static class LossyUtils
 
             // Vertical pass and subsequent transpose.
             // First pass, c and d calculations are longer because of the "trick" multiplications.
-            Vector128<short> a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
-            Vector128<short> b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+            Vector128<short> a = in0.AsInt16() + in2.AsInt16();
+            Vector128<short> b = in0.AsInt16() - in2.AsInt16();
 
             Vector128<short> k1 = Vector128.Create((short)20091);
             Vector128<short> k2 = Vector128.Create((short)-30068);
 
             // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
-            Vector128<short> c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
-            Vector128<short> c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
-            Vector128<short> c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
-            Vector128<short> c4 = Sse2.Subtract(c1, c2);
-            Vector128<short> c = Sse2.Add(c3.AsInt16(), c4);
+            Vector128<short> c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
+            Vector128<short> c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
+            Vector128<short> c3 = in1.AsInt16() - in3.AsInt16();
+            Vector128<short> c4 = c1 - c2;
+            Vector128<short> c = c3.AsInt16() + c4;
 
             // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
-            Vector128<short> d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
-            Vector128<short> d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
-            Vector128<short> d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
-            Vector128<short> d4 = Sse2.Add(d1, d2);
-            Vector128<short> d = Sse2.Add(d3, d4);
+            Vector128<short> d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
+            Vector128<short> d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
+            Vector128<short> d3 = in1.AsInt16() + in3.AsInt16();
+            Vector128<short> d4 = d1 + d2;
+            Vector128<short> d = d3 + d4;
 
             // Second pass.
-            Vector128<short> tmp0 = Sse2.Add(a.AsInt16(), d);
-            Vector128<short> tmp1 = Sse2.Add(b.AsInt16(), c);
-            Vector128<short> tmp2 = Sse2.Subtract(b.AsInt16(), c);
-            Vector128<short> tmp3 = Sse2.Subtract(a.AsInt16(), d);
+            Vector128<short> tmp0 = a.AsInt16() + d;
+            Vector128<short> tmp1 = b.AsInt16() + c;
+            Vector128<short> tmp2 = b.AsInt16() - c;
+            Vector128<short> tmp3 = a.AsInt16() - d;
 
             // Transpose the two 4x4.
-            Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
             // Horizontal pass and subsequent transpose.
             // First pass, c and d calculations are longer because of the "trick" multiplications.
-            Vector128<short> dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
-            a = Sse2.Add(dc, t2.AsInt16());
-            b = Sse2.Subtract(dc, t2.AsInt16());
+            Vector128<short> dc = t0.AsInt16() + Vector128.Create((short)4);
+            a = dc + t2.AsInt16();
+            b = dc - t2.AsInt16();
 
             // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
-            c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
-            c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
-            c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
-            c4 = Sse2.Subtract(c1, c2);
-            c = Sse2.Add(c3, c4);
+            c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
+            c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
+            c3 = t1.AsInt16() - t3.AsInt16();
+            c4 = c1 - c2;
+            c = c3 + c4;
 
             // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
-            d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
-            d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
-            d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
-            d4 = Sse2.Add(d1, d2);
-            d = Sse2.Add(d3, d4);
+            d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
+            d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
+            d3 = t1.AsInt16() + t3.AsInt16();
+            d4 = d1 + d2;
+            d = d3 + d4;
 
             // Second pass.
-            tmp0 = Sse2.Add(a, d);
-            tmp1 = Sse2.Add(b, c);
-            tmp2 = Sse2.Subtract(b, c);
-            tmp3 = Sse2.Subtract(a, d);
-            Vector128<short> shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
-            Vector128<short> shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
-            Vector128<short> shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
-            Vector128<short> shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+            tmp0 = a + d;
+            tmp1 = b + c;
+            tmp2 = b - c;
+            tmp3 = a - d;
+            Vector128<short> shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
+            Vector128<short> shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
+            Vector128<short> shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
+            Vector128<short> shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
 
             // Transpose the two 4x4.
-            Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
 
             // Add inverse transform to 'dst' and store.
             // Load the reference(s).
             // Load four bytes/pixels per line.
             ref byte dstRef = ref MemoryMarshal.GetReference(dst);
-            Vector128<byte> dst0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref dstRef)).AsByte();
-            Vector128<byte> dst1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte();
-            Vector128<byte> dst2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte();
-            Vector128<byte> dst3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte();
+            Vector128<byte> dst0 = Vector128.CreateScalar(Unsafe.As<byte, int>(ref dstRef)).AsByte();
+            Vector128<byte> dst1 = Vector128.CreateScalar(Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps))).AsByte();
+            Vector128<byte> dst2 = Vector128.CreateScalar(Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 2))).AsByte();
+            Vector128<byte> dst3 = Vector128.CreateScalar(Unsafe.As<byte, int>(ref Unsafe.Add(ref dstRef, WebpConstants.Bps * 3))).AsByte();
 
             // Convert to 16b.
-            dst0 = Sse2.UnpackLow(dst0, Vector128<byte>.Zero);
-            dst1 = Sse2.UnpackLow(dst1, Vector128<byte>.Zero);
-            dst2 = Sse2.UnpackLow(dst2, Vector128<byte>.Zero);
-            dst3 = Sse2.UnpackLow(dst3, Vector128<byte>.Zero);
+            dst0 = Vector128_.UnpackLow(dst0, Vector128<byte>.Zero);
+            dst1 = Vector128_.UnpackLow(dst1, Vector128<byte>.Zero);
+            dst2 = Vector128_.UnpackLow(dst2, Vector128<byte>.Zero);
+            dst3 = Vector128_.UnpackLow(dst3, Vector128<byte>.Zero);
 
             // Add the inverse transform(s).
-            dst0 = Sse2.Add(dst0.AsInt16(), t0.AsInt16()).AsByte();
-            dst1 = Sse2.Add(dst1.AsInt16(), t1.AsInt16()).AsByte();
-            dst2 = Sse2.Add(dst2.AsInt16(), t2.AsInt16()).AsByte();
-            dst3 = Sse2.Add(dst3.AsInt16(), t3.AsInt16()).AsByte();
+            dst0 = (dst0.AsInt16() + t0.AsInt16()).AsByte();
+            dst1 = (dst1.AsInt16() + t1.AsInt16()).AsByte();
+            dst2 = (dst2.AsInt16() + t2.AsInt16()).AsByte();
+            dst3 = (dst3.AsInt16() + t3.AsInt16()).AsByte();
 
             // Unsigned saturate to 8b.
-            dst0 = Sse2.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16());
-            dst1 = Sse2.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16());
-            dst2 = Sse2.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16());
-            dst3 = Sse2.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16());
+            dst0 = Vector128_.PackUnsignedSaturate(dst0.AsInt16(), dst0.AsInt16());
+            dst1 = Vector128_.PackUnsignedSaturate(dst1.AsInt16(), dst1.AsInt16());
+            dst2 = Vector128_.PackUnsignedSaturate(dst2.AsInt16(), dst2.AsInt16());
+            dst3 = Vector128_.PackUnsignedSaturate(dst3.AsInt16(), dst3.AsInt16());
 
             // Store the results.
             // Store four bytes/pixels per line.
             ref byte outputRef = ref MemoryMarshal.GetReference(dst);
-            int output0 = Sse2.ConvertToInt32(dst0.AsInt32());
-            int output1 = Sse2.ConvertToInt32(dst1.AsInt32());
-            int output2 = Sse2.ConvertToInt32(dst2.AsInt32());
-            int output3 = Sse2.ConvertToInt32(dst3.AsInt32());
+            int output0 = dst0.AsInt32().ToScalar();
+            int output1 = dst1.AsInt32().ToScalar();
+            int output2 = dst2.AsInt32().ToScalar();
+            int output3 = dst3.AsInt32().ToScalar();
             Unsafe.As<byte, int>(ref outputRef) = output0;
             Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
             Unsafe.As<byte, int>(ref Unsafe.Add(ref outputRef, WebpConstants.Bps * 2)) = output2;
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index c645816d4..fd8d48dd0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -131,14 +131,14 @@ internal static unsafe class Vp8Encoding
             InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
 
             // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
             // Horizontal pass and subsequent transpose.
             // First pass, c and d calculations are longer because of the "trick" multiplications.
             InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
 
             // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
 
             // Add inverse transform to 'ref' and store.
             // Load the reference(s).
@@ -210,14 +210,14 @@ internal static unsafe class Vp8Encoding
             InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128<short> tmp0, out Vector128<short> tmp1, out Vector128<short> tmp2, out Vector128<short> tmp3);
 
             // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128<long> t0, out Vector128<long> t1, out Vector128<long> t2, out Vector128<long> t3);
 
             // Horizontal pass and subsequent transpose.
             // First pass, c and d calculations are longer because of the "trick" multiplications.
             InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128<short> shifted0, out Vector128<short> shifted1, out Vector128<short> shifted2, out Vector128<short> shifted3);
 
             // Transpose the two 4x4.
-            LossyUtils.Vp8Transpose_2_4x4_16b(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
+            LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
 
             // Add inverse transform to 'ref' and store.
             // Load the reference(s).