From 7223e90bb441b4d14871a37f7ed2237218bc7b30 Mon Sep 17 00:00:00 2001
From: James Jackson-South <james_south@hotmail.com>
Date: Mon, 2 Jun 2025 14:47:36 +1000
Subject: [PATCH] Port Vp8_Sse16x16

---
 .../Common/Helpers/Vector128Utilities.cs      |  95 +++++++++-
 .../Common/Helpers/Vector256Utilities.cs      | 163 ++++++++++++++++++
 .../Formats/Webp/Lossy/LossyUtils.cs          | 126 ++++++++------
 3 files changed, 324 insertions(+), 60 deletions(-)
diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index c160b9560..c5e16faf9 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -711,6 +711,39 @@ internal static class Vector128_
         return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
     }
 
+    /// <summary>
+    /// Unpack and interleave 8-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
+    /// and return the interleaved result.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the high half.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the high half.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the high
+    /// halves of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<byte> UnpackHigh(Vector128<byte> left, Vector128<byte> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.UnpackHigh(left, right);
+        }
+
+        if (AdvSimd.Arm64.IsSupported)
+        {
+            return AdvSimd.Arm64.ZipHigh(left, right);
+        }
+
+        Vector128<byte> unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+
+        // unpacked = [left high bytes 0..7 | right high bytes 8..15]; one shuffle alternates between them.
+        return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
+    }
+
     /// <summary>
     /// Unpack and interleave 8-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/>
     /// and store the results in the result.
@@ -744,6 +777,56 @@ internal static class Vector128_
             Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)));
     }
 
+    /// <summary>
+    /// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+    /// in <paramref name="left"/> using saturation, and return the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed signed 16-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector128<short> SubtractSaturate(Vector128<short> left, Vector128<short> right)
+    {
+        if (Sse2.IsSupported)
+        {
+            return Sse2.SubtractSaturate(left, right);
+        }
+
+        if (AdvSimd.IsSupported)
+        {
+            return AdvSimd.SubtractSaturate(left, right);
+        }
+
+        if (PackedSimd.IsSupported)
+        {
+            return PackedSimd.SubtractSaturate(left, right);
+        }
+
+        // Widen inputs to 32-bit signed so the subtraction below cannot overflow.
+        (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
+        (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
+
+        // Subtract in 32-bit; the true difference of two 16-bit values always fits.
+        Vector128<int> diffLo = leftLo - rightLo;
+        Vector128<int> diffHi = leftHi - rightHi;
+
+        // Clamp to the signed 16-bit range; this is the saturation step.
+        Vector128<int> shortMin = Vector128.Create((int)short.MinValue);
+        Vector128<int> shortMax = Vector128.Create((int)short.MaxValue);
+
+        diffLo = Clamp(diffLo, shortMin, shortMax);
+        diffHi = Clamp(diffHi, shortMin, shortMax);
+
+        // Narrow back to 16-bit signed.
+        return Vector128.Narrow(diffLo, diffHi);
+    }
+
     /// <summary>
     /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
     /// in <paramref name="left"/> using saturation, and store the results.
@@ -775,7 +858,7 @@ internal static class Vector128_
             return PackedSimd.SubtractSaturate(left, right);
         }
 
-        // Widen inputs to 16-bit to safely compute unsigned differences without underflow
+        // Widen inputs to 16-bit
         (Vector128<ushort> leftLo, Vector128<ushort> leftHi) = Vector128.Widen(left);
         (Vector128<ushort> rightLo, Vector128<ushort> rightHi) = Vector128.Widen(right);
 
@@ -783,13 +866,11 @@ internal static class Vector128_
         Vector128<ushort> diffLo = leftLo - rightLo;
         Vector128<ushort> diffHi = leftHi - rightHi;
 
-        // Mask lanes where left >= right to preserve the result
-        // All other lanes are zeroed (saturate to 0)
-        Vector128<ushort> maskLo = Vector128.GreaterThanOrEqual(leftLo, rightLo).AsUInt16();
-        Vector128<ushort> maskHi = Vector128.GreaterThanOrEqual(leftHi, rightHi).AsUInt16();
+        // Lanes where left < right wrapped around in the unsigned subtraction above; zero them so the
+        // result saturates at 0. All remaining lanes are already within [0, 255], so narrowing is lossless.
 
-        diffLo &= maskLo;
-        diffHi &= maskHi;
+        diffLo &= Vector128.GreaterThanOrEqual(leftLo, rightLo);
+        diffHi &= Vector128.GreaterThanOrEqual(leftHi, rightHi);
 
         // Narrow back to bytes
         return Vector128.Narrow(diffLo, diffHi);
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index dfefd2d34..71dfadc39 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -162,6 +162,33 @@ internal static class Vector256_
         return (vm0 * vm1) - vs;
     }
 
+    /// <summary>
+    /// Multiply packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+    /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and
+    /// pack the results. Without AVX2, the lower and upper 128-bit halves are processed separately.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to multiply and add.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to multiply and add.
+    /// </param>
+    /// <returns>
+    /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> MultiplyAddAdjacent(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.MultiplyAddAdjacent(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()),
+            Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper()));
+    }
+
     /// <summary>
     /// Packs signed 32-bit integers to signed 16-bit integers and saturates.
     /// </summary>
@@ -303,6 +330,142 @@ internal static class Vector256_
         return Vector256.Narrow(prodLo, prodHi);
     }
 
+    /// <summary>
+    /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane of <paramref name="left"/>
+    /// and <paramref name="right"/>; the lower and upper 128-bit lanes are processed independently.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 32-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 32-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 32-bit integers from the low
+    /// half of each 128-bit lane of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<int> UnpackLow(Vector256<int> left, Vector256<int> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackLow(left, right);
+        }
+
+        Vector128<int> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+        Vector128<int> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane of <paramref name="left"/>
+    /// and <paramref name="right"/>; the lower and upper 128-bit lanes are processed independently.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the high half of each lane.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the high half of each lane.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the high
+    /// half of each 128-bit lane of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackHigh(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackHigh(left, right);
+        }
+
+        Vector128<byte> lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower());
+        Vector128<byte> hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane of <paramref name="left"/>
+    /// and <paramref name="right"/>; the lower and upper 128-bit lanes are processed independently.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed 8-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed 8-bit integers to unpack from the low half of each lane.
+    /// </param>
+    /// <returns>
+    /// A vector containing the unpacked and interleaved 8-bit integers from the low
+    /// half of each 128-bit lane of <paramref name="left"/> and <paramref name="right"/>.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> UnpackLow(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.UnpackLow(left, right);
+        }
+
+        Vector128<byte> lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+        Vector128<byte> hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+        return Vector256.Create(lo, hi);
+    }
+
+    /// <summary>
+    /// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+    /// in <paramref name="left"/> using saturation, and return the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed signed 16-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed signed 16-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed signed 16-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<short> SubtractSaturate(Vector256<short> left, Vector256<short> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.SubtractSaturate(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+    }
+
+    /// <summary>
+    /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
+    /// in <paramref name="left"/> using saturation, and return the results.
+    /// </summary>
+    /// <param name="left">
+    /// The first vector containing packed unsigned 8-bit integers to subtract from.
+    /// </param>
+    /// <param name="right">
+    /// The second vector containing packed unsigned 8-bit integers to subtract.
+    /// </param>
+    /// <returns>
+    /// A vector containing the saturated differences of the packed unsigned 8-bit integers.
+    /// </returns>
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    public static Vector256<byte> SubtractSaturate(Vector256<byte> left, Vector256<byte> right)
+    {
+        if (Avx2.IsSupported)
+        {
+            return Avx2.SubtractSaturate(left, right);
+        }
+
+        return Vector256.Create(
+            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+    }
+
     [DoesNotReturn]
     private static void ThrowUnreachableException() => throw new UnreachableException();
 }
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 7d186cd65..4e61242c0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -20,12 +20,12 @@ internal static class LossyUtils
     {
         if (Avx2.IsSupported)
         {
-            return Vp8_Sse16xN_Avx2(a, b, 4);
+            return Vp8_Sse16xN_Vector256(a, b, 4);
         }
 
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
-            return Vp8_Sse16xN_Sse2(a, b, 8);
+            return Vp8_16xN_Vector128(a, b, 8);
         }
 
         if (AdvSimd.IsSupported)
@@ -40,14 +40,14 @@ internal static class LossyUtils
     [MethodImpl(InliningOptions.ShortMethod)]
     public static int Vp8_Sse16x8(Span<byte> a, Span<byte> b)
     {
-        if (Avx2.IsSupported)
+        if (Vector256.IsHardwareAccelerated)
         {
-            return Vp8_Sse16xN_Avx2(a, b, 2);
+            return Vp8_Sse16xN_Vector256(a, b, 2);
         }
 
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
-            return Vp8_Sse16xN_Sse2(a, b, 4);
+            return Vp8_16xN_Vector128(a, b, 4);
         }
 
         if (AdvSimd.IsSupported)
@@ -81,21 +81,21 @@ internal static class LossyUtils
                 Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)));
 
             // Combine pair of lines.
-            Vector256<int> a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32());
-            Vector256<int> b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32());
+            Vector256<int> a01 = Vector256_.UnpackLow(a0.AsInt32(), a1.AsInt32());
+            Vector256<int> b01 = Vector256_.UnpackLow(b0.AsInt32(), b1.AsInt32());
 
             // Convert to 16b.
-            Vector256<byte> a01s = Avx2.UnpackLow(a01.AsByte(), Vector256<byte>.Zero);
-            Vector256<byte> b01s = Avx2.UnpackLow(b01.AsByte(), Vector256<byte>.Zero);
+            Vector256<byte> a01s = Vector256_.UnpackLow(a01.AsByte(), Vector256<byte>.Zero);
+            Vector256<byte> b01s = Vector256_.UnpackLow(b01.AsByte(), Vector256<byte>.Zero);
 
             // subtract, square and accumulate.
-            Vector256<short> d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
-            Vector256<int> e0 = Avx2.MultiplyAddAdjacent(d0, d0);
+            Vector256<short> d0 = Vector256_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+            Vector256<int> e0 = Vector256_.MultiplyAddAdjacent(d0, d0);
 
-            return Numerics.ReduceSum(e0);
+            return ReduceSumVector256(e0);
         }
 
-        if (Sse2.IsSupported)
+        if (Vector128.IsHardwareAccelerated)
         {
             // Load values.
             ref byte aRef = ref MemoryMarshal.GetReference(a);
@@ -110,25 +110,25 @@ internal static class LossyUtils
             Vector128<byte> b3 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));
 
             // Combine pair of lines.
-            Vector128<int> a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
-            Vector128<int> a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
-            Vector128<int> b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
-            Vector128<int> b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());
+            Vector128<int> a01 = Vector128_.UnpackLow(a0.AsInt32(), a1.AsInt32());
+            Vector128<int> a23 = Vector128_.UnpackLow(a2.AsInt32(), a3.AsInt32());
+            Vector128<int> b01 = Vector128_.UnpackLow(b0.AsInt32(), b1.AsInt32());
+            Vector128<int> b23 = Vector128_.UnpackLow(b2.AsInt32(), b3.AsInt32());
 
             // Convert to 16b.
-            Vector128<byte> a01s = Sse2.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> a23s = Sse2.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b01s = Sse2.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
-            Vector128<byte> b23s = Sse2.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> a01s = Vector128_.UnpackLow(a01.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> a23s = Vector128_.UnpackLow(a23.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b01s = Vector128_.UnpackLow(b01.AsByte(), Vector128<byte>.Zero);
+            Vector128<byte> b23s = Vector128_.UnpackLow(b23.AsByte(), Vector128<byte>.Zero);
 
             // subtract, square and accumulate.
-            Vector128<short> d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+            Vector128<short> d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
             Vector128<short> d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
             Vector128<int> e0 = Sse2.MultiplyAddAdjacent(d0, d0);
             Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
             Vector128<int> sum = Sse2.Add(e0, e1);
 
-            return ReduceSum(sum);
+            return ReduceSumVector128(sum);
         }
 
         if (AdvSimd.IsSupported)
@@ -159,7 +159,7 @@ internal static class LossyUtils
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static int Vp8_Sse16xN_Sse2(Span<byte> a, Span<byte> b, int numPairs)
+    private static int Vp8_16xN_Vector128(Span<byte> a, Span<byte> b, int numPairs)
     {
         Vector128<int> sum = Vector128<int>.Zero;
         nuint offset = 0;
@@ -173,18 +173,18 @@ internal static class LossyUtils
             Vector128<byte> a1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps));
             Vector128<byte> b1 = Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps));
 
-            Vector128<int> sum1 = SubtractAndAccumulate(a0, b0);
-            Vector128<int> sum2 = SubtractAndAccumulate(a1, b1);
+            Vector128<int> sum1 = SubtractAndAccumulateVector128(a0, b0);
+            Vector128<int> sum2 = SubtractAndAccumulateVector128(a1, b1);
             sum += sum1 + sum2;
 
             offset += 2 * WebpConstants.Bps;
         }
 
-        return ReduceSum(sum);
+        return ReduceSumVector128(sum);
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static int Vp8_Sse16xN_Avx2(Span<byte> a, Span<byte> b, int numPairs)
+    private static int Vp8_Sse16xN_Vector256(Span<byte> a, Span<byte> b, int numPairs)
     {
         Vector256<int> sum = Vector256<int>.Zero;
         nuint offset = 0;
@@ -206,14 +206,14 @@ internal static class LossyUtils
                 Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))),
                 Unsafe.As<byte, Vector128<byte>>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps))));
 
-            Vector256<int> sum1 = SubtractAndAccumulate(a0, b0);
-            Vector256<int> sum2 = SubtractAndAccumulate(a1, b1);
-            sum = Avx2.Add(sum, Avx2.Add(sum1, sum2));
+            Vector256<int> sum1 = SubtractAndAccumulateVector256(a0, b0);
+            Vector256<int> sum2 = SubtractAndAccumulateVector256(a1, b1);
+            sum += sum1 + sum2;
 
             offset += 4 * WebpConstants.Bps;
         }
 
-        return Numerics.ReduceSum(sum);
+        return ReduceSumVector256(sum);
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
@@ -306,41 +306,41 @@ internal static class LossyUtils
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static Vector128<int> SubtractAndAccumulate(Vector128<byte> a, Vector128<byte> b)
+    private static Vector128<int> SubtractAndAccumulateVector128(Vector128<byte> a, Vector128<byte> b)
     {
         // Take abs(a-b) in 8b.
-        Vector128<byte> ab = Sse2.SubtractSaturate(a, b);
-        Vector128<byte> ba = Sse2.SubtractSaturate(b, a);
-        Vector128<byte> absAb = Sse2.Or(ab, ba);
+        Vector128<byte> ab = Vector128_.SubtractSaturate(a, b);
+        Vector128<byte> ba = Vector128_.SubtractSaturate(b, a);
+        Vector128<byte> absAb = ab | ba;
 
         // Zero-extend to 16b.
-        Vector128<byte> c0 = Sse2.UnpackLow(absAb, Vector128<byte>.Zero);
-        Vector128<byte> c1 = Sse2.UnpackHigh(absAb, Vector128<byte>.Zero);
+        Vector128<byte> c0 = Vector128_.UnpackLow(absAb, Vector128<byte>.Zero);
+        Vector128<byte> c1 = Vector128_.UnpackHigh(absAb, Vector128<byte>.Zero);
 
         // Multiply with self.
-        Vector128<int> sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
-        Vector128<int> sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
+        Vector128<int> sum1 = Vector128_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
+        Vector128<int> sum2 = Vector128_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
 
-        return Sse2.Add(sum1, sum2);
+        return sum1 + sum2;
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static Vector256<int> SubtractAndAccumulate(Vector256<byte> a, Vector256<byte> b)
+    private static Vector256<int> SubtractAndAccumulateVector256(Vector256<byte> a, Vector256<byte> b)
     {
         // Take abs(a-b) in 8b.
-        Vector256<byte> ab = Avx2.SubtractSaturate(a, b);
-        Vector256<byte> ba = Avx2.SubtractSaturate(b, a);
+        Vector256<byte> ab = Vector256_.SubtractSaturate(a, b);
+        Vector256<byte> ba = Vector256_.SubtractSaturate(b, a);
         Vector256<byte> absAb = Avx2.Or(ab, ba);
 
         // Zero-extend to 16b.
-        Vector256<byte> c0 = Avx2.UnpackLow(absAb, Vector256<byte>.Zero);
-        Vector256<byte> c1 = Avx2.UnpackHigh(absAb, Vector256<byte>.Zero);
+        Vector256<byte> c0 = Vector256_.UnpackLow(absAb, Vector256<byte>.Zero);
+        Vector256<byte> c1 = Vector256_.UnpackHigh(absAb, Vector256<byte>.Zero);
 
         // Multiply with self.
-        Vector256<int> sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
-        Vector256<int> sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
+        Vector256<int> sum1 = Vector256_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
+        Vector256<int> sum2 = Vector256_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
 
-        return Avx2.Add(sum1, sum2);
+        return sum1 + sum2;
     }
 
     [MethodImpl(InliningOptions.ShortMethod)]
@@ -990,7 +990,7 @@ internal static class LossyUtils
         // difference of weighted sums.
         Vector128<int> result = ab0ab2Sum - b0w0bb2w8Sum;
 
-        return ReduceSum(result);
+        return ReduceSumVector128(result);
     }
 
     // Transpose two 4x4 16b matrices horizontally stored in registers.
@@ -1916,7 +1916,27 @@ internal static class LossyUtils
     /// <param name="accumulator">The accumulator to reduce.</param>
     /// <returns>The sum of all elements.</returns>
     [MethodImpl(InliningOptions.ShortMethod)]
-    private static int ReduceSum(Vector128<int> accumulator)
+    public static int ReduceSumVector256(Vector256<int> accumulator)
+    {
+        // Fold the upper 128-bit half onto the lower half, leaving four partial sums.
+        Vector128<int> vsum = accumulator.GetLower() + accumulator.GetUpper();
+
+        // Add odd elements to even elements.
+        vsum += Vector128_.ShuffleNative(vsum, 0b_11_11_01_01);
+
+        // Add the high pair to the low pair; the total is read from element 0 below.
+        vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10);
+
+        return vsum.ToScalar();
+    }
+
+    /// <summary>
+    /// Reduces elements of the vector into one sum.
+    /// </summary>
+    /// <param name="accumulator">The accumulator to reduce.</param>
+    /// <returns>The sum of all elements.</returns>
+    [MethodImpl(InliningOptions.ShortMethod)]
+    private static int ReduceSumVector128(Vector128<int> accumulator)
     {
         // Add odd to even.
         Vector128<int> vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01);