diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index c160b9560..c5e16faf9 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -711,6 +711,39 @@ internal static class Vector128_
return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
}
+ /// <summary>
+ /// Unpack and interleave 8-bit integers from the high half of <paramref name="left"/> and
+ /// <paramref name="right"/> and store the results in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 8-bit integers to unpack from the high half.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 8-bit integers to unpack from the high half.
+ /// </param>
+ /// <returns>
+ /// A vector containing the unpacked and interleaved 8-bit integers from the high
+ /// halves of <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackHigh(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipHigh(left, right);
+ }
+
+ // Software fallback: place both upper halves side by side, then interleave them with ONE shuffle.
+ Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+ Vector128 interleave = Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15);
+ return Vector128.Shuffle(unpacked, interleave);
+ }
+
///
/// Unpack and interleave 8-bit integers from the low half of and
/// and store the results in the result.
@@ -744,6 +777,56 @@ internal static class Vector128_
Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15)));
}
+ /// <summary>
+ /// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+ /// in <paramref name="left"/> using saturation, and store the results.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed signed 16-bit integers to subtract from.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed signed 16-bit integers to subtract.
+ /// </param>
+ /// <returns>
+ /// A vector containing the results of subtracting packed signed 16-bit integers, saturated to the signed 16-bit range.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 SubtractSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SubtractSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.SubtractSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.SubtractSaturate(left, right);
+ }
+
+ // Software fallback: widen inputs to 32-bit signed so the difference cannot overflow.
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Subtract in 32-bit precision.
+ Vector128 diffLo = leftLo - rightLo;
+ Vector128 diffHi = leftHi - rightHi;
+
+ // Clamp to the signed 16-bit range; this is the saturation step.
+ Vector128 shortMin = Vector128.Create((int)short.MinValue);
+ Vector128 shortMax = Vector128.Create((int)short.MaxValue);
+
+ diffLo = Clamp(diffLo, shortMin, shortMax);
+ diffHi = Clamp(diffHi, shortMin, shortMax);
+
+ // Narrow back to 16-bit signed; lossless because the values were clamped above.
+ return Vector128.Narrow(diffLo, diffHi);
+ }
+
///
/// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers
/// in using saturation, and store the results.
@@ -775,7 +858,7 @@ internal static class Vector128_
return PackedSimd.SubtractSaturate(left, right);
}
- // Widen inputs to 16-bit to safely compute unsigned differences without underflow
+ // Widen inputs to 16-bit
(Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
(Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
@@ -783,13 +866,11 @@ internal static class Vector128_
Vector128 diffLo = leftLo - rightLo;
Vector128 diffHi = leftHi - rightHi;
- // Mask lanes where left >= right to preserve the result
- // All other lanes are zeroed (saturate to 0)
- Vector128 maskLo = Vector128.GreaterThanOrEqual(leftLo, rightLo).AsUInt16();
- Vector128 maskHi = Vector128.GreaterThanOrEqual(leftHi, rightHi).AsUInt16();
+ // Saturate at zero: the unsigned 16-bit difference wraps around when left < right,
+ // so keep the difference only in lanes where left >= right (it can never exceed 255 there).
- diffLo &= maskLo;
- diffHi &= maskHi;
+ diffLo &= Vector128.GreaterThanOrEqual(leftLo, rightLo);
+ diffHi &= Vector128.GreaterThanOrEqual(leftHi, rightHi);
// Narrow back to bytes
return Vector128.Narrow(diffLo, diffHi);
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index dfefd2d34..71dfadc39 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -162,6 +162,33 @@ internal static class Vector256_
return (vm0 * vm1) - vs;
}
+ /// <summary>
+ /// Multiply packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
+ /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and
+ /// pack the results.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed signed 16-bit integers to multiply and add.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed signed 16-bit integers to multiply and add.
+ /// </param>
+ /// <returns>
+ /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 MultiplyAddAdjacent(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.MultiplyAddAdjacent(left, right);
+ }
+
+ return Vector256.Create(
+ Vector128_.MultiplyAddAdjacent(left.GetLower(), right.GetLower()),
+ Vector128_.MultiplyAddAdjacent(left.GetUpper(), right.GetUpper()));
+ }
+
///
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
///
@@ -303,6 +330,142 @@ internal static class Vector256_
return Vector256.Narrow(prodLo, prodHi);
}
+ /// <summary>
+ /// Unpack and interleave 32-bit integers from the low half of each 128-bit lane of <paramref name="left"/> and
+ /// <paramref name="right"/> and store the results in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 32-bit integers to unpack from the low half of each 128-bit lane.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 32-bit integers to unpack from the low half of each 128-bit lane.
+ /// </param>
+ /// <returns>
+ /// A vector containing the unpacked and interleaved 32-bit integers from the low
+ /// halves of the 128-bit lanes of <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 UnpackLow(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.UnpackLow(left, right);
+ }
+
+ Vector128 lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+ Vector128 hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+ return Vector256.Create(lo, hi);
+ }
+
+ /// <summary>
+ /// Unpack and interleave 8-bit integers from the high half of each 128-bit lane of <paramref name="left"/> and
+ /// <paramref name="right"/> and store the results in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 8-bit integers to unpack from the high half of each 128-bit lane.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 8-bit integers to unpack from the high half of each 128-bit lane.
+ /// </param>
+ /// <returns>
+ /// A vector containing the unpacked and interleaved 8-bit integers from the high
+ /// halves of the 128-bit lanes of <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 UnpackHigh(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.UnpackHigh(left, right);
+ }
+
+ Vector128 lo = Vector128_.UnpackHigh(left.GetLower(), right.GetLower());
+ Vector128 hi = Vector128_.UnpackHigh(left.GetUpper(), right.GetUpper());
+
+ return Vector256.Create(lo, hi);
+ }
+
+ /// <summary>
+ /// Unpack and interleave 8-bit integers from the low half of each 128-bit lane of <paramref name="left"/> and
+ /// <paramref name="right"/> and store the results in the result.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed 8-bit integers to unpack from the low half of each 128-bit lane.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed 8-bit integers to unpack from the low half of each 128-bit lane.
+ /// </param>
+ /// <returns>
+ /// A vector containing the unpacked and interleaved 8-bit integers from the low
+ /// halves of the 128-bit lanes of <paramref name="left"/> and <paramref name="right"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 UnpackLow(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.UnpackLow(left, right);
+ }
+
+ Vector128 lo = Vector128_.UnpackLow(left.GetLower(), right.GetLower());
+ Vector128 hi = Vector128_.UnpackLow(left.GetUpper(), right.GetUpper());
+
+ return Vector256.Create(lo, hi);
+ }
+
+ /// <summary>
+ /// Subtract packed signed 16-bit integers in <paramref name="right"/> from packed signed 16-bit integers
+ /// in <paramref name="left"/> using saturation, and store the results.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed signed 16-bit integers to subtract from.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed signed 16-bit integers to subtract.
+ /// </param>
+ /// <returns>
+ /// A vector containing the results of subtracting packed signed 16-bit integers
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 SubtractSaturate(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.SubtractSaturate(left, right);
+ }
+
+ return Vector256.Create(
+ Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+ Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+ }
+
+ /// <summary>
+ /// Subtract packed unsigned 8-bit integers in <paramref name="right"/> from packed unsigned 8-bit integers
+ /// in <paramref name="left"/> using saturation, and store the results.
+ /// </summary>
+ /// <param name="left">
+ /// The first vector containing packed unsigned 8-bit integers to subtract from.
+ /// </param>
+ /// <param name="right">
+ /// The second vector containing packed unsigned 8-bit integers to subtract.
+ /// </param>
+ /// <returns>
+ /// A vector containing the results of subtracting packed unsigned 8-bit integers
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector256 SubtractSaturate(Vector256 left, Vector256 right)
+ {
+ if (Avx2.IsSupported)
+ {
+ return Avx2.SubtractSaturate(left, right);
+ }
+
+ return Vector256.Create(
+ Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
+ Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
+ }
+
[DoesNotReturn]
private static void ThrowUnreachableException() => throw new UnreachableException();
}
diff --git a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
index 7d186cd65..4e61242c0 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs
@@ -20,12 +20,12 @@ internal static class LossyUtils
{
- if (Avx2.IsSupported)
+ if (Vector256.IsHardwareAccelerated)
{
- return Vp8_Sse16xN_Avx2(a, b, 4);
+ return Vp8_Sse16xN_Vector256(a, b, 4);
}
- if (Sse2.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
- return Vp8_Sse16xN_Sse2(a, b, 8);
+ return Vp8_16xN_Vector128(a, b, 8);
}
if (AdvSimd.IsSupported)
@@ -40,14 +40,14 @@ internal static class LossyUtils
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8_Sse16x8(Span a, Span b)
{
- if (Avx2.IsSupported)
+ if (Vector256.IsHardwareAccelerated)
{
- return Vp8_Sse16xN_Avx2(a, b, 2);
+ return Vp8_Sse16xN_Vector256(a, b, 2);
}
- if (Sse2.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
- return Vp8_Sse16xN_Sse2(a, b, 4);
+ return Vp8_16xN_Vector128(a, b, 4);
}
if (AdvSimd.IsSupported)
@@ -81,21 +81,21 @@ internal static class LossyUtils
Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3)));
// Combine pair of lines.
- Vector256 a01 = Avx2.UnpackLow(a0.AsInt32(), a1.AsInt32());
- Vector256 b01 = Avx2.UnpackLow(b0.AsInt32(), b1.AsInt32());
+ Vector256 a01 = Vector256_.UnpackLow(a0.AsInt32(), a1.AsInt32());
+ Vector256 b01 = Vector256_.UnpackLow(b0.AsInt32(), b1.AsInt32());
// Convert to 16b.
- Vector256 a01s = Avx2.UnpackLow(a01.AsByte(), Vector256.Zero);
- Vector256 b01s = Avx2.UnpackLow(b01.AsByte(), Vector256.Zero);
+ Vector256 a01s = Vector256_.UnpackLow(a01.AsByte(), Vector256.Zero);
+ Vector256 b01s = Vector256_.UnpackLow(b01.AsByte(), Vector256.Zero);
// subtract, square and accumulate.
- Vector256 d0 = Avx2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
- Vector256 e0 = Avx2.MultiplyAddAdjacent(d0, d0);
+ Vector256 d0 = Vector256_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+ Vector256 e0 = Vector256_.MultiplyAddAdjacent(d0, d0);
- return Numerics.ReduceSum(e0);
+ return ReduceSumVector256(e0);
}
- if (Sse2.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
// Load values.
ref byte aRef = ref MemoryMarshal.GetReference(a);
@@ -110,25 +110,25 @@ internal static class LossyUtils
Vector128 b3 = Unsafe.As>(ref Unsafe.Add(ref bRef, WebpConstants.Bps * 3));
// Combine pair of lines.
- Vector128 a01 = Sse2.UnpackLow(a0.AsInt32(), a1.AsInt32());
- Vector128 a23 = Sse2.UnpackLow(a2.AsInt32(), a3.AsInt32());
- Vector128 b01 = Sse2.UnpackLow(b0.AsInt32(), b1.AsInt32());
- Vector128 b23 = Sse2.UnpackLow(b2.AsInt32(), b3.AsInt32());
+ Vector128 a01 = Vector128_.UnpackLow(a0.AsInt32(), a1.AsInt32());
+ Vector128 a23 = Vector128_.UnpackLow(a2.AsInt32(), a3.AsInt32());
+ Vector128 b01 = Vector128_.UnpackLow(b0.AsInt32(), b1.AsInt32());
+ Vector128 b23 = Vector128_.UnpackLow(b2.AsInt32(), b3.AsInt32());
// Convert to 16b.
- Vector128 a01s = Sse2.UnpackLow(a01.AsByte(), Vector128.Zero);
- Vector128 a23s = Sse2.UnpackLow(a23.AsByte(), Vector128.Zero);
- Vector128 b01s = Sse2.UnpackLow(b01.AsByte(), Vector128.Zero);
- Vector128 b23s = Sse2.UnpackLow(b23.AsByte(), Vector128.Zero);
+ Vector128 a01s = Vector128_.UnpackLow(a01.AsByte(), Vector128.Zero);
+ Vector128 a23s = Vector128_.UnpackLow(a23.AsByte(), Vector128.Zero);
+ Vector128 b01s = Vector128_.UnpackLow(b01.AsByte(), Vector128.Zero);
+ Vector128 b23s = Vector128_.UnpackLow(b23.AsByte(), Vector128.Zero);
// subtract, square and accumulate.
- Vector128 d0 = Sse2.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
- Vector128 d1 = Sse2.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
- Vector128 e0 = Sse2.MultiplyAddAdjacent(d0, d0);
- Vector128 e1 = Sse2.MultiplyAddAdjacent(d1, d1);
- Vector128 sum = Sse2.Add(e0, e1);
+ Vector128 d0 = Vector128_.SubtractSaturate(a01s.AsInt16(), b01s.AsInt16());
+ Vector128 d1 = Vector128_.SubtractSaturate(a23s.AsInt16(), b23s.AsInt16());
+ Vector128 e0 = Vector128_.MultiplyAddAdjacent(d0, d0);
+ Vector128 e1 = Vector128_.MultiplyAddAdjacent(d1, d1);
+ Vector128 sum = e0 + e1;
- return ReduceSum(sum);
+ return ReduceSumVector128(sum);
}
if (AdvSimd.IsSupported)
@@ -159,7 +159,7 @@ internal static class LossyUtils
}
[MethodImpl(InliningOptions.ShortMethod)]
- private static int Vp8_Sse16xN_Sse2(Span a, Span b, int numPairs)
+ private static int Vp8_16xN_Vector128(Span a, Span b, int numPairs)
{
Vector128 sum = Vector128.Zero;
nuint offset = 0;
@@ -173,18 +173,18 @@ internal static class LossyUtils
Vector128 a1 = Unsafe.As>(ref Unsafe.Add(ref aRef, offset + WebpConstants.Bps));
Vector128 b1 = Unsafe.As>(ref Unsafe.Add(ref bRef, offset + WebpConstants.Bps));
- Vector128 sum1 = SubtractAndAccumulate(a0, b0);
- Vector128 sum2 = SubtractAndAccumulate(a1, b1);
+ Vector128 sum1 = SubtractAndAccumulateVector128(a0, b0);
+ Vector128 sum2 = SubtractAndAccumulateVector128(a1, b1);
sum += sum1 + sum2;
offset += 2 * WebpConstants.Bps;
}
- return ReduceSum(sum);
+ return ReduceSumVector128(sum);
}
[MethodImpl(InliningOptions.ShortMethod)]
- private static int Vp8_Sse16xN_Avx2(Span a, Span b, int numPairs)
+ private static int Vp8_Sse16xN_Vector256(Span a, Span b, int numPairs)
{
Vector256 sum = Vector256.Zero;
nuint offset = 0;
@@ -206,14 +206,14 @@ internal static class LossyUtils
Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (2 * WebpConstants.Bps))),
Unsafe.As>(ref Unsafe.Add(ref bRef, offset + (3 * WebpConstants.Bps))));
- Vector256 sum1 = SubtractAndAccumulate(a0, b0);
- Vector256 sum2 = SubtractAndAccumulate(a1, b1);
- sum = Avx2.Add(sum, Avx2.Add(sum1, sum2));
+ Vector256 sum1 = SubtractAndAccumulateVector256(a0, b0);
+ Vector256 sum2 = SubtractAndAccumulateVector256(a1, b1);
+ sum += sum1 + sum2;
offset += 4 * WebpConstants.Bps;
}
- return Numerics.ReduceSum(sum);
+ return ReduceSumVector256(sum);
}
[MethodImpl(InliningOptions.ShortMethod)]
@@ -306,41 +306,41 @@ internal static class LossyUtils
}
[MethodImpl(InliningOptions.ShortMethod)]
- private static Vector128 SubtractAndAccumulate(Vector128 a, Vector128 b)
+ private static Vector128 SubtractAndAccumulateVector128(Vector128 a, Vector128 b)
{
// Take abs(a-b) in 8b.
- Vector128 ab = Sse2.SubtractSaturate(a, b);
- Vector128 ba = Sse2.SubtractSaturate(b, a);
- Vector128 absAb = Sse2.Or(ab, ba);
+ Vector128 ab = Vector128_.SubtractSaturate(a, b);
+ Vector128 ba = Vector128_.SubtractSaturate(b, a);
+ Vector128 absAb = ab | ba;
// Zero-extend to 16b.
- Vector128 c0 = Sse2.UnpackLow(absAb, Vector128.Zero);
- Vector128 c1 = Sse2.UnpackHigh(absAb, Vector128.Zero);
+ Vector128 c0 = Vector128_.UnpackLow(absAb, Vector128.Zero);
+ Vector128 c1 = Vector128_.UnpackHigh(absAb, Vector128.Zero);
// Multiply with self.
- Vector128 sum1 = Sse2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
- Vector128 sum2 = Sse2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
+ Vector128 sum1 = Vector128_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
+ Vector128 sum2 = Vector128_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
- return Sse2.Add(sum1, sum2);
+ return sum1 + sum2;
}
[MethodImpl(InliningOptions.ShortMethod)]
- private static Vector256 SubtractAndAccumulate(Vector256 a, Vector256 b)
+ private static Vector256 SubtractAndAccumulateVector256(Vector256 a, Vector256 b)
{
// Take abs(a-b) in 8b.
- Vector256 ab = Avx2.SubtractSaturate(a, b);
- Vector256 ba = Avx2.SubtractSaturate(b, a);
- Vector256 absAb = Avx2.Or(ab, ba);
+ Vector256 ab = Vector256_.SubtractSaturate(a, b);
+ Vector256 ba = Vector256_.SubtractSaturate(b, a);
+ Vector256 absAb = ab | ba;
// Zero-extend to 16b.
- Vector256 c0 = Avx2.UnpackLow(absAb, Vector256.Zero);
- Vector256 c1 = Avx2.UnpackHigh(absAb, Vector256.Zero);
+ Vector256 c0 = Vector256_.UnpackLow(absAb, Vector256.Zero);
+ Vector256 c1 = Vector256_.UnpackHigh(absAb, Vector256.Zero);
// Multiply with self.
- Vector256 sum1 = Avx2.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
- Vector256 sum2 = Avx2.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
+ Vector256 sum1 = Vector256_.MultiplyAddAdjacent(c0.AsInt16(), c0.AsInt16());
+ Vector256 sum2 = Vector256_.MultiplyAddAdjacent(c1.AsInt16(), c1.AsInt16());
- return Avx2.Add(sum1, sum2);
+ return sum1 + sum2;
}
[MethodImpl(InliningOptions.ShortMethod)]
@@ -990,7 +990,7 @@ internal static class LossyUtils
// difference of weighted sums.
Vector128 result = ab0ab2Sum - b0w0bb2w8Sum;
- return ReduceSum(result);
+ return ReduceSumVector128(result);
}
// Transpose two 4x4 16b matrices horizontally stored in registers.
@@ -1916,7 +1916,27 @@ internal static class LossyUtils
/// The accumulator to reduce.
/// The sum of all elements.
[MethodImpl(InliningOptions.ShortMethod)]
- private static int ReduceSum(Vector128 accumulator)
+ public static int ReduceSumVector256(Vector256 accumulator)
+ {
+ // Fold the upper 128-bit half onto the lower half, leaving four partial sums.
+ Vector128 vsum = accumulator.GetLower() + accumulator.GetUpper();
+
+ // Add odd elements to even ones (presumably SSE-style control: selects elements 1, 1, 3, 3 - confirm in ShuffleNative).
+ vsum += Vector128_.ShuffleNative(vsum, 0b_11_11_01_01);
+
+ // Add the high pair to the low pair (presumably selects elements 2, 3, 2, 3); element 0 then holds the total.
+ vsum += Vector128_.ShuffleNative(vsum, 0b_11_10_11_10);
+
+ return vsum.ToScalar();
+ }
+
+ /// <summary>
+ /// Reduces elements of the vector into one sum.
+ /// </summary>
+ /// <param name="accumulator">The accumulator to reduce.</param>
+ /// <returns>The sum of all elements.</returns>
+ [MethodImpl(InliningOptions.ShortMethod)]
+ private static int ReduceSumVector128(Vector128 accumulator)
{
// Add odd to even.
Vector128 vsum = accumulator + Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01);