diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index ca14ae4c3..5f91dcd99 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -884,23 +884,6 @@ internal static class Numerics
accumulator += intHigh;
}
- ///
- /// Reduces elements of the vector into one sum.
- ///
- /// The accumulator to reduce.
- /// The sum of all elements.
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static int ReduceSum(Vector128 accumulator)
- {
- // Add odd to even.
- Vector128 vsum = Sse2.Add(accumulator, Sse2.Shuffle(accumulator, 0b_11_11_01_01));
-
- // Add high to low.
- vsum = Sse2.Add(vsum, Sse2.Shuffle(vsum, 0b_11_10_11_10));
-
- return Sse2.ConvertToInt32(vsum);
- }
-
///
/// Reduces elements of the vector into one sum.
///
diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
index 96ddb7976..ff5ea5de3 100644
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@@ -66,9 +66,9 @@ internal static partial class SimdUtils
ref Span destination,
[ConstantExpected] byte control)
{
- if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat) ||
- (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat) ||
- Vector128.IsHardwareAccelerated)
+ if (Vector512.IsHardwareAccelerated ||
+ Vector256.IsHardwareAccelerated ||
+ Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@@ -112,9 +112,9 @@ internal static partial class SimdUtils
ref Span destination,
[ConstantExpected] byte control)
{
- if ((Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte) ||
- (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte) ||
- (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte))
+ if (Vector512.IsHardwareAccelerated ||
+ Vector256.IsHardwareAccelerated ||
+ Vector128.IsHardwareAccelerated)
{
int remainder = 0;
if (Vector512.IsHardwareAccelerated)
@@ -158,7 +158,7 @@ internal static partial class SimdUtils
ref Span destination,
[ConstantExpected] byte control)
{
- if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsAlignRight)
+ if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length % (Vector128.Count * 3);
@@ -190,7 +190,7 @@ internal static partial class SimdUtils
ref Span destination,
[ConstantExpected] byte control)
{
- if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+ if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length % (Vector128.Count * 3);
@@ -223,7 +223,7 @@ internal static partial class SimdUtils
ref Span destination,
[ConstantExpected] byte control)
{
- if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte && Vector128_.SupportsShiftByte)
+ if (Vector128.IsHardwareAccelerated)
{
int remainder = source.Length & ((Vector128.Count * 4) - 1); // bit-hack for modulo
@@ -249,7 +249,7 @@ internal static partial class SimdUtils
Span destination,
[ConstantExpected] byte control)
{
- if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeFloat)
+ if (Vector512.IsHardwareAccelerated)
{
ref Vector512 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
ref Vector512 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
@@ -277,7 +277,7 @@ internal static partial class SimdUtils
}
}
}
- else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeFloat)
+ else if (Vector256.IsHardwareAccelerated)
{
ref Vector256 sourceBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(source));
ref Vector256 destinationBase = ref Unsafe.As>(ref MemoryMarshal.GetReference(destination));
@@ -341,7 +341,7 @@ internal static partial class SimdUtils
Span destination,
[ConstantExpected] byte control)
{
- if (Vector512.IsHardwareAccelerated && Vector512_.SupportsShuffleNativeByte)
+ if (Vector512.IsHardwareAccelerated)
{
Span temp = stackalloc byte[Vector512.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -373,8 +373,13 @@ internal static partial class SimdUtils
}
}
}
- else if (Vector256.IsHardwareAccelerated && Vector256_.SupportsShuffleNativeByte)
+ else if (Vector256.IsHardwareAccelerated)
{
+ // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
+ // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
+ // so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
+ // for indexing within each lane, and ignores the upper bits unless bit 7 is set,
+ // this usage is guaranteed to remain within-lane and non-zeroing.
Span temp = stackalloc byte[Vector256.Count];
Shuffle.MMShuffleSpan(ref temp, control);
Vector256 mask = Unsafe.As>(ref MemoryMarshal.GetReference(temp));
@@ -391,21 +396,21 @@ internal static partial class SimdUtils
ref Vector256 vs0 = ref Unsafe.Add(ref sourceBase, i);
ref Vector256 vd0 = ref Unsafe.Add(ref destinationBase, i);
- vd0 = Vector256_.ShuffleNative(vs0, mask);
- Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
- Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
- Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
+ vd0 = Vector256_.ShufflePerLane(vs0, mask);
+ Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
+ Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
+ Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
}
if (m > 0)
{
for (nuint i = u; i < n; i++)
{
- Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
+ Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
}
}
}
- else if (Vector128.IsHardwareAccelerated && Vector128_.SupportsShuffleNativeByte)
+ else if (Vector128.IsHardwareAccelerated)
{
Span temp = stackalloc byte[Vector128.Count];
Shuffle.MMShuffleSpan(ref temp, control);
@@ -445,9 +450,7 @@ internal static partial class SimdUtils
Span destination,
[ConstantExpected] byte control)
{
- if (Vector128.IsHardwareAccelerated &&
- Vector128_.SupportsShuffleNativeByte &&
- Vector128_.SupportsAlignRight)
+ if (Vector128.IsHardwareAccelerated)
{
Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
@@ -507,10 +510,7 @@ internal static partial class SimdUtils
Span destination,
[ConstantExpected] byte control)
{
- if (Vector128.IsHardwareAccelerated &&
- Vector128_.SupportsShuffleNativeByte &&
- Vector128_.SupportsShiftByte &&
- Vector128_.SupportsAlignRight)
+ if (Vector128.IsHardwareAccelerated)
{
Vector128 maskPad4Nx16 = ShuffleMaskPad4Nx16();
Vector128 fill = Vector128.Create(0xff000000ff000000ul).AsByte();
@@ -553,10 +553,7 @@ internal static partial class SimdUtils
Span destination,
[ConstantExpected] byte control)
{
- if (Vector128.IsHardwareAccelerated &&
- Vector128_.SupportsShuffleNativeByte &&
- Vector128_.SupportsShiftByte &&
- Vector128_.SupportsAlignRight)
+ if (Vector128.IsHardwareAccelerated)
{
Vector128 maskSlice4Nx16 = ShuffleMaskSlice4Nx16();
Vector128 maskE = Vector128_.AlignRight(maskSlice4Nx16, maskSlice4Nx16, 12);
diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index dbe0a1fce..a5d377eb9 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -1,10 +1,8 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
-using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.Wasm;
@@ -25,43 +23,34 @@ internal static class Vector128_
#pragma warning restore SA1649 // File name should match first type name
{
///
- /// Gets a value indicating whether shuffle operations are supported.
+ /// Average packed unsigned 8-bit integers in and , and store the results.
///
- public static bool SupportsShuffleNativeByte
+ ///
+ /// The first vector containing packed unsigned 8-bit integers to average.
+ ///
+ ///
+ /// The second vector containing packed unsigned 8-bit integers to average.
+ ///
+ ///
+ /// A vector containing the average of the packed unsigned 8-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 Average(Vector128 left, Vector128 right)
{
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- get
+ if (Sse2.IsSupported)
{
- if (Vector128.IsHardwareAccelerated)
- {
- if (RuntimeInformation.ProcessArchitecture is Architecture.X86 or Architecture.X64)
- {
- return Ssse3.IsSupported;
- }
-
- return true;
- }
-
- return false;
+ return Sse2.Average(left, right);
}
- }
- ///
- /// Gets a value indicating whether right align operations are supported.
- ///
- public static bool SupportsAlignRight
- {
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- get => Ssse3.IsSupported || AdvSimd.IsSupported;
- }
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.FusedAddRoundedHalving(left, right);
+ }
- ///
- /// Gets a value indicating whether right or left byte shift operations are supported.
- ///
- public static bool SupportsShiftByte
- {
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- get => Sse2.IsSupported || AdvSimd.IsSupported;
+ // Account for potential 9th bit to ensure correct rounded result.
+ return Vector128.Narrow(
+ (Vector128.WidenLower(left) + Vector128.WidenLower(right) + Vector128.One) >> 1,
+ (Vector128.WidenUpper(left) + Vector128.WidenUpper(right) + Vector128.One) >> 1);
}
///
@@ -88,6 +77,87 @@ internal static class Vector128_
return Vector128.Shuffle(vector, indices);
}
+ ///
+ /// Creates a new vector by selecting values from an input vector using the control.
+ ///
+ /// The input vector from which values are selected.
+ /// The shuffle control byte.
+ /// The .
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 ShuffleNative(Vector128 vector, [ConstantExpected] byte control)
+ {
+ // Don't use InverseMMShuffle here as we want to avoid the cast.
+ Vector128 indices = Vector128.Create(
+ control & 0x3,
+ (control >> 2) & 0x3,
+ (control >> 4) & 0x3,
+ (control >> 6) & 0x3);
+
+ return Vector128.Shuffle(vector, indices);
+ }
+
+ ///
+ /// Shuffle 16-bit integers in the high 64 bits of using the control in .
+ /// Store the results in the high 64 bits of the destination, with the low 64 bits being copied from .
+ ///
+ /// The input vector containing packed 16-bit integers to shuffle.
+ /// The shuffle control byte.
+ ///
+ /// A vector containing the shuffled 16-bit integers in the high 64 bits, with the low 64 bits copied from .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 ShuffleHigh(Vector128 value, [ConstantExpected] byte control)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.ShuffleHigh(value, control);
+ }
+
+ // Don't use InverseMMShuffle here as we want to avoid the cast.
+ Vector128 indices = Vector128.Create(
+ 0,
+ 1,
+ 2,
+ 3,
+ (short)((control & 0x3) + 4),
+ (short)(((control >> 2) & 0x3) + 4),
+ (short)(((control >> 4) & 0x3) + 4),
+ (short)(((control >> 6) & 0x3) + 4));
+
+ return Vector128.Shuffle(value, indices);
+ }
+
+ ///
+ /// Shuffle 16-bit integers in the low 64 bits of using the control in .
+ /// Store the results in the low 64 bits of the destination, with the high 64 bits being copied from .
+ ///
+ /// The input vector containing packed 16-bit integers to shuffle.
+ /// The shuffle control byte.
+ ///
+ /// A vector containing the shuffled 16-bit integers in the low 64 bits, with the high 64 bits copied from .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 ShuffleLow(Vector128 value, [ConstantExpected] byte control)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.ShuffleLow(value, control);
+ }
+
+ // Don't use InverseMMShuffle here as we want to avoid the cast.
+ Vector128 indices = Vector128.Create(
+ (short)(control & 0x3),
+ (short)((control >> 2) & 0x3),
+ (short)((control >> 4) & 0x3),
+ (short)((control >> 6) & 0x3),
+ 4,
+ 5,
+ 6,
+ 7);
+
+ return Vector128.Shuffle(value, indices);
+ }
+
///
/// Creates a new vector by selecting values from an input vector using a set of indices.
///
@@ -133,8 +203,7 @@ internal static class Vector128_
return AdvSimd.ExtractVector128(value, Vector128.Zero, numBytes);
}
- ThrowUnreachableException();
- return default;
+ return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + Vector128.Create(numBytes));
}
///
@@ -158,8 +227,28 @@ internal static class Vector128_
#pragma warning restore CA1857 // A constant is expected for the parameter
}
- ThrowUnreachableException();
- return default;
+ return Vector128.Shuffle(value, Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) - Vector128.Create(numBytes));
+ }
+
+ ///
+ /// Shift packed 16-bit integers in left by while
+ /// shifting in zeros, and store the results
+ ///
+ /// The vector containing packed 16-bit integers to shift.
+ /// The number of bits to shift left.
+ ///
+ /// A vector containing the packed 16-bit integers shifted left by , with zeros shifted in.
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 ShiftLeftLogical(Vector128 value, [ConstantExpected] byte count)
+ {
+ // Match SSE2 semantics: a shift count of 16 or more zeroes the entire vector.
+ if (count >= 16)
+ {
+ return Vector128.Zero;
+ }
+
+ return value << count;
}
///
@@ -182,8 +271,9 @@ internal static class Vector128_
return AdvSimd.ExtractVector128(right, left, mask);
}
- ThrowUnreachableException();
- return default;
+#pragma warning disable CA1857 // A constant is expected for the parameter
+ return ShiftLeftBytesInVector(left, (byte)(Vector128.Count - mask)) | ShiftRightBytesInVector(right, mask);
+#pragma warning restore CA1857 // A constant is expected for the parameter
}
///
@@ -304,6 +394,37 @@ internal static class Vector128_
return Vector128.Narrow(lefClamped, rightClamped);
}
+ ///
+ /// Packs signed 32-bit integers to unsigned 16-bit integers and saturates.
+ ///
+ /// The left hand source vector.
+ /// The right hand source vector.
+ /// The .
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 PackUnsignedSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse41.IsSupported)
+ {
+ return Sse41.PackUnsignedSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.ExtractNarrowingSaturateUnsignedUpper(AdvSimd.ExtractNarrowingSaturateUnsignedLower(left), right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.ConvertNarrowingSaturateUnsigned(left, right);
+ }
+
+ Vector128 min = Vector128.Create((int)ushort.MinValue);
+ Vector128 max = Vector128.Create((int)ushort.MaxValue);
+ Vector128 lefClamped = Clamp(left, min, max).AsUInt32();
+ Vector128 rightClamped = Clamp(right, min, max).AsUInt32();
+ return Vector128.Narrow(lefClamped, rightClamped);
+ }
+
///
/// Packs signed 32-bit integers to signed 16-bit integers and saturates.
///
@@ -335,6 +456,37 @@ internal static class Vector128_
return Vector128.Narrow(lefClamped, rightClamped);
}
+ ///
+ /// Packs signed 16-bit integers to signed 8-bit integers and saturates.
+ ///
+ /// The left hand source vector.
+ /// The right hand source vector.
+ /// The .
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 PackSignedSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.PackSignedSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.ExtractNarrowingSaturateUpper(AdvSimd.ExtractNarrowingSaturateLower(left), right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.ConvertNarrowingSaturateSigned(left, right);
+ }
+
+ Vector128 min = Vector128.Create((short)sbyte.MinValue);
+ Vector128 max = Vector128.Create((short)sbyte.MaxValue);
+ Vector128 lefClamped = Clamp(left, min, max);
+ Vector128 rightClamped = Clamp(right, min, max);
+ return Vector128.Narrow(lefClamped, rightClamped);
+ }
+
///
/// Restricts a vector between a minimum and a maximum value.
///
@@ -347,6 +499,864 @@ internal static class Vector128_
public static Vector128 Clamp(Vector128 value, Vector128 min, Vector128 max)
=> Vector128.Min(Vector128.Max(value, min), max);
- [DoesNotReturn]
- private static void ThrowUnreachableException() => throw new UnreachableException();
+ ///
+ /// Multiply packed signed 16-bit integers in and , producing
+ /// intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and
+ /// pack the results.
+ ///
+ ///
+ /// The first vector containing packed signed 16-bit integers to multiply and add.
+ ///
+ ///
+ /// The second vector containing packed signed 16-bit integers to multiply and add.
+ ///
+ ///
+ /// A vector containing the results of multiplying and adding adjacent pairs of packed signed 16-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 MultiplyAddAdjacent(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.MultiplyAddAdjacent(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+ Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right);
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.AddPairwise(prodLo, prodHi);
+ }
+
+ Vector64 v0 = AdvSimd.AddPairwise(prodLo.GetLower(), prodLo.GetUpper());
+ Vector64 v1 = AdvSimd.AddPairwise(prodHi.GetLower(), prodHi.GetUpper());
+ return Vector128.Create(v0, v1);
+ }
+
+ {
+ // Widen each half of the short vectors into two int vectors
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Elementwise multiply: each int lane now holds the full 32-bit product
+ Vector128 prodLo = leftLo * rightLo;
+ Vector128 prodHi = leftHi * rightHi;
+
+ // Extract the low and high parts of the products shuffling them to form a result we can add together.
+ // Use out-of-bounds to zero out the unused lanes.
+ Vector128 v0 = Vector128.Shuffle(prodLo, Vector128.Create(0, 2, 8, 8));
+ Vector128 v1 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 0, 2));
+ Vector128 v2 = Vector128.Shuffle(prodLo, Vector128.Create(1, 3, 8, 8));
+ Vector128 v3 = Vector128.Shuffle(prodHi, Vector128.Create(8, 8, 1, 3));
+
+ return v0 + v1 + v2 + v3;
+ }
+ }
+
+ ///
+ /// Horizontally add adjacent pairs of 16-bit integers in and , and
+ /// pack the signed 16-bit results.
+ ///
+ ///
+ /// The first vector containing packed signed 16-bit integers to add.
+ ///
+ ///
+ /// The second vector containing packed signed 16-bit integers to add.
+ ///
+ ///
+ /// A vector containing the results of horizontally adding adjacent pairs of packed signed 16-bit integers
+ ///
+ public static Vector128 HorizontalAdd(Vector128 left, Vector128 right)
+ {
+ if (Ssse3.IsSupported)
+ {
+ return Ssse3.HorizontalAdd(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.AddPairwise(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ Vector128 v0 = AdvSimd.AddPairwiseWidening(left);
+ Vector128 v1 = AdvSimd.AddPairwiseWidening(right);
+
+ return Vector128.Narrow(v0, v1);
+ }
+
+ {
+ // Gather the even and odd elements of each input, shuffling them to form results we can add together.
+ // Use out-of-bounds to zero out the unused lanes.
+ Vector128 even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
+ Vector128 odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
+ Vector128 v0 = Vector128.Shuffle(right, even);
+ Vector128 v1 = Vector128.Shuffle(right, odd);
+ Vector128 v2 = Vector128.Shuffle(left, even);
+ Vector128 v3 = Vector128.Shuffle(left, odd);
+
+ return v0 + v1 + v2 + v3;
+ }
+ }
+
+ ///
+ /// Multiply the packed 16-bit integers in and , producing
+ /// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
+ ///
+ ///
+ /// The first vector containing packed 16-bit integers to multiply.
+ ///
+ ///
+ /// The second vector containing packed 16-bit integers to multiply.
+ ///
+ ///
+ /// A vector containing the high 16 bits of the products of the packed 16-bit integers
+ /// from and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 MultiplyHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.MultiplyHigh(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+ Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right);
+
+ prodLo >>= 16;
+ prodHi >>= 16;
+
+ return Vector128.Narrow(prodLo, prodHi);
+ }
+
+ {
+ // Widen each half of the short vectors into two int vectors
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Elementwise multiply: each int lane now holds the full 32-bit product
+ Vector128 prodLo = leftLo * rightLo;
+ Vector128 prodHi = leftHi * rightHi;
+
+ // Arithmetic shift right by 16 bits to extract the high word
+ prodLo >>= 16;
+ prodHi >>= 16;
+
+ // Narrow the two int vectors back into one short vector
+ return Vector128.Narrow(prodLo, prodHi);
+ }
+ }
+
+ ///
+ /// Multiply the packed 16-bit unsigned integers in and , producing
+ /// intermediate unsigned 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
+ ///
+ ///
+ /// The first vector containing packed 16-bit unsigned integers to multiply.
+ ///
+ ///
+ /// The second vector containing packed 16-bit unsigned integers to multiply.
+ ///
+ ///
+ /// A vector containing the high 16 bits of the products of the packed 16-bit unsigned integers
+ /// from and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 MultiplyHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.MultiplyHigh(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ Vector128 prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+ Vector128 prodHi = AdvSimd.MultiplyWideningUpper(left, right);
+
+ prodLo >>= 16;
+ prodHi >>= 16;
+
+ return Vector128.Narrow(prodLo, prodHi);
+ }
+
+ {
+ // Widen each half of the short vectors into two uint vectors
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Elementwise multiply: each uint lane now holds the full 32-bit product
+ Vector128 prodLo = leftLo * rightLo;
+ Vector128 prodHi = leftHi * rightHi;
+
+ // Logical shift right by 16 bits to extract the high word (unsigned lanes shift in zeros)
+ prodLo >>= 16;
+ prodHi >>= 16;
+
+ // Narrow the two int vectors back into one short vector
+ return Vector128.Narrow(prodLo, prodHi);
+ }
+ }
+
+ ///
+ /// Unpack and interleave 64-bit integers from the high half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 64-bit integers to unpack from the high half.
+ ///
+ ///
+ /// The second vector containing packed 64-bit integers to unpack from the high half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 64-bit integers from the high
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackHigh(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipHigh(left, right);
+ }
+
+ return Vector128.Create(left.GetUpper(), right.GetUpper());
+ }
+
+ ///
+ /// Unpack and interleave 64-bit integers from the low half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 64-bit integers to unpack from the low half.
+ ///
+ ///
+ /// The second vector containing packed 64-bit integers to unpack from the low half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 64-bit integers from the low
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackLow(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackLow(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipLow(left, right);
+ }
+
+ return Vector128.Create(left.GetLower(), right.GetLower());
+ }
+
+ ///
+ /// Unpack and interleave 32-bit integers from the high half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 32-bit integers to unpack from the high half.
+ ///
+ ///
+ /// The second vector containing packed 32-bit integers to unpack from the high half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 32-bit integers from the high
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackHigh(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipHigh(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+ return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3));
+ }
+
+ ///
+ /// Unpack and interleave 32-bit integers from the low half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 32-bit integers to unpack from the low half.
+ ///
+ ///
+ /// The second vector containing packed 32-bit integers to unpack from the low half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 32-bit integers from the low
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackLow(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackLow(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipLow(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower());
+ return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3));
+ }
+
+ ///
+ /// Unpack and interleave 16-bit integers from the high half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 16-bit integers to unpack from the high half.
+ ///
+ ///
+ /// The second vector containing packed 16-bit integers to unpack from the high half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 16-bit integers from the high
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackHigh(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipHigh(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+ return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
+ }
+
+ ///
+ /// Unpack and interleave 16-bit integers from the low half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 16-bit integers to unpack from the low half.
+ ///
+ ///
+ /// The second vector containing packed 16-bit integers to unpack from the low half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 16-bit integers from the low
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackLow(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackLow(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipLow(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower());
+ return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
+ }
+
+ ///
+ /// Unpack and interleave 8-bit integers from the high half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 8-bit integers to unpack from the high half.
+ ///
+ ///
+ /// The second vector containing packed 8-bit integers to unpack from the high half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 8-bit integers from the high
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackHigh(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipHigh(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+ return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
+ }
+
+ ///
+ /// Unpack and interleave 8-bit integers from the low half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 8-bit integers to unpack from the low half.
+ ///
+ ///
+ /// The second vector containing packed 8-bit integers to unpack from the low half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 8-bit integers from the low
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackLow(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackLow(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipLow(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower());
+ return Vector128.Shuffle(unpacked, Vector128.Create((byte)0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
+ }
+
+ ///
+ /// Unpack and interleave 8-bit signed integers from the high half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 8-bit signed integers to unpack from the high half.
+ ///
+ ///
+ /// The second vector containing packed 8-bit signed integers to unpack from the high half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 8-bit signed integers from the high
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackHigh(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackHigh(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipHigh(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
+ return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
+ }
+
+ ///
+ /// Unpack and interleave 8-bit signed integers from the low half of and
+ /// and store the results in the result.
+ ///
+ ///
+ /// The first vector containing packed 8-bit signed integers to unpack from the low half.
+ ///
+ ///
+ /// The second vector containing packed 8-bit signed integers to unpack from the low half.
+ ///
+ ///
+ /// A vector containing the unpacked and interleaved 8-bit signed integers from the low
+ /// halves of and .
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 UnpackLow(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.UnpackLow(left, right);
+ }
+
+ if (AdvSimd.Arm64.IsSupported)
+ {
+ return AdvSimd.Arm64.ZipLow(left, right);
+ }
+
+ Vector128 unpacked = Vector128.Create(left.GetLower(), right.GetLower());
+ return Vector128.Shuffle(unpacked, Vector128.Create(0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15));
+ }
+
+ ///
+ /// Subtract packed signed 16-bit integers in from packed signed 16-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed signed 16-bit integers to subtract from.
+ ///
+ ///
+ /// The second vector containing packed signed 16-bit integers to subtract.
+ ///
+ ///
+ /// A vector containing the results of subtracting packed signed 16-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 SubtractSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SubtractSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.SubtractSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.SubtractSaturate(left, right);
+ }
+
+ // Widen inputs to 32-bit signed
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Subtract
+ Vector128 diffLo = leftLo - rightLo;
+ Vector128 diffHi = leftHi - rightHi;
+
+ // Clamp to signed 16-bit range
+ Vector128 min = Vector128.Create((int)short.MinValue);
+ Vector128 max = Vector128.Create((int)short.MaxValue);
+
+ diffLo = Clamp(diffLo, min, max);
+ diffHi = Clamp(diffHi, min, max);
+
+ // Narrow back to 16 bit signed.
+ return Vector128.Narrow(diffLo, diffHi);
+ }
+
+ ///
+ /// Subtract packed unsigned 16-bit integers in from packed unsigned 16-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed unsigned 16-bit integers to subtract from.
+ ///
+ ///
+ /// The second vector containing packed unsigned 16-bit integers to subtract.
+ ///
+ ///
+ /// A vector containing the results of subtracting packed unsigned 16-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 SubtractSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SubtractSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.SubtractSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.SubtractSaturate(left, right);
+ }
+
+ // Widen inputs to 32-bit signed
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Subtract
+ Vector128 diffLo = leftLo - rightLo;
+ Vector128 diffHi = leftHi - rightHi;
+
+ // Clamp to signed 16-bit range
+ Vector128 min = Vector128.Create((uint)ushort.MinValue);
+ Vector128 max = Vector128.Create((uint)ushort.MaxValue);
+
+ diffLo = Clamp(diffLo, min, max);
+ diffHi = Clamp(diffHi, min, max);
+
+ // Narrow back to 16 bit signed.
+ return Vector128.Narrow(diffLo, diffHi);
+ }
+
+ ///
+ /// Add packed unsigned 8-bit integers in to packed unsigned 8-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed unsigned 8-bit integers to add to.
+ ///
+ ///
+ /// The second vector containing packed unsigned 8-bit integers to add.
+ ///
+ ///
+ /// A vector containing the results of adding packed unsigned 8-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 AddSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.AddSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.AddSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.AddSaturate(left, right);
+ }
+
+ // Widen inputs to 16-bit
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Add
+ Vector128 sumLo = leftLo + rightLo;
+ Vector128 sumHi = leftHi + rightHi;
+
+ // Clamp to signed 8-bit range
+ Vector128 max = Vector128.Create((ushort)byte.MaxValue);
+
+ sumLo = Clamp(sumLo, Vector128.Zero, max);
+ sumHi = Clamp(sumHi, Vector128.Zero, max);
+
+ // Narrow back to bytes
+ return Vector128.Narrow(sumLo, sumHi);
+ }
+
+ ///
+ /// Add packed unsigned 16-bit integers in to packed unsigned 16-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed unsigned 16-bit integers to add to.
+ ///
+ ///
+ /// The second vector containing packed unsigned 16-bit integers to add.
+ ///
+ ///
+ /// A vector containing the results of adding packed unsigned 16-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 AddSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.AddSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.AddSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.AddSaturate(left, right);
+ }
+
+ // Widen inputs to 32-bit
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Add
+ Vector128 sumLo = leftLo + rightLo;
+ Vector128 sumHi = leftHi + rightHi;
+
+ // Clamp to signed 16-bit range
+ Vector128 max = Vector128.Create((uint)ushort.MaxValue);
+
+ sumLo = Clamp(sumLo, Vector128.Zero, max);
+ sumHi = Clamp(sumHi, Vector128.Zero, max);
+
+ // Narrow back to 16 bit unsigned.
+ return Vector128.Narrow(sumLo, sumHi);
+ }
+
+ ///
+ /// Subtract packed unsigned 8-bit integers in from packed unsigned 8-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed unsigned 8-bit integers to subtract from.
+ ///
+ ///
+ /// The second vector containing packed unsigned 8-bit integers to subtract.
+ ///
+ ///
+ /// A vector containing the results of subtracting packed unsigned 8-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 SubtractSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SubtractSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.SubtractSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.SubtractSaturate(left, right);
+ }
+
+ // Widen inputs to 16-bit
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Subtract
+ Vector128 diffLo = leftLo - rightLo;
+ Vector128 diffHi = leftHi - rightHi;
+
+ // Clamp to signed 8-bit range
+ Vector128 max = Vector128.Create((ushort)byte.MaxValue);
+
+ diffLo = Clamp(diffLo, Vector128.Zero, max);
+ diffHi = Clamp(diffHi, Vector128.Zero, max);
+
+ // Narrow back to bytes
+ return Vector128.Narrow(diffLo, diffHi);
+ }
+
+ ///
+ /// Add packed unsigned 8-bit integers in from packed unsigned 8-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed unsigned 8-bit integers to add to.
+ ///
+ ///
+ /// The second vector containing packed unsigned 8-bit integers to add.
+ ///
+ ///
+ /// A vector containing the results of adding packed unsigned 8-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 AddSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.AddSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.AddSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.AddSaturate(left, right);
+ }
+
+ // Widen inputs to 16-bit
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Add
+ Vector128 sumLo = leftLo + rightLo;
+ Vector128 sumHi = leftHi + rightHi;
+
+ // Clamp to signed 8-bit range
+ Vector128 min = Vector128.Create((short)sbyte.MinValue);
+ Vector128 max = Vector128.Create((short)sbyte.MaxValue);
+
+ sumLo = Clamp(sumLo, min, max);
+ sumHi = Clamp(sumHi, min, max);
+
+ // Narrow back to signed bytes
+ return Vector128.Narrow(sumLo, sumHi);
+ }
+
+ ///
+ /// Subtract packed signed 8-bit integers in from packed signed 8-bit integers
+ /// in using saturation, and store the results.
+ ///
+ ///
+ /// The first vector containing packed signed 8-bit integers to subtract from.
+ ///
+ ///
+ /// The second vector containing packed signed 8-bit integers to subtract.
+ ///
+ ///
+ /// A vector containing the results of subtracting packed signed 8-bit integers
+ ///
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128 SubtractSaturate(Vector128 left, Vector128 right)
+ {
+ if (Sse2.IsSupported)
+ {
+ return Sse2.SubtractSaturate(left, right);
+ }
+
+ if (AdvSimd.IsSupported)
+ {
+ return AdvSimd.SubtractSaturate(left, right);
+ }
+
+ if (PackedSimd.IsSupported)
+ {
+ return PackedSimd.SubtractSaturate(left, right);
+ }
+
+ // Widen inputs to 16-bit
+ (Vector128 leftLo, Vector128 leftHi) = Vector128.Widen(left);
+ (Vector128 rightLo, Vector128 rightHi) = Vector128.Widen(right);
+
+ // Subtract
+ Vector128 diffLo = leftLo - rightLo;
+ Vector128 diffHi = leftHi - rightHi;
+
+ // Clamp to signed 8-bit range
+ Vector128 min = Vector128.Create((short)sbyte.MinValue);
+ Vector128 max = Vector128.Create((short)sbyte.MaxValue);
+
+ diffLo = Clamp(diffLo, min, max);
+ diffHi = Clamp(diffHi, min, max);
+
+ // Narrow back to signed bytes
+ return Vector128.Narrow(diffLo, diffHi);
+ }
}
diff --git a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
index 817d6e607..14ac13dd8 100644
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@@ -1,7 +1,6 @@
// Copyright (c) Six Labors.
// Licensed under the Six Labors Split License.
-using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
@@ -21,24 +20,6 @@ namespace SixLabors.ImageSharp.Common.Helpers;
internal static class Vector256_
#pragma warning restore SA1649 // File name should match first type name
{
- ///
- /// Gets a value indicating whether shuffle byte operations are supported.
- ///
- public static bool SupportsShuffleNativeFloat
- {
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- get => Avx.IsSupported;
- }
-
- ///
- /// Gets a value indicating whether shuffle byte operations are supported.
- ///
- public static bool SupportsShuffleNativeByte
- {
- [MethodImpl(MethodImplOptions.AggressiveInlining)]
- get => Avx2.IsSupported;
- }
-
///
/// Creates a new vector by selecting values from an input vector using a set of indices.
///
@@ -47,15 +28,7 @@ internal static class Vector256_
/// The .
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector256 ShuffleNative(Vector256 vector, [ConstantExpected] byte control)
- {
- if (Avx.IsSupported)
- {
- return Avx.Shuffle(vector, vector, control);
- }
-
- ThrowUnreachableException();
- return default;
- }
+ => Avx.Shuffle(vector, vector, control);
///
/// Creates a new vector by selecting values from an input vector using a set of indices.
@@ -66,15 +39,17 @@ internal static class Vector256_
///
/// The .
[MethodImpl(MethodImplOptions.AggressiveInlining)]
- public static Vector256 ShuffleNative(Vector256 vector, Vector256 indices)
+ public static Vector256