Browse Source

Port TTransformSse41

pull/2933/head
James Jackson-South 12 months ago
parent
commit
c490bc6f66
  1. 17
      src/ImageSharp/Common/Helpers/Numerics.cs
  2. 227
      src/ImageSharp/Common/Helpers/Vector128Utilities.cs
  3. 145
      src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

17
src/ImageSharp/Common/Helpers/Numerics.cs

@ -884,23 +884,6 @@ internal static class Numerics
accumulator += intHigh;
}
/// <summary>
/// Horizontally adds the four 32-bit lanes of a vector.
/// </summary>
/// <param name="accumulator">The vector whose lanes are summed.</param>
/// <returns>The sum of all four lanes.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static int ReduceSum(Vector128<int> accumulator)
{
    // Bring lanes 1 and 3 down onto lanes 0 and 2 so that, after the add,
    // lane 0 holds (e0 + e1) and lane 2 holds (e2 + e3).
    Vector128<int> oddLanes = Sse2.Shuffle(accumulator, 0b_11_11_01_01);
    Vector128<int> pairSums = Sse2.Add(accumulator, oddLanes);

    // Bring the upper half down onto the lower half; lane 0 then holds the full total.
    Vector128<int> upperHalf = Sse2.Shuffle(pairSums, 0b_11_10_11_10);
    Vector128<int> total = Sse2.Add(pairSums, upperHalf);

    return Sse2.ConvertToInt32(total);
}
/// <summary>
/// Reduces elements of the vector into one sum.
/// </summary>

227
src/ImageSharp/Common/Helpers/Vector128Utilities.cs

@ -88,6 +88,30 @@ internal static class Vector128_
return Vector128.Shuffle(vector, indices);
}
/// <summary>
/// Builds a new vector by picking 32-bit lanes from <paramref name="vector"/> according to an
/// 8-bit control value.
/// </summary>
/// <param name="vector">The input vector from which lanes are selected.</param>
/// <param name="control">
/// The shuffle control byte. Each pair of bits, starting at the least significant pair,
/// holds the source lane index for result lanes 0 through 3 respectively.
/// </param>
/// <returns>The shuffled <see cref="Vector128{Int32}"/>.</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public static Vector128<int> ShuffleNative(Vector128<int> vector, [ConstantExpected] byte control)
{
    if (Sse2.IsSupported)
    {
        return Sse2.Shuffle(vector, control);
    }

    // Decode the control byte by hand rather than going through InverseMMShuffle,
    // which would require a cast.
    int lane0 = control & 0x3;
    int lane1 = (control >> 2) & 0x3;
    int lane2 = (control >> 4) & 0x3;
    int lane3 = (control >> 6) & 0x3;

    return Vector128.Shuffle(vector, Vector128.Create(lane0, lane1, lane2, lane3));
}
/// <summary>
/// Creates a new vector by selecting values from an input vector using a set of indices.
/// </summary>
@ -412,6 +436,31 @@ internal static class Vector128_
return Vector128.Narrow(prodLo, prodHi);
}
/// <summary>
/// Multiplies the packed signed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>
/// into 32-bit products, then adds each adjacent pair of products to give four 32-bit sums.
/// </summary>
/// <param name="left">The first vector of packed 16-bit integers.</param>
/// <param name="right">The second vector of packed 16-bit integers.</param>
/// <returns>
/// A vector whose lane <c>i</c> equals
/// <c>left[2i] * right[2i] + left[2i + 1] * right[2i + 1]</c>.
/// </returns>
public static Vector128<int> MultiplyAddAdjacent(Vector128<short> left, Vector128<short> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.MultiplyAddAdjacent(left, right);
    }

    // Widen both operands to 32 bits and multiply, so each int lane holds a full product:
    // lowProducts covers elements 0..3, highProducts covers elements 4..7.
    Vector128<int> lowProducts = Vector128.WidenLower(left) * Vector128.WidenLower(right);
    Vector128<int> highProducts = Vector128.WidenUpper(left) * Vector128.WidenUpper(right);

    // Gather even- and odd-indexed products into the lanes they contribute to.
    // Index 8 is out of range, which makes Shuffle write zero into that lane.
    Vector128<int> evenLow = Vector128.Shuffle(lowProducts, Vector128.Create(0, 2, 8, 8));
    Vector128<int> oddLow = Vector128.Shuffle(lowProducts, Vector128.Create(1, 3, 8, 8));
    Vector128<int> evenHigh = Vector128.Shuffle(highProducts, Vector128.Create(8, 8, 0, 2));
    Vector128<int> oddHigh = Vector128.Shuffle(highProducts, Vector128.Create(8, 8, 1, 3));

    return (evenLow + oddLow) + (evenHigh + oddHigh);
}
/// <summary>
/// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
/// intermediate 32-bit integers, and store the high 16 bits of the intermediate integers in the result.
@ -450,6 +499,184 @@ internal static class Vector128_
return Vector128.Narrow(prodLo, prodHi);
}
/// <summary>
/// Unpack and interleave 64-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 64-bit integers to unpack from the high half.
/// </param>
/// <param name="right">
/// The second vector containing packed 64-bit integers to unpack from the high half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 64-bit integers from the high
/// halves of <paramref name="left"/> and <paramref name="right"/>,
/// i.e. <c>[left[1], right[1]]</c>.
/// </returns>
public static Vector128<long> UnpackHigh(Vector128<long> left, Vector128<long> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.UnpackHigh(left, right);
    }

    // ZipHigh is declared on the nested AdvSimd.Arm64 class, so that class's own
    // IsSupported flag must guard the call; AdvSimd.IsSupported alone does not
    // guarantee the Arm64-only instructions are available.
    if (AdvSimd.Arm64.IsSupported)
    {
        return AdvSimd.Arm64.ZipHigh(left, right);
    }

    // Portable fallback: the upper 64 bits of each input become the two result lanes.
    return Vector128.Create(left.GetUpper(), right.GetUpper());
}
/// <summary>
/// Unpack and interleave 64-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 64-bit integers to unpack from the low half.
/// </param>
/// <param name="right">
/// The second vector containing packed 64-bit integers to unpack from the low half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 64-bit integers from the low
/// halves of <paramref name="left"/> and <paramref name="right"/>,
/// i.e. <c>[left[0], right[0]]</c>.
/// </returns>
public static Vector128<long> UnpackLow(Vector128<long> left, Vector128<long> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.UnpackLow(left, right);
    }

    // ZipLow is declared on the nested AdvSimd.Arm64 class, so that class's own
    // IsSupported flag must guard the call; AdvSimd.IsSupported alone does not
    // guarantee the Arm64-only instructions are available.
    if (AdvSimd.Arm64.IsSupported)
    {
        return AdvSimd.Arm64.ZipLow(left, right);
    }

    // Portable fallback: the lower 64 bits of each input become the two result lanes.
    return Vector128.Create(left.GetLower(), right.GetLower());
}
/// <summary>
/// Unpack and interleave 32-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 32-bit integers to unpack from the high half.
/// </param>
/// <param name="right">
/// The second vector containing packed 32-bit integers to unpack from the high half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 32-bit integers from the high
/// halves of <paramref name="left"/> and <paramref name="right"/>,
/// i.e. <c>[left[2], right[2], left[3], right[3]]</c>.
/// </returns>
public static Vector128<int> UnpackHigh(Vector128<int> left, Vector128<int> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.UnpackHigh(left, right);
    }

    // ZipHigh is declared on the nested AdvSimd.Arm64 class, so that class's own
    // IsSupported flag must guard the call; AdvSimd.IsSupported alone does not
    // guarantee the Arm64-only instructions are available.
    if (AdvSimd.Arm64.IsSupported)
    {
        return AdvSimd.Arm64.ZipHigh(left, right);
    }

    // Portable fallback: concatenate the two upper halves as [l2, l3, r2, r3],
    // then swap the middle lanes to interleave them as [l2, r2, l3, r3].
    Vector128<int> unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
    return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3));
}
/// <summary>
/// Unpack and interleave 32-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 32-bit integers to unpack from the low half.
/// </param>
/// <param name="right">
/// The second vector containing packed 32-bit integers to unpack from the low half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 32-bit integers from the low
/// halves of <paramref name="left"/> and <paramref name="right"/>,
/// i.e. <c>[left[0], right[0], left[1], right[1]]</c>.
/// </returns>
public static Vector128<int> UnpackLow(Vector128<int> left, Vector128<int> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.UnpackLow(left, right);
    }

    // ZipLow is declared on the nested AdvSimd.Arm64 class, so that class's own
    // IsSupported flag must guard the call; AdvSimd.IsSupported alone does not
    // guarantee the Arm64-only instructions are available.
    if (AdvSimd.Arm64.IsSupported)
    {
        return AdvSimd.Arm64.ZipLow(left, right);
    }

    // Portable fallback: concatenate the two lower halves as [l0, l1, r0, r1],
    // then swap the middle lanes to interleave them as [l0, r0, l1, r1].
    Vector128<int> unpacked = Vector128.Create(left.GetLower(), right.GetLower());
    return Vector128.Shuffle(unpacked, Vector128.Create(0, 2, 1, 3));
}
/// <summary>
/// Unpack and interleave 16-bit integers from the high half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 16-bit integers to unpack from the high half.
/// </param>
/// <param name="right">
/// The second vector containing packed 16-bit integers to unpack from the high half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 16-bit integers from the high
/// halves of <paramref name="left"/> and <paramref name="right"/>,
/// i.e. <c>[left[4], right[4], left[5], right[5], left[6], right[6], left[7], right[7]]</c>.
/// </returns>
public static Vector128<short> UnpackHigh(Vector128<short> left, Vector128<short> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.UnpackHigh(left, right);
    }

    // ZipHigh is declared on the nested AdvSimd.Arm64 class, so that class's own
    // IsSupported flag must guard the call; AdvSimd.IsSupported alone does not
    // guarantee the Arm64-only instructions are available.
    if (AdvSimd.Arm64.IsSupported)
    {
        return AdvSimd.Arm64.ZipHigh(left, right);
    }

    // Portable fallback: concatenate the two upper halves as [l4..l7, r4..r7],
    // then alternate between the two groups to interleave them.
    Vector128<short> unpacked = Vector128.Create(left.GetUpper(), right.GetUpper());
    return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
}
/// <summary>
/// Unpack and interleave 16-bit integers from the low half of <paramref name="left"/> and <paramref name="right"/>
/// and store the results in the result.
/// </summary>
/// <param name="left">
/// The first vector containing packed 16-bit integers to unpack from the low half.
/// </param>
/// <param name="right">
/// The second vector containing packed 16-bit integers to unpack from the low half.
/// </param>
/// <returns>
/// A vector containing the unpacked and interleaved 16-bit integers from the low
/// halves of <paramref name="left"/> and <paramref name="right"/>,
/// i.e. <c>[left[0], right[0], left[1], right[1], left[2], right[2], left[3], right[3]]</c>.
/// </returns>
public static Vector128<short> UnpackLow(Vector128<short> left, Vector128<short> right)
{
    if (Sse2.IsSupported)
    {
        return Sse2.UnpackLow(left, right);
    }

    // ZipLow is declared on the nested AdvSimd.Arm64 class, so that class's own
    // IsSupported flag must guard the call; AdvSimd.IsSupported alone does not
    // guarantee the Arm64-only instructions are available.
    if (AdvSimd.Arm64.IsSupported)
    {
        return AdvSimd.Arm64.ZipLow(left, right);
    }

    // Portable fallback: concatenate the two lower halves as [l0..l3, r0..r3],
    // then alternate between the two groups to interleave them.
    Vector128<short> unpacked = Vector128.Create(left.GetLower(), right.GetLower());
    return Vector128.Shuffle(unpacked, Vector128.Create(0, 4, 1, 5, 2, 6, 3, 7));
}
/// <summary>
/// Throws an <see cref="UnreachableException"/>. Never returns normally.
/// </summary>
[DoesNotReturn]
private static void ThrowUnreachableException()
{
    throw new UnreachableException();
}
}

145
src/ImageSharp/Formats/Webp/Lossy/LossyUtils.cs

@ -7,6 +7,7 @@ using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using SixLabors.ImageSharp.Common.Helpers;
// ReSharper disable InconsistentNaming
namespace SixLabors.ImageSharp.Formats.Webp.Lossy;
@ -127,7 +128,7 @@ internal static class LossyUtils
Vector128<int> e1 = Sse2.MultiplyAddAdjacent(d1, d1);
Vector128<int> sum = Sse2.Add(e0, e1);
return Numerics.ReduceSum(sum);
return ReduceSum(sum);
}
if (AdvSimd.IsSupported)
@ -174,12 +175,12 @@ internal static class LossyUtils
Vector128<int> sum1 = SubtractAndAccumulate(a0, b0);
Vector128<int> sum2 = SubtractAndAccumulate(a1, b1);
sum = Sse2.Add(sum, Sse2.Add(sum1, sum2));
sum += sum1 + sum2;
offset += 2 * WebpConstants.Bps;
}
return Numerics.ReduceSum(sum);
return ReduceSum(sum);
}
[MethodImpl(InliningOptions.ShortMethod)]
@ -378,17 +379,16 @@ internal static class LossyUtils
[MethodImpl(InliningOptions.ShortMethod)]
public static int Vp8Disto4X4(Span<byte> a, Span<byte> b, Span<ushort> w, Span<int> scratch)
{
if (Sse41.IsSupported)
if (Vector128.IsHardwareAccelerated)
{
int diffSum = TTransformSse41(a, b, w);
int diffSum = TTransformVector128(a, b, w);
return Math.Abs(diffSum) >> 5;
}
else
{
int sum1 = TTransform(a, w, scratch);
int sum2 = TTransform(b, w, scratch);
return Math.Abs(sum2 - sum1) >> 5;
}
int sum1 = TTransform(a, w, scratch);
int sum2 = TTransform(b, w, scratch);
return Math.Abs(sum2 - sum1) >> 5;
}
public static void DC16(Span<byte> dst, Span<byte> yuv, int offset)
@ -905,7 +905,7 @@ internal static class LossyUtils
/// Returns the weighted sum of the absolute value of transformed coefficients.
/// w[] contains a row-major 4 by 4 symmetric matrix.
/// </summary>
public static int TTransformSse41(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
public static int TTransformVector128(Span<byte> inputA, Span<byte> inputB, Span<ushort> w)
{
// Load and combine inputs.
Vector128<byte> ina0 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputA));
@ -918,14 +918,14 @@ internal static class LossyUtils
Vector128<long> inb3 = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(inputB.Slice(WebpConstants.Bps * 3, 16))).AsInt64();
// Combine inA and inB (we'll do two transforms in parallel).
Vector128<int> inab0 = Sse2.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
Vector128<int> inab1 = Sse2.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
Vector128<int> inab2 = Sse2.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
Vector128<int> inab3 = Sse2.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
Vector128<short> tmp0 = Sse41.ConvertToVector128Int16(inab0.AsByte());
Vector128<short> tmp1 = Sse41.ConvertToVector128Int16(inab1.AsByte());
Vector128<short> tmp2 = Sse41.ConvertToVector128Int16(inab2.AsByte());
Vector128<short> tmp3 = Sse41.ConvertToVector128Int16(inab3.AsByte());
Vector128<int> inab0 = Vector128_.UnpackLow(ina0.AsInt32(), inb0.AsInt32());
Vector128<int> inab1 = Vector128_.UnpackLow(ina1.AsInt32(), inb1.AsInt32());
Vector128<int> inab2 = Vector128_.UnpackLow(ina2.AsInt32(), inb2.AsInt32());
Vector128<int> inab3 = Vector128_.UnpackLow(ina3.AsInt32(), inb3.AsInt32());
Vector128<short> tmp0 = Vector128.WidenLower(inab0.AsByte()).AsInt16();
Vector128<short> tmp1 = Vector128.WidenLower(inab1.AsByte()).AsInt16();
Vector128<short> tmp2 = Vector128.WidenLower(inab2.AsByte()).AsInt16();
Vector128<short> tmp3 = Vector128.WidenLower(inab3.AsByte()).AsInt16();
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
@ -934,14 +934,14 @@ internal static class LossyUtils
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
// Calculate a and b (two 4x4 at once).
Vector128<short> a0 = Sse2.Add(tmp0, tmp2);
Vector128<short> a1 = Sse2.Add(tmp1, tmp3);
Vector128<short> a2 = Sse2.Subtract(tmp1, tmp3);
Vector128<short> a3 = Sse2.Subtract(tmp0, tmp2);
Vector128<short> b0 = Sse2.Add(a0, a1);
Vector128<short> b1 = Sse2.Add(a3, a2);
Vector128<short> b2 = Sse2.Subtract(a3, a2);
Vector128<short> b3 = Sse2.Subtract(a0, a1);
Vector128<short> a0 = tmp0 + tmp2;
Vector128<short> a1 = tmp1 + tmp3;
Vector128<short> a2 = tmp1 - tmp3;
Vector128<short> a3 = tmp0 - tmp2;
Vector128<short> b0 = a0 + a1;
Vector128<short> b1 = a3 + a2;
Vector128<short> b2 = a3 - a2;
Vector128<short> b3 = a0 - a1;
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
@ -959,38 +959,38 @@ internal static class LossyUtils
Vector128<ushort> w8 = Unsafe.As<ushort, Vector128<ushort>>(ref MemoryMarshal.GetReference(w.Slice(8, 8)));
// Calculate a and b (two 4x4 at once).
a0 = Sse2.Add(output0.AsInt16(), output2.AsInt16());
a1 = Sse2.Add(output1.AsInt16(), output3.AsInt16());
a2 = Sse2.Subtract(output1.AsInt16(), output3.AsInt16());
a3 = Sse2.Subtract(output0.AsInt16(), output2.AsInt16());
b0 = Sse2.Add(a0, a1);
b1 = Sse2.Add(a3, a2);
b2 = Sse2.Subtract(a3, a2);
b3 = Sse2.Subtract(a0, a1);
a0 = output0.AsInt16() + output2.AsInt16();
a1 = output1.AsInt16() + output3.AsInt16();
a2 = output1.AsInt16() - output3.AsInt16();
a3 = output0.AsInt16() - output2.AsInt16();
b0 = a0 + a1;
b1 = a3 + a2;
b2 = a3 - a2;
b3 = a0 - a1;
// Separate the transforms of inA and inB.
Vector128<long> ab0 = Sse2.UnpackLow(b0.AsInt64(), b1.AsInt64());
Vector128<long> ab2 = Sse2.UnpackLow(b2.AsInt64(), b3.AsInt64());
Vector128<long> bb0 = Sse2.UnpackHigh(b0.AsInt64(), b1.AsInt64());
Vector128<long> bb2 = Sse2.UnpackHigh(b2.AsInt64(), b3.AsInt64());
Vector128<long> ab0 = Vector128_.UnpackLow(b0.AsInt64(), b1.AsInt64());
Vector128<long> ab2 = Vector128_.UnpackLow(b2.AsInt64(), b3.AsInt64());
Vector128<long> bb0 = Vector128_.UnpackHigh(b0.AsInt64(), b1.AsInt64());
Vector128<long> bb2 = Vector128_.UnpackHigh(b2.AsInt64(), b3.AsInt64());
Vector128<ushort> ab0Abs = Ssse3.Abs(ab0.AsInt16());
Vector128<ushort> ab2Abs = Ssse3.Abs(ab2.AsInt16());
Vector128<ushort> b0Abs = Ssse3.Abs(bb0.AsInt16());
Vector128<ushort> bb2Abs = Ssse3.Abs(bb2.AsInt16());
Vector128<short> ab0Abs = Vector128.Abs(ab0.AsInt16());
Vector128<short> ab2Abs = Vector128.Abs(ab2.AsInt16());
Vector128<short> b0Abs = Vector128.Abs(bb0.AsInt16());
Vector128<short> bb2Abs = Vector128.Abs(bb2.AsInt16());
// weighted sums.
Vector128<int> ab0mulw0 = Sse2.MultiplyAddAdjacent(ab0Abs.AsInt16(), w0.AsInt16());
Vector128<int> ab2mulw8 = Sse2.MultiplyAddAdjacent(ab2Abs.AsInt16(), w8.AsInt16());
Vector128<int> b0mulw0 = Sse2.MultiplyAddAdjacent(b0Abs.AsInt16(), w0.AsInt16());
Vector128<int> bb2mulw8 = Sse2.MultiplyAddAdjacent(bb2Abs.AsInt16(), w8.AsInt16());
Vector128<int> ab0ab2Sum = Sse2.Add(ab0mulw0, ab2mulw8);
Vector128<int> b0w0bb2w8Sum = Sse2.Add(b0mulw0, bb2mulw8);
Vector128<int> ab0mulw0 = Vector128_.MultiplyAddAdjacent(ab0Abs, w0.AsInt16());
Vector128<int> ab2mulw8 = Vector128_.MultiplyAddAdjacent(ab2Abs, w8.AsInt16());
Vector128<int> b0mulw0 = Vector128_.MultiplyAddAdjacent(b0Abs, w0.AsInt16());
Vector128<int> bb2mulw8 = Vector128_.MultiplyAddAdjacent(bb2Abs, w8.AsInt16());
Vector128<int> ab0ab2Sum = ab0mulw0 + ab2mulw8;
Vector128<int> b0w0bb2w8Sum = b0mulw0 + bb2mulw8;
// difference of weighted sums.
Vector128<int> result = Sse2.Subtract(ab0ab2Sum.AsInt32(), b0w0bb2w8Sum.AsInt32());
Vector128<int> result = ab0ab2Sum - b0w0bb2w8Sum;
return Numerics.ReduceSum(result);
return ReduceSum(result);
}
// Transpose two 4x4 16b matrices horizontally stored in registers.
@ -1002,28 +1002,28 @@ internal static class LossyUtils
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
Vector128<short> transpose00 = Sse2.UnpackLow(b0, b1);
Vector128<short> transpose01 = Sse2.UnpackLow(b2, b3);
Vector128<short> transpose02 = Sse2.UnpackHigh(b0, b1);
Vector128<short> transpose03 = Sse2.UnpackHigh(b2, b3);
Vector128<short> transpose00 = Vector128_.UnpackLow(b0, b1);
Vector128<short> transpose01 = Vector128_.UnpackLow(b2, b3);
Vector128<short> transpose02 = Vector128_.UnpackHigh(b0, b1);
Vector128<short> transpose03 = Vector128_.UnpackHigh(b2, b3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
Vector128<int> transpose10 = Sse2.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose11 = Sse2.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose12 = Sse2.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose13 = Sse2.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose10 = Vector128_.UnpackLow(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose11 = Vector128_.UnpackLow(transpose02.AsInt32(), transpose03.AsInt32());
Vector128<int> transpose12 = Vector128_.UnpackHigh(transpose00.AsInt32(), transpose01.AsInt32());
Vector128<int> transpose13 = Vector128_.UnpackHigh(transpose02.AsInt32(), transpose03.AsInt32());
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
output0 = Sse2.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
output1 = Sse2.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
output2 = Sse2.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
output3 = Sse2.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
output0 = Vector128_.UnpackLow(transpose10.AsInt64(), transpose11.AsInt64());
output1 = Vector128_.UnpackHigh(transpose10.AsInt64(), transpose11.AsInt64());
output2 = Vector128_.UnpackLow(transpose12.AsInt64(), transpose13.AsInt64());
output3 = Vector128_.UnpackHigh(transpose12.AsInt64(), transpose13.AsInt64());
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
@ -1910,6 +1910,23 @@ internal static class LossyUtils
// Cost of coding one event with probability 'proba'.
public static int Vp8BitCost(int bit, byte proba) => bit == 0 ? WebpLookupTables.Vp8EntropyCost[proba] : WebpLookupTables.Vp8EntropyCost[255 - proba];
/// <summary>
/// Horizontally adds the four 32-bit lanes of a vector.
/// </summary>
/// <param name="accumulator">The vector whose lanes are summed.</param>
/// <returns>The sum of all four lanes.</returns>
[MethodImpl(InliningOptions.ShortMethod)]
private static int ReduceSum(Vector128<int> accumulator)
{
    // Bring lanes 1 and 3 down onto lanes 0 and 2 so that, after the add,
    // lane 0 holds (e0 + e1) and lane 2 holds (e2 + e3).
    Vector128<int> oddLanes = Vector128_.ShuffleNative(accumulator, 0b_11_11_01_01);
    Vector128<int> pairSums = accumulator + oddLanes;

    // Bring the upper half down onto the lower half; lane 0 then holds the full total.
    Vector128<int> upperHalf = Vector128_.ShuffleNative(pairSums, 0b_11_10_11_10);

    return (pairSums + upperHalf).ToScalar();
}
[MethodImpl(InliningOptions.ShortMethod)]
private static void Put16(int v, Span<byte> dst)
{

Loading…
Cancel
Save