diff --git a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
index bfd237a2d7..50eeb8e0a7 100644
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@@ -99,6 +99,33 @@ internal static class Vector128_
return Vector128.Shuffle(vector, indices);
}
+ /// <summary>
+ /// Shuffle 16-bit integers in the high 64 bits of <paramref name="value"/> using the control in <paramref name="control"/>.
+ /// Store the results in the high 64 bits of the destination, with the low 64 bits being copied from <paramref name="value"/>.
+ /// </summary>
+ /// <remarks>
+ /// <paramref name="control"/> is read as four 2-bit fields, least significant first. Field <c>i</c> selects which of the
+ /// four upper 16-bit lanes of <paramref name="value"/> is written to upper lane <c>i</c> of the result.
+ /// </remarks>
+ /// <param name="value">The input vector containing packed 16-bit integers to shuffle.</param>
+ /// <param name="control">The shuffle control byte.</param>
+ /// <returns>
+ /// A vector containing the shuffled 16-bit integers in the high 64 bits, with the low 64 bits copied from <paramref name="value"/>.
+ /// </returns>
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static Vector128<short> ShuffleHigh(Vector128<short> value, [ConstantExpected] byte control)
+ {
+ // Prefer the dedicated SSE2 instruction when the hardware provides it.
+ if (Sse2.IsSupported)
+ {
+ return Sse2.ShuffleHigh(value, control);
+ }
+
+ // Portable fallback: unpack the four 2-bit selectors into per-lane indices (each in 0..3).
+ // Don't use InverseMMShuffle here as we want to avoid the cast.
+ Vector64<short> indices = Vector64.Create(
+ (short)(control & 0x3),
+ (short)((control >> 2) & 0x3),
+ (short)((control >> 4) & 0x3),
+ (short)((control >> 6) & 0x3));
+
+ // Only the upper half is permuted; the lower half passes through untouched.
+ return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices));
+ }
+
///
/// Creates a new vector by selecting values from an input vector using a set of indices.
///
diff --git a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
index fd8d48dd00..72420a0947 100644
--- a/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
+++ b/src/ImageSharp/Formats/Webp/Lossy/Vp8Encoding.cs
@@ -2,10 +2,11 @@
// Licensed under the Six Labors Split License.
using System.Buffers.Binary;
+using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
-using System.Runtime.Intrinsics.X86;
+using SixLabors.ImageSharp.Common.Helpers;
namespace SixLabors.ImageSharp.Formats.Webp.Lossy;
@@ -78,7 +79,7 @@ internal static unsafe class Vp8Encoding
// Does two inverse transforms.
public static void ITransformTwo(Span reference, Span input, Span dst, Span scratch)
{
- if (Sse2.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
@@ -116,10 +117,10 @@ internal static unsafe class Vp8Encoding
Vector128 inb2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 24)), 0);
Vector128 inb3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref inputRef, 28)), 0);
- in0 = Sse2.UnpackLow(in0, inb0);
- in1 = Sse2.UnpackLow(in1, inb1);
- in2 = Sse2.UnpackLow(in2, inb2);
- in3 = Sse2.UnpackLow(in3, inb3);
+ in0 = Vector128_.UnpackLow(in0, inb0);
+ in1 = Vector128_.UnpackLow(in1, inb1);
+ in2 = Vector128_.UnpackLow(in2, inb2);
+ in3 = Vector128_.UnpackLow(in3, inb3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
@@ -128,49 +129,45 @@ internal static unsafe class Vp8Encoding
// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
- InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3);
+ InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3);
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
- InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3);
+ InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
// Add inverse transform to 'ref' and store.
// Load the reference(s).
- Vector128 ref0 = Vector128.Zero;
- Vector128 ref1 = Vector128.Zero;
- Vector128 ref2 = Vector128.Zero;
- Vector128 ref3 = Vector128.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
// Load eight bytes/pixels per line.
- ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte();
- ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
- ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
- ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
+ Vector128 ref0 = Vector128.Create(Unsafe.As(ref referenceRef), 0).AsByte();
+ Vector128 ref1 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps)), 0).AsByte();
+ Vector128 ref2 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2)), 0).AsByte();
+ Vector128 ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0).AsByte();
// Convert to 16b.
- ref0 = Sse2.UnpackLow(ref0, Vector128.Zero);
- ref1 = Sse2.UnpackLow(ref1, Vector128.Zero);
- ref2 = Sse2.UnpackLow(ref2, Vector128.Zero);
- ref3 = Sse2.UnpackLow(ref3, Vector128.Zero);
+ ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero);
+ ref1 = Vector128_.UnpackLow(ref1, Vector128.Zero);
+ ref2 = Vector128_.UnpackLow(ref2, Vector128.Zero);
+ ref3 = Vector128_.UnpackLow(ref3, Vector128.Zero);
// Add the inverse transform(s).
- Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
- Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
- Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
- Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+ Vector128 ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
+ Vector128 ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
+ Vector128 ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
+ Vector128 ref3InvAdded = ref3.AsInt16() + t3.AsInt16();
// Unsigned saturate to 8b.
- ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
- ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
- ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
- ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+ ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+ ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+ ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+ ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
// Store eight bytes/pixels per line.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
@@ -188,7 +185,7 @@ internal static unsafe class Vp8Encoding
public static void ITransformOne(Span reference, Span input, Span dst, Span scratch)
{
- if (Sse2.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
// Load and concatenate the transform coefficients (we'll do two inverse
// transforms in parallel). In the case of only one inverse transform, the
@@ -207,58 +204,54 @@ internal static unsafe class Vp8Encoding
// Vertical pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
- InverseTransformVerticalPass(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3);
+ InverseTransformVerticalPassVector128(in0, in2, in1, in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16bVector128(tmp0, tmp1, tmp2, tmp3, out Vector128 t0, out Vector128 t1, out Vector128 t2, out Vector128 t3);
// Horizontal pass and subsequent transpose.
// First pass, c and d calculations are longer because of the "trick" multiplications.
- InverseTransformHorizontalPass(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3);
+ InverseTransformHorizontalPassVector128(t0, t2, t1, t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3);
// Transpose the two 4x4.
LossyUtils.Vp8Transpose_2_4x4_16bVector128(shifted0, shifted1, shifted2, shifted3, out t0, out t1, out t2, out t3);
// Add inverse transform to 'ref' and store.
// Load the reference(s).
- Vector128 ref0 = Vector128.Zero;
- Vector128 ref1 = Vector128.Zero;
- Vector128 ref2 = Vector128.Zero;
- Vector128 ref3 = Vector128.Zero;
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
// Load four bytes/pixels per line.
- ref0 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref referenceRef)).AsByte();
- ref1 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
- ref2 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
- ref3 = Sse2.ConvertScalarToVector128Int32(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
+ Vector128 ref0 = Vector128.CreateScalar(Unsafe.As(ref referenceRef)).AsByte();
+ Vector128 ref1 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps))).AsByte();
+ Vector128 ref2 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 2))).AsByte();
+ Vector128 ref3 = Vector128.CreateScalar(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3))).AsByte();
// Convert to 16b.
- ref0 = Sse2.UnpackLow(ref0, Vector128.Zero);
- ref1 = Sse2.UnpackLow(ref1, Vector128.Zero);
- ref2 = Sse2.UnpackLow(ref2, Vector128.Zero);
- ref3 = Sse2.UnpackLow(ref3, Vector128.Zero);
+ ref0 = Vector128_.UnpackLow(ref0, Vector128.Zero);
+ ref1 = Vector128_.UnpackLow(ref1, Vector128.Zero);
+ ref2 = Vector128_.UnpackLow(ref2, Vector128.Zero);
+ ref3 = Vector128_.UnpackLow(ref3, Vector128.Zero);
// Add the inverse transform(s).
- Vector128 ref0InvAdded = Sse2.Add(ref0.AsInt16(), t0.AsInt16());
- Vector128 ref1InvAdded = Sse2.Add(ref1.AsInt16(), t1.AsInt16());
- Vector128 ref2InvAdded = Sse2.Add(ref2.AsInt16(), t2.AsInt16());
- Vector128 ref3InvAdded = Sse2.Add(ref3.AsInt16(), t3.AsInt16());
+ Vector128 ref0InvAdded = ref0.AsInt16() + t0.AsInt16();
+ Vector128 ref1InvAdded = ref1.AsInt16() + t1.AsInt16();
+ Vector128 ref2InvAdded = ref2.AsInt16() + t2.AsInt16();
+ Vector128 ref3InvAdded = ref3.AsInt16() + t3.AsInt16();
// Unsigned saturate to 8b.
- ref0 = Sse2.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
- ref1 = Sse2.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
- ref2 = Sse2.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
- ref3 = Sse2.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
+ ref0 = Vector128_.PackUnsignedSaturate(ref0InvAdded, ref0InvAdded);
+ ref1 = Vector128_.PackUnsignedSaturate(ref1InvAdded, ref1InvAdded);
+ ref2 = Vector128_.PackUnsignedSaturate(ref2InvAdded, ref2InvAdded);
+ ref3 = Vector128_.PackUnsignedSaturate(ref3InvAdded, ref3InvAdded);
// Unsigned saturate to 8b.
ref byte outputRef = ref MemoryMarshal.GetReference(dst);
// Store four bytes/pixels per line.
- int output0 = Sse2.ConvertToInt32(ref0.AsInt32());
- int output1 = Sse2.ConvertToInt32(ref1.AsInt32());
- int output2 = Sse2.ConvertToInt32(ref2.AsInt32());
- int output3 = Sse2.ConvertToInt32(ref3.AsInt32());
+ int output0 = ref0.AsInt32().ToScalar();
+ int output1 = ref1.AsInt32().ToScalar();
+ int output2 = ref2.AsInt32().ToScalar();
+ int output3 = ref3.AsInt32().ToScalar();
Unsafe.As(ref outputRef) = output0;
Unsafe.As(ref Unsafe.Add(ref outputRef, WebpConstants.Bps)) = output1;
@@ -302,72 +295,72 @@ internal static unsafe class Vp8Encoding
}
}
- private static void InverseTransformVerticalPass(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3)
+ /// <summary>
+ /// Vertical pass of the inverse transform: combines the four input vectors into four output vectors
+ /// using add/subtract terms (a, b) and constant-multiply terms (c, d).
+ /// </summary>
+ /// <remarks>
+ /// Note the parameter order (in0, in2, in1, in3): in0/in2 feed the a/b terms, in1/in3 feed the c/d terms.
+ /// All inputs are reinterpreted as 16-bit lanes via AsInt16(). No rounding or shift is applied in this pass.
+ /// NOTE(review): Vector128_.MultiplyHigh is defined outside this diff; it replaces Sse2.MultiplyHigh here,
+ /// so presumably it keeps the high 16 bits of each signed 16x16 product - confirm against the helper.
+ /// </remarks>
+ private static void InverseTransformVerticalPassVector128(Vector128 in0, Vector128 in2, Vector128 in1, Vector128 in3, out Vector128 tmp0, out Vector128 tmp1, out Vector128 tmp2, out Vector128 tmp3)
{
- Vector128 a = Sse2.Add(in0.AsInt16(), in2.AsInt16());
- Vector128 b = Sse2.Subtract(in0.AsInt16(), in2.AsInt16());
+ Vector128 a = in0.AsInt16() + in2.AsInt16();
+ Vector128 b = in0.AsInt16() - in2.AsInt16();
Vector128 k1 = Vector128.Create((short)20091).AsInt16();
Vector128 k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
- Vector128 c1 = Sse2.MultiplyHigh(in1.AsInt16(), k2);
- Vector128 c2 = Sse2.MultiplyHigh(in3.AsInt16(), k1);
- Vector128 c3 = Sse2.Subtract(in1.AsInt16(), in3.AsInt16());
- Vector128 c4 = Sse2.Subtract(c1, c2);
- Vector128 c = Sse2.Add(c3, c4);
+ Vector128 c1 = Vector128_.MultiplyHigh(in1.AsInt16(), k2);
+ Vector128 c2 = Vector128_.MultiplyHigh(in3.AsInt16(), k1);
+ Vector128 c3 = in1.AsInt16() - in3.AsInt16(); // the "+ in1 - in3" correction term from the formula above
+ Vector128 c4 = c1 - c2;
+ Vector128 c = c3 + c4;
// d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
- Vector128 d1 = Sse2.MultiplyHigh(in1.AsInt16(), k1);
- Vector128 d2 = Sse2.MultiplyHigh(in3.AsInt16(), k2);
- Vector128 d3 = Sse2.Add(in1.AsInt16(), in3.AsInt16());
- Vector128 d4 = Sse2.Add(d1, d2);
- Vector128 d = Sse2.Add(d3, d4);
+ Vector128 d1 = Vector128_.MultiplyHigh(in1.AsInt16(), k1);
+ Vector128 d2 = Vector128_.MultiplyHigh(in3.AsInt16(), k2);
+ Vector128 d3 = in1.AsInt16() + in3.AsInt16(); // the "+ in1 + in3" correction term from the formula above
+ Vector128 d4 = d1 + d2;
+ Vector128 d = d3 + d4;
// Second pass.
- tmp0 = Sse2.Add(a, d);
- tmp1 = Sse2.Add(b, c);
- tmp2 = Sse2.Subtract(b, c);
- tmp3 = Sse2.Subtract(a, d);
+ tmp0 = a + d;
+ tmp1 = b + c;
+ tmp2 = b - c;
+ tmp3 = a - d;
}
- private static void InverseTransformHorizontalPass(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3)
+ /// <summary>
+ /// Horizontal pass of the inverse transform: same a/b/c/d combination as the vertical pass, followed by
+ /// a rounding arithmetic right shift by 3 on each of the four results.
+ /// </summary>
+ /// <remarks>
+ /// Note the parameter order (t0, t2, t1, t3): t0/t2 feed the a/b terms, t1/t3 feed the c/d terms.
+ /// The constant 4 added to t0 reaches every output through a or b, so the final shift by 3 rounds to nearest.
+ /// NOTE(review): Vector128_.MultiplyHigh is defined outside this diff; it replaces Sse2.MultiplyHigh here,
+ /// so presumably it keeps the high 16 bits of each signed 16x16 product - confirm against the helper.
+ /// </remarks>
+ private static void InverseTransformHorizontalPassVector128(Vector128 t0, Vector128 t2, Vector128 t1, Vector128 t3, out Vector128 shifted0, out Vector128 shifted1, out Vector128 shifted2, out Vector128 shifted3)
{
- Vector128 dc = Sse2.Add(t0.AsInt16(), Vector128.Create((short)4));
- Vector128 a = Sse2.Add(dc, t2.AsInt16());
- Vector128 b = Sse2.Subtract(dc, t2.AsInt16());
+ Vector128 dc = t0.AsInt16() + Vector128.Create((short)4); // +4 is the rounding bias for the final shift by 3
+ Vector128 a = dc + t2.AsInt16();
+ Vector128 b = dc - t2.AsInt16();
Vector128 k1 = Vector128.Create((short)20091).AsInt16();
Vector128 k2 = Vector128.Create((short)-30068).AsInt16();
// c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
- Vector128 c1 = Sse2.MultiplyHigh(t1.AsInt16(), k2);
- Vector128 c2 = Sse2.MultiplyHigh(t3.AsInt16(), k1);
- Vector128 c3 = Sse2.Subtract(t1.AsInt16(), t3.AsInt16());
- Vector128 c4 = Sse2.Subtract(c1, c2);
- Vector128 c = Sse2.Add(c3, c4);
+ Vector128 c1 = Vector128_.MultiplyHigh(t1.AsInt16(), k2);
+ Vector128 c2 = Vector128_.MultiplyHigh(t3.AsInt16(), k1);
+ Vector128 c3 = t1.AsInt16() - t3.AsInt16(); // the "+ T1 - T3" correction term from the formula above
+ Vector128 c4 = c1 - c2;
+ Vector128 c = c3 + c4;
// d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
- Vector128 d1 = Sse2.MultiplyHigh(t1.AsInt16(), k1);
- Vector128 d2 = Sse2.MultiplyHigh(t3.AsInt16(), k2);
- Vector128 d3 = Sse2.Add(t1.AsInt16(), t3.AsInt16());
- Vector128 d4 = Sse2.Add(d1, d2);
- Vector128 d = Sse2.Add(d3, d4);
+ Vector128 d1 = Vector128_.MultiplyHigh(t1.AsInt16(), k1);
+ Vector128 d2 = Vector128_.MultiplyHigh(t3.AsInt16(), k2);
+ Vector128 d3 = t1.AsInt16() + t3.AsInt16(); // the "+ T1 + T3" correction term from the formula above
+ Vector128 d4 = d1 + d2;
+ Vector128 d = d3 + d4;
// Second pass.
- Vector128 tmp0 = Sse2.Add(a, d);
- Vector128 tmp1 = Sse2.Add(b, c);
- Vector128 tmp2 = Sse2.Subtract(b, c);
- Vector128 tmp3 = Sse2.Subtract(a, d);
- shifted0 = Sse2.ShiftRightArithmetic(tmp0, 3);
- shifted1 = Sse2.ShiftRightArithmetic(tmp1, 3);
- shifted2 = Sse2.ShiftRightArithmetic(tmp2, 3);
- shifted3 = Sse2.ShiftRightArithmetic(tmp3, 3);
+ Vector128 tmp0 = a + d;
+ Vector128 tmp1 = b + c;
+ Vector128 tmp2 = b - c;
+ Vector128 tmp3 = a - d;
+ shifted0 = Vector128.ShiftRightArithmetic(tmp0, 3);
+ shifted1 = Vector128.ShiftRightArithmetic(tmp1, 3);
+ shifted2 = Vector128.ShiftRightArithmetic(tmp2, 3);
+ shifted3 = Vector128.ShiftRightArithmetic(tmp3, 3);
}
public static void FTransform2(Span src, Span reference, Span output, Span output2, Span scratch)
{
- if (Sse2.IsSupported)
+ if (Vector128.IsHardwareAccelerated)
{
ref byte srcRef = ref MemoryMarshal.GetReference(src);
ref byte referenceRef = ref MemoryMarshal.GetReference(reference);
@@ -385,38 +378,38 @@ internal static unsafe class Vp8Encoding
Vector128 ref3 = Vector128.Create(Unsafe.As(ref Unsafe.Add(ref referenceRef, WebpConstants.Bps * 3)), 0);
// Convert both to 16 bit.
- Vector128 srcLow0 = Sse2.UnpackLow(src0.AsByte(), Vector128.Zero);
- Vector128 srcLow1 = Sse2.UnpackLow(src1.AsByte(), Vector128.Zero);
- Vector128 srcLow2 = Sse2.UnpackLow(src2.AsByte(), Vector128.Zero);
- Vector128 srcLow3 = Sse2.UnpackLow(src3.AsByte(), Vector128.Zero);
- Vector128 refLow0 = Sse2.UnpackLow(ref0.AsByte(), Vector128.Zero);
- Vector128 refLow1 = Sse2.UnpackLow(ref1.AsByte(), Vector128.Zero);
- Vector128 refLow2 = Sse2.UnpackLow(ref2.AsByte(), Vector128.Zero);
- Vector128 refLow3 = Sse2.UnpackLow(ref3.AsByte(), Vector128.Zero);
+ Vector128 srcLow0 = Vector128_.UnpackLow(src0.AsByte(), Vector128.Zero);
+ Vector128 srcLow1 = Vector128_.UnpackLow(src1.AsByte(), Vector128.Zero);
+ Vector128 srcLow2 = Vector128_.UnpackLow(src2.AsByte(), Vector128.Zero);
+ Vector128 srcLow3 = Vector128_.UnpackLow(src3.AsByte(), Vector128.Zero);
+ Vector128