Update based on feedback

11 months ago · 362707343f
6 changed files with 125 additions and 217 deletions
--- a/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.HwIntrinsics.cs
@ -375,6 +375,11 @@ internal static partial class SimdUtils
            }
            else if (Vector256.IsHardwareAccelerated)
            {
+                // ShufflePerLane performs per-128-bit-lane shuffling using Avx2.Shuffle (vpshufb).
+                // MMShuffleSpan generates indices in the range [0, 31] and never sets bit 7 in any byte,
+                // so the shuffle will not zero elements. Because vpshufb uses only the low 4 bits (b[i] & 0x0F)
+                // for indexing within each lane, and ignores the upper bits unless bit 7 is set,
+                // this usage is guaranteed to remain within-lane and non-zeroing.
                Span<byte> temp = stackalloc byte[Vector256<byte>.Count];
                Shuffle.MMShuffleSpan(ref temp, control);
                Vector256<byte> mask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
@ -391,17 +396,17 @@ internal static partial class SimdUtils
                    ref Vector256<byte> vs0 = ref Unsafe.Add(ref sourceBase, i);
                    ref Vector256<byte> vd0 = ref Unsafe.Add(ref destinationBase, i);

-                    vd0 = Vector256_.ShuffleNative(vs0, mask);
-                    Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)1), mask);
-                    Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)2), mask);
-                    Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShuffleNative(Unsafe.Add(ref vs0, (nuint)3), mask);
+                    vd0 = Vector256_.ShufflePerLane(vs0, mask);
+                    Unsafe.Add(ref vd0, (nuint)1) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)1), mask);
+                    Unsafe.Add(ref vd0, (nuint)2) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)2), mask);
+                    Unsafe.Add(ref vd0, (nuint)3) = Vector256_.ShufflePerLane(Unsafe.Add(ref vs0, (nuint)3), mask);
                }

                if (m > 0)
                {
                    for (nuint i = u; i < n; i++)
                    {
-                        Unsafe.Add(ref destinationBase, i) = Vector256_.ShuffleNative(Unsafe.Add(ref sourceBase, i), mask);
+                        Unsafe.Add(ref destinationBase, i) = Vector256_.ShufflePerLane(Unsafe.Add(ref sourceBase, i), mask);
                    }
                }
            }
--- a/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector128Utilities.cs
@ -47,8 +47,10 @@ internal static class Vector128_
            return AdvSimd.FusedAddRoundedHalving(left, right);
        }

-        // Portable fallback: (a + b + 1) >> 1
-        return (left + right + Vector128.Create((byte)1)) >> 1;
+        // Account for potential 9th bit to ensure correct rounded result.
+        return Vector128.Narrow(
+            (Vector128.WidenLower(left) + Vector128.WidenLower(right) + Vector128<ushort>.One) >> 1,
+            (Vector128.WidenUpper(left) + Vector128.WidenUpper(right) + Vector128<ushort>.One) >> 1);
    }

    /// <summary>
@ -117,13 +119,17 @@ internal static class Vector128_
        }

        // Don't use InverseMMShuffle here as we want to avoid the cast.
-        Vector64<short> indices = Vector64.Create(
-            (short)(control & 0x3),
-            (short)((control >> 2) & 0x3),
-            (short)((control >> 4) & 0x3),
-            (short)((control >> 6) & 0x3));
-
-        return Vector128.Create(value.GetLower(), Vector64.Shuffle(value.GetUpper(), indices));
+        Vector128<short> indices = Vector128.Create(
+           0,
+           1,
+           2,
+           3,
+           (short)((control & 0x3) + 4),
+           (short)(((control >> 2) & 0x3) + 4),
+           (short)(((control >> 4) & 0x3) + 4),
+           (short)(((control >> 6) & 0x3) + 4));
+
+        return Vector128.Shuffle(value, indices);
    }

    /// <summary>
@ -144,13 +150,17 @@ internal static class Vector128_
        }

        // Don't use InverseMMShuffle here as we want to avoid the cast.
-        Vector64<short> indices = Vector64.Create(
-            (short)(control & 0x3),
-            (short)((control >> 2) & 0x3),
-            (short)((control >> 4) & 0x3),
-            (short)((control >> 6) & 0x3));
-
-        return Vector128.Create(Vector64.Shuffle(value.GetLower(), indices), value.GetUpper());
+        Vector128<short> indices = Vector128.Create(
+           (short)(control & 0x3),
+           (short)((control >> 2) & 0x3),
+           (short)((control >> 4) & 0x3),
+           (short)((control >> 6) & 0x3),
+           4,
+           5,
+           6,
+           7);
+
+        return Vector128.Shuffle(value, indices);
    }

    /// <summary>
@ -237,28 +247,13 @@ internal static class Vector128_
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static Vector128<short> ShiftLeftLogical(Vector128<short> value, [ConstantExpected] byte count)
    {
-        if (Sse2.IsSupported)
-        {
-            return Sse2.ShiftLeftLogical(value, count);
-        }
-
        // Zero lanes where count >= 16 to match SSE2
        if (count >= 16)
        {
            return Vector128<short>.Zero;
        }

-        if (AdvSimd.IsSupported)
-        {
-            return AdvSimd.ShiftLogical(value, Vector128.Create((short)count));
-        }
-
-        if (PackedSimd.IsSupported)
-        {
-            return PackedSimd.ShiftLeft(value, count);
-        }
-
-        return Vector128.ShiftLeft(value, count);
+        return value << count;
    }

    /// <summary>
@ -536,6 +531,11 @@ internal static class Vector128_
            Vector128<int> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
            Vector128<int> prodHi = AdvSimd.MultiplyWideningLower(left.GetUpper(), right.GetUpper());

+            if (AdvSimd.Arm64.IsSupported)
+            {
+                return AdvSimd.Arm64.AddPairwise(prodLo, prodHi);
+            }
+
            Vector128<long> v0 = AdvSimd.AddPairwiseWidening(prodLo);
            Vector128<long> v1 = AdvSimd.AddPairwiseWidening(prodHi);

@ -587,50 +587,26 @@ internal static class Vector128_
            return AdvSimd.Arm64.AddPairwise(left, right);
        }

-        // Extract the low and high parts of the products shuffling them to form a result we can add together.
-        // Use out-of-bounds to zero out the unused lanes.
-        Vector128<short> even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
-        Vector128<short> odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
-        Vector128<short> v0 = Vector128.Shuffle(right, even);
-        Vector128<short> v1 = Vector128.Shuffle(right, odd);
-        Vector128<short> v2 = Vector128.Shuffle(left, even);
-        Vector128<short> v3 = Vector128.Shuffle(left, odd);
-
-        return v0 + v1 + v2 + v3;
-    }
-
-    /// <summary>
-    /// Multiply the packed 16-bit integers in <paramref name="left"/> and <paramref name="right"/>, producing
-    /// intermediate 32-bit integers, and store the low 16 bits of the intermediate integers in the result.
-    /// </summary>
-    /// <param name="left">
-    /// The first vector containing packed 16-bit integers to multiply.
-    /// </param>
-    /// <param name="right">
-    /// The second vector containing packed 16-bit integers to multiply.
-    /// </param>
-    /// <returns>
-    /// A vector containing the low 16 bits of the products of the packed 16-bit integers
-    /// from <paramref name="left"/> and <paramref name="right"/>.
-    /// </returns>
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector128<short> MultiplyLow(Vector128<short> left, Vector128<short> right)
-    {
-        if (Sse2.IsSupported)
+        if (AdvSimd.IsSupported)
        {
-            return Sse2.MultiplyLow(left, right);
-        }
+            Vector128<int> v0 = AdvSimd.AddPairwiseWidening(left);
+            Vector128<int> v1 = AdvSimd.AddPairwiseWidening(right);

-        // Widen each half of the short vectors into two int vectors
-        (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
-        (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
+            return Vector128.Narrow(v0, v1);
+        }

-        // Elementwise multiply: each int lane now holds the full 32-bit product
-        Vector128<int> prodLo = leftLo * rightLo;
-        Vector128<int> prodHi = leftHi * rightHi;
+        {
+            // Extract the low and high parts of the products shuffling them to form a result we can add together.
+            // Use out-of-bounds to zero out the unused lanes.
+            Vector128<short> even = Vector128.Create(0, 2, 4, 6, 8, 8, 8, 8);
+            Vector128<short> odd = Vector128.Create(1, 3, 5, 7, 8, 8, 8, 8);
+            Vector128<short> v0 = Vector128.Shuffle(right, even);
+            Vector128<short> v1 = Vector128.Shuffle(right, odd);
+            Vector128<short> v2 = Vector128.Shuffle(left, even);
+            Vector128<short> v3 = Vector128.Shuffle(left, odd);

-        // Narrow the two int vectors back into one short vector
-        return Vector128.Narrow(prodLo, prodHi);
+            return v0 + v1 + v2 + v3;
+        }
    }

    /// <summary>
@ -655,20 +631,33 @@ internal static class Vector128_
            return Sse2.MultiplyHigh(left, right);
        }

-        // Widen each half of the short vectors into two int vectors
-        (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
-        (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);
+        if (AdvSimd.IsSupported)
+        {
+            Vector128<int> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+            Vector128<int> prodHi = AdvSimd.MultiplyWideningUpper(left, right);
+
+            prodLo >>= 16;
+            prodHi >>= 16;
+
+            return Vector128.Narrow(prodLo, prodHi);
+        }
+
+        {
+            // Widen each half of the short vectors into two int vectors
+            (Vector128<int> leftLo, Vector128<int> leftHi) = Vector128.Widen(left);
+            (Vector128<int> rightLo, Vector128<int> rightHi) = Vector128.Widen(right);

-        // Elementwise multiply: each int lane now holds the full 32-bit product
-        Vector128<int> prodLo = leftLo * rightLo;
-        Vector128<int> prodHi = leftHi * rightHi;
+            // Elementwise multiply: each int lane now holds the full 32-bit product
+            Vector128<int> prodLo = leftLo * rightLo;
+            Vector128<int> prodHi = leftHi * rightHi;

-        // Arithmetic shift right by 16 bits to extract the high word
-        prodLo >>= 16;
-        prodHi >>= 16;
+            // Arithmetic shift right by 16 bits to extract the high word
+            prodLo >>= 16;
+            prodHi >>= 16;

-        // Narrow the two int vectors back into one short vector
-        return Vector128.Narrow(prodLo, prodHi);
+            // Narrow the two int vectors back into one short vector
+            return Vector128.Narrow(prodLo, prodHi);
+        }
    }

    /// <summary>
@ -693,20 +682,33 @@ internal static class Vector128_
            return Sse2.MultiplyHigh(left, right);
        }

-        // Widen each half of the short vectors into two uint vectors
-        (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
-        (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);
+        if (AdvSimd.IsSupported)
+        {
+            Vector128<uint> prodLo = AdvSimd.MultiplyWideningLower(left.GetLower(), right.GetLower());
+            Vector128<uint> prodHi = AdvSimd.MultiplyWideningUpper(left, right);

-        // Elementwise multiply: each int lane now holds the full 32-bit product
-        Vector128<uint> prodLo = leftLo * rightLo;
-        Vector128<uint> prodHi = leftHi * rightHi;
+            prodLo >>= 16;
+            prodHi >>= 16;

-        // Arithmetic shift right by 16 bits to extract the high word
-        prodLo >>= 16;
-        prodHi >>= 16;
+            return Vector128.Narrow(prodLo, prodHi);
+        }
+
+        {
+            // Widen each half of the short vectors into two uint vectors
+            (Vector128<uint> leftLo, Vector128<uint> leftHi) = Vector128.Widen(left);
+            (Vector128<uint> rightLo, Vector128<uint> rightHi) = Vector128.Widen(right);

-        // Narrow the two int vectors back into one short vector
-        return Vector128.Narrow(prodLo, prodHi);
+            // Elementwise multiply: each uint lane now holds the full 32-bit product
+            Vector128<uint> prodLo = leftLo * rightLo;
+            Vector128<uint> prodHi = leftHi * rightHi;
+
+            // Logical shift right by 16 bits to extract the high word
+            prodLo >>= 16;
+            prodHi >>= 16;
+
+            // Narrow the two uint vectors back into one ushort vector
+            return Vector128.Narrow(prodLo, prodHi);
+        }
    }

    /// <summary>
@ -1363,90 +1365,4 @@ internal static class Vector128_
        // Narrow back to signed bytes
        return Vector128.Narrow(diffLo, diffHi);
    }
-
-    /// <summary>
-    /// Create mask from the most significant bit of each 8-bit element in <paramref name="value"/>, and store the result.
-    /// </summary>
-    /// <param name="value">
-    /// The vector containing packed 8-bit integers from which to create the mask.
-    /// </param>
-    /// <returns>
-    /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element
-    /// in <paramref name="value"/>.
-    /// </returns>
-    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static int MoveMask(Vector128<byte> value)
-    {
-        if (Sse2.IsSupported)
-        {
-            return Sse2.MoveMask(value);
-        }
-
-        // AdvSimd versions ported from Stack Overflow answer:
-        // https://stackoverflow.com/questions/11870910/sse-mm-movemask-epi8-equivalent-method-for-arm-neon
-        if (AdvSimd.Arm64.IsSupported)
-        {
-            // Shift values to align each MSB to its corresponding bit in the output
-            Vector128<sbyte> shift = Vector128.Create(-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0);
-
-            // Mask to isolate MSBs
-            Vector128<byte> msbMask = Vector128.Create((byte)0x80);
-            Vector128<byte> masked = value & msbMask;
-
-            // Shift each MSB into the correct bit position
-            Vector128<byte> shifted = AdvSimd.ShiftLogical(masked.AsSByte(), shift).AsByte();
-
-            // Sum lanes: lower 8 go into bits 0–7, upper 8 go into bits 8–15
-            byte lo = AdvSimd.Arm64.AddAcross(shifted.GetLower()).ToScalar();
-            byte hi = AdvSimd.Arm64.AddAcross(shifted.GetUpper()).ToScalar();
-
-            return lo + (hi << 8);
-        }
-
-        if (AdvSimd.IsSupported)
-        {
-            Vector128<byte> powers = Vector128.Create(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
-            Vector128<byte> msbMask = Vector128.Create((byte)0x80);
-            Vector128<byte> normalized = AdvSimd.CompareEqual(value & msbMask, msbMask); // 0xFF or 0x00
-            Vector128<byte> masked = normalized & powers;
-
-            Vector128<ushort> sum8 = AdvSimd.AddPairwiseWidening(masked);
-            Vector128<uint> sum16 = AdvSimd.AddPairwiseWidening(sum8);
-            Vector128<ulong> sum32 = AdvSimd.AddPairwiseWidening(sum16);
-
-            // Extract lower 8 bits of each 64-bit lane
-            byte lo = sum32.AsByte().GetElement(0);
-            byte hi = sum32.AsByte().GetElement(8);
-
-            return (hi << 8) | lo;
-        }
-
-        {
-            // Step 1: isolate MSBs
-            Vector128<byte> msbMask = Vector128.Create((byte)0x80);
-            Vector128<byte> masked = value & msbMask;
-
-            // Step 2: shift each byte so MSB lands in bit position [0..15]
-            // i.e. convert: 0x80 → 1 << i
-            Vector128<ushort> bitShifts = Vector128.Create((ushort)1, 2, 4, 8, 16, 32, 64, 128);
-            Vector128<ushort> bitShiftsHigh = Vector128.Create(256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
-
-            // Step 3: widen to ushort
-            (Vector128<ushort> lo, Vector128<ushort> hi) = Vector128.Widen(masked);
-
-            // Step 4: compare > 0 to get 0xFFFF where MSB was set
-            lo = Vector128.ConditionalSelect(Vector128.Equals(lo, Vector128<ushort>.Zero), Vector128<ushort>.Zero, bitShifts);
-            hi = Vector128.ConditionalSelect(Vector128.Equals(hi, Vector128<ushort>.Zero), Vector128<ushort>.Zero, bitShiftsHigh);
-
-            // Step 5: bitwise OR the two halves
-            Vector128<ushort> maskVector = lo | hi;
-
-            // Step 6: horizontal OR reduction via shuffles
-            maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)4, 5, 6, 7, 0, 1, 2, 3));
-            maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)2, 3, 0, 1, 6, 7, 4, 5));
-            maskVector |= Vector128.Shuffle(maskVector, Vector128.Create((ushort)1, 0, 3, 2, 5, 4, 7, 6));
-
-            return maskVector.ToScalar();
-        }
-    }
 }
--- a/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
+++ b/src/ImageSharp/Common/Helpers/Vector256Utilities.cs
@ -39,14 +39,17 @@ internal static class Vector256_
    /// </param>
    /// <returns>The <see cref="Vector256{Byte}"/>.</returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
-    public static Vector256<byte> ShuffleNative(Vector256<byte> vector, Vector256<byte> indices)
+    public static Vector256<byte> ShufflePerLane(Vector256<byte> vector, Vector256<byte> indices)
    {
        if (Avx2.IsSupported)
        {
            return Avx2.Shuffle(vector, indices);
        }

-        return Vector256.Shuffle(vector, indices);
+        Vector128<byte> indicesLo = indices.GetLower();
+        Vector128<byte> lower = Vector128_.ShuffleNative(vector.GetLower(), indicesLo);
+        Vector128<byte> upper = Vector128_.ShuffleNative(vector.GetUpper(), indicesLo);
+        return Vector256.Create(lower, upper);
    }

    /// <summary>
@ -458,26 +461,4 @@ internal static class Vector256_
            Vector128_.SubtractSaturate(left.GetLower(), right.GetLower()),
            Vector128_.SubtractSaturate(left.GetUpper(), right.GetUpper()));
    }
-
-    /// <summary>
-    /// Create mask from the most significant bit of each 8-bit element in <paramref name="value"/>, and store the result.
-    /// </summary>
-    /// <param name="value">
-    /// The vector containing packed 8-bit integers from which to create the mask.
-    /// </param>
-    /// <returns>
-    /// A 16-bit integer mask where each bit corresponds to the most significant bit of each 8-bit element
-    /// in <paramref name="value"/>.
-    /// </returns>
-    public static int MoveMask(Vector256<byte> value)
-    {
-        if (Avx2.IsSupported)
-        {
-            return Avx2.MoveMask(value);
-        }
-
-        int loMask = Vector128_.MoveMask(value.GetLower());
-        int hiMask = Vector128_.MoveMask(value.GetUpper());
-        return loMask | (hiMask << 16);
-    }
 }
--- a/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/ColorSpaceTransformUtils.cs
@ -16,6 +16,9 @@ internal static class ColorSpaceTransformUtils
        {
            const int span = 16;
            Span<ushort> values = stackalloc ushort[span];
+
+            // These shuffle masks are safe for use with Avx2.Shuffle because every index stays within its own 128-bit lane (0–15 in the lower lane, 16–31 in the upper lane of each mask),
+            // and all disabled bytes are set to 0xFF so that they are zeroed per the vpshufb specification. This guarantees lane-local shuffling with no cross-lane violations.
            Vector256<byte> collectColorBlueTransformsShuffleLowMask256 = Vector256.Create(255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30, 255, 255, 255, 255, 255, 255, 255, 255);
            Vector256<byte> collectColorBlueTransformsShuffleHighMask256 = Vector256.Create(255, 255, 255, 255, 255, 255, 255, 255, 255, 2, 255, 6, 255, 10, 255, 14, 255, 255, 255, 255, 255, 255, 255, 255, 255, 18, 255, 22, 255, 26, 255, 30);
            Vector256<byte> collectColorBlueTransformsGreenBlueMask256 = Vector256.Create(255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0, 255, 255, 0, 0);
@ -33,8 +36,8 @@ internal static class ColorSpaceTransformUtils
                    nuint input1Idx = x + (span / 2);
                    Vector256<byte> input0 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input0Idx)).AsByte();
                    Vector256<byte> input1 = Unsafe.As<uint, Vector256<uint>>(ref Unsafe.Add(ref inputRef, input1Idx)).AsByte();
-                    Vector256<byte> r0 = Vector256_.ShuffleNative(input0, collectColorBlueTransformsShuffleLowMask256);
-                    Vector256<byte> r1 = Vector256_.ShuffleNative(input1, collectColorBlueTransformsShuffleHighMask256);
+                    Vector256<byte> r0 = Vector256_.ShufflePerLane(input0, collectColorBlueTransformsShuffleLowMask256);
+                    Vector256<byte> r1 = Vector256_.ShufflePerLane(input1, collectColorBlueTransformsShuffleHighMask256);
                    Vector256<byte> r = r0 | r1;
                    Vector256<byte> gb0 = input0 & collectColorBlueTransformsGreenBlueMask256;
                    Vector256<byte> gb1 = input1 & collectColorBlueTransformsGreenBlueMask256;
--- a/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
+++ b/src/ImageSharp/Formats/Webp/Lossless/LosslessUtils.cs
@ -97,6 +97,9 @@ internal static unsafe class LosslessUtils
    {
        if (Vector256.IsHardwareAccelerated && pixelData.Length >= 8)
        {
+            // The `255` values zero the corresponding output bytes (the green and alpha positions), since bit 7 (0x80) is set in those control bytes.
+            // Each byte index is within its respective 128-bit lane (0–15 and 16–31), so this is safe for per-lane shuffle.
+            // Bit 7 is not set for the index bytes, and vpshufb only uses the low 4 bits of each index within its lane, satisfying AVX2 lane rules.
            Vector256<byte> addGreenToBlueAndRedMask = Vector256.Create(1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255, 17, 255, 17, 255, 21, 255, 21, 255, 25, 255, 25, 255, 29, 255, 29, 255);
            nuint numPixels = (uint)pixelData.Length;
            nuint i = 0;
@ -104,7 +107,7 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Vector256_.ShuffleNative(input, addGreenToBlueAndRedMask);
+                Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, addGreenToBlueAndRedMask);
                Vector256<byte> output = input + in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
@ -168,7 +171,7 @@ internal static unsafe class LosslessUtils
            {
                ref uint pos = ref Unsafe.Add(ref MemoryMarshal.GetReference(pixelData), i);
                Vector256<byte> input = Unsafe.As<uint, Vector256<uint>>(ref pos).AsByte();
-                Vector256<byte> in0g0g = Vector256_.ShuffleNative(input, subtractGreenFromBlueAndRedMask);
+                Vector256<byte> in0g0g = Vector256_.ShufflePerLane(input, subtractGreenFromBlueAndRedMask);
                Vector256<byte> output = input - in0g0g;
                Unsafe.As<uint, Vector256<uint>>(ref pos) = output.AsUInt32();
                i += 8;
--- a/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
+++ b/src/ImageSharp/Formats/Webp/WebpCommonUtils.cs
@ -44,8 +44,8 @@ internal static class WebpCommonUtils
                    Vector256<short> c1 = Vector256_.PackSignedSaturate(b2, b3).AsInt16();
                    Vector256<byte> d = Vector256_.PackSignedSaturate(c0, c1).AsByte();
                    Vector256<byte> bits = Vector256.Equals(d, all0x80Vector256);
-                    int mask = Vector256_.MoveMask(bits);
-                    if (mask != -1)
+                    uint mask = bits.ExtractMostSignificantBits();
+                    if (mask != 0xFFFF_FFFF)
                    {
                        return true;
                    }
@ -138,7 +138,7 @@ internal static class WebpCommonUtils
        Vector128<short> c1 = Vector128_.PackSignedSaturate(b2, b3).AsInt16();
        Vector128<byte> d = Vector128_.PackSignedSaturate(c0, c1).AsByte();
        Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Vector128_.MoveMask(bits);
+        uint mask = bits.ExtractMostSignificantBits();
        return mask != 0xFFFF;
    }

@ -153,7 +153,7 @@ internal static class WebpCommonUtils
        Vector128<short> c = Vector128_.PackSignedSaturate(b0, b1).AsInt16();
        Vector128<byte> d = Vector128_.PackSignedSaturate(c, c).AsByte();
        Vector128<byte> bits = Vector128.Equals(d, Vector128.Create((byte)0x80).AsByte());
-        int mask = Vector128_.MoveMask(bits);
+        uint mask = bits.ExtractMostSignificantBits();
        return mask != 0xFFFF;
    }
 }