Fix in-place shuffles and vectorize PNG CgBI transform

2 months ago · a3e9cc6fd4
7 changed files with 390 additions and 62 deletions
--- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
@ -31,19 +31,23 @@ internal readonly struct DefaultPad3Shuffle4([ConstantExpected] byte control) :

        SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);

-        Span<byte> temp = stackalloc byte[4];
-        ref byte t = ref MemoryMarshal.GetReference(temp);
-        ref uint tu = ref Unsafe.As<byte, uint>(ref t);
-
        for (nuint i = 0, j = 0; i < (uint)source.Length; i += 3, j += 4)
        {
-            ref byte s = ref Unsafe.Add(ref sBase, i);
-            tu = Unsafe.As<byte, uint>(ref s) | 0xFF000000;
-
-            Unsafe.Add(ref dBase, j + 0) = Unsafe.Add(ref t, p0);
-            Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1);
-            Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2);
-            Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3);
+            // Expanding 3-byte pixels to 4 bytes can overwrite the next source
+            // triplet when spans overlap. Assemble the padded pixel first, then
+            // shuffle from the staged uint.
+            uint packed =
+                Unsafe.Add(ref sBase, i + 0u) |
+                ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) |
+                ((uint)Unsafe.Add(ref sBase, i + 2u) << 16) |
+                0xFF000000;
+
+            ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
+
+            Unsafe.Add(ref dBase, j + 0u) = Unsafe.Add(ref pBase, p0);
+            Unsafe.Add(ref dBase, j + 1u) = Unsafe.Add(ref pBase, p1);
+            Unsafe.Add(ref dBase, j + 2u) = Unsafe.Add(ref pBase, p2);
+            Unsafe.Add(ref dBase, j + 3u) = Unsafe.Add(ref pBase, p3);
        }
    }
 }
@ -65,7 +69,12 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4

        while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
        {
-            Unsafe.As<byte, uint>(ref dBase) = Unsafe.As<byte, uint>(ref sBase) | 0xFF000000;
+            // The fast scalar path reads one extra byte past the source triplet.
+            // Keep that widened read in a local before writing the expanded pixel
+            // so overlapping destinations cannot change what was read.
+            uint packed = Unsafe.As<byte, uint>(ref sBase) | 0xFF000000;
+
+            Unsafe.As<byte, uint>(ref dBase) = packed;

            sBase = ref Unsafe.Add(ref sBase, 3);
            dBase = ref Unsafe.Add(ref dBase, 4);
@ -73,10 +82,15 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4

        while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
        {
-            Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0);
-            Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1);
-            Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2);
-            Unsafe.Add(ref dBase, 3) = byte.MaxValue;
+            // The final triplet cannot use the widened read above, so assemble
+            // the same padded uint byte-by-byte before the overlapping store.
+            uint packed =
+                Unsafe.Add(ref sBase, 0u) |
+                ((uint)Unsafe.Add(ref sBase, 1u) << 8) |
+                ((uint)Unsafe.Add(ref sBase, 2u) << 16) |
+                0xFF000000;
+
+            Unsafe.As<byte, uint>(ref dBase) = packed;

            sBase = ref Unsafe.Add(ref sBase, 3);
            dBase = ref Unsafe.Add(ref dBase, 4);
--- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
@ -33,9 +33,19 @@ internal readonly struct DefaultShuffle3([ConstantExpected] byte control) : IShu

        for (nuint i = 0; i < (uint)source.Length; i += 3)
        {
-            Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i);
-            Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
-            Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
+            // The scalar remainder can run in-place after the vector body. Load
+            // the full 3-byte pixel into a register-sized value before stores so
+            // channel swaps cannot corrupt later reads from the same pixel.
+            uint packed =
+                Unsafe.Add(ref sBase, i + 0u) |
+                ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) |
+                ((uint)Unsafe.Add(ref sBase, i + 2u) << 16);
+
+            ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
+
+            Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
+            Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
+            Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
        }
    }
 }
--- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs
@ -35,10 +35,16 @@ internal readonly struct DefaultShuffle4([ConstantExpected] byte control) : IShu

        for (nuint i = 0; i < (uint)source.Length; i += 4)
        {
-            Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i);
-            Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
-            Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
-            Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+            // The generic path may be used with source and destination pointing
+            // at the same pixel. Load all channels first so subsequent stores
+            // index only staged bytes, matching the specialized uint shuffles.
+            uint packed = Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, i));
+            ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
+
+            Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
+            Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
+            Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
+            Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3);
        }
    }
 }
--- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
+++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
@ -33,9 +33,15 @@ internal readonly struct DefaultShuffle4Slice3([ConstantExpected] byte control)

        for (nuint i = 0, j = 0; i < (uint)destination.Length; i += 3, j += 4)
        {
-            Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + j);
-            Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j);
-            Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j);
+            // Shrinking 4-byte pixels to 3 bytes can still be called in-place by
+            // tail code. Read the complete source pixel first, then write only
+            // the requested channels into the destination triplet.
+            uint packed = Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, j));
+            ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
+
+            Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
+            Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
+            Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
        }
    }
 }
@ -61,10 +67,18 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3

        while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
        {
-            Unsafe.Add(ref dBase, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 0));
-            Unsafe.Add(ref dBase, 1) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 1));
-            Unsafe.Add(ref dBase, 2) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 2));
-            Unsafe.Add(ref dBase, 3) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 3));
+            // Stage the four source pixels before the 3-byte stores. Even
+            // though this path preserves XYZ order, the packed loads must happen
+            // before destination writes when the spans overlap.
+            uint packed0 = Unsafe.Add(ref sBase, 0u);
+            uint packed1 = Unsafe.Add(ref sBase, 1u);
+            uint packed2 = Unsafe.Add(ref sBase, 2u);
+            uint packed3 = Unsafe.Add(ref sBase, 3u);
+
+            Unsafe.Add(ref dBase, 0u) = Unsafe.As<uint, Byte3>(ref packed0);
+            Unsafe.Add(ref dBase, 1u) = Unsafe.As<uint, Byte3>(ref packed1);
+            Unsafe.Add(ref dBase, 2u) = Unsafe.As<uint, Byte3>(ref packed2);
+            Unsafe.Add(ref dBase, 3u) = Unsafe.As<uint, Byte3>(ref packed3);

            sBase = ref Unsafe.Add(ref sBase, 4);
            dBase = ref Unsafe.Add(ref dBase, 4);
@ -72,7 +86,11 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3

        while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
        {
-            Unsafe.Add(ref dBase, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 0));
+            // Same overlap rule as the unrolled loop: take the 4-byte source
+            // pixel before storing the 3-byte destination value.
+            uint packed = Unsafe.Add(ref sBase, 0u);
+
+            Unsafe.Add(ref dBase, 0u) = Unsafe.As<uint, Byte3>(ref packed);

            sBase = ref Unsafe.Add(ref sBase, 1);
            dBase = ref Unsafe.Add(ref dBase, 1);
--- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
+++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
@ -3,6 +3,7 @@

 using System.Diagnostics;
 using System.Diagnostics.CodeAnalysis;
+using System.Numerics;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;

@ -150,10 +151,15 @@ internal static partial class SimdUtils

        for (nuint i = 0; i < (uint)source.Length; i += 4)
        {
-            Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i);
-            Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i);
-            Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i);
-            Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i);
+            // Stage the scalar tail in a local Vector4 so p0..p3 index source
+            // values that were captured before any overlapping destination writes.
+            Vector4 v = Unsafe.As<float, Vector4>(ref Unsafe.Add(ref sBase, i));
+            ref float pBase = ref Unsafe.As<Vector4, float>(ref v);
+
+            Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
+            Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
+            Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
+            Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3);
        }
    }

--- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs
+++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs
@ -9,6 +9,8 @@ using System.IO.Compression;
 using System.IO.Hashing;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
 using System.Text;
 using SixLabors.ImageSharp.Common.Helpers;
 using SixLabors.ImageSharp.Compression.Zlib;
@ -900,7 +902,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore

            if (this.isCgbi)
            {
-                ApplyCgbiTransform(scanSpan[1..], this.pngColorType);
+                this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType);
            }

            this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer);
@ -1035,7 +1037,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore

                if (this.isCgbi)
                {
-                    ApplyCgbiTransform(scanSpan[1..], this.pngColorType);
+                    this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType);
                }

                Span<TPixel> rowSpan = imageBuffer.DangerousGetRowSpan(currentRow);
@ -2505,39 +2507,289 @@ internal sealed class PngDecoderCore : ImageDecoderCore
    /// </remarks>
    /// <param name="scanline">The defiltered pixel bytes (without the leading filter byte).</param>
    /// <param name="colorType">The PNG color type from IHDR.</param>
-    private static void ApplyCgbiTransform(Span<byte> scanline, PngColorType colorType)
+    private void ApplyCgbiTransform(Span<byte> scanline, PngColorType colorType)
    {
        if (colorType == PngColorType.RgbWithAlpha)
        {
            Span<Rgba32> pixels = MemoryMarshal.Cast<byte, Rgba32>(scanline);
-            for (int i = 0; i < pixels.Length; i++)
+            int i = 0; // index of the first pixel that has not been transformed yet
+
+            if (Vector512.IsHardwareAccelerated && pixels.Length >= 16)
            {
-                ref Rgba32 p = ref pixels[i];
-                byte r = p.B;
-                byte g = p.G;
-                byte b = p.R;
-                byte a = p.A;
+                i = ApplyCgbiTransformVector512(scanline, pixels.Length); // returns the next unprocessed pixel index
+            }

-                if (a is not 0 and not byte.MaxValue)
-                {
-                    // Reverse: c' = c * a / 255  =>  c = round(c' * 255 / a)
-                    int half = a >> 1;
-                    r = (byte)Math.Min(byte.MaxValue, ((r * byte.MaxValue) + half) / a);
-                    g = (byte)Math.Min(byte.MaxValue, ((g * byte.MaxValue) + half) / a);
-                    b = (byte)Math.Min(byte.MaxValue, ((b * byte.MaxValue) + half) / a);
-                }
+            if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && (pixels.Length - i) >= 8)
+            {
+                i = ApplyCgbiTransformVector256(scanline, i, pixels.Length); // continues from where the wider path stopped
+            }

-                p = new Rgba32(r, g, b, a);
+            if (Vector128.IsHardwareAccelerated && (pixels.Length - i) >= 4)
+            {
+                i = ApplyCgbiTransformVector128(scanline, i, pixels.Length); // continues from where the wider paths stopped
+            }
+
+            for (; i < pixels.Length; i++) // scalar tail for the pixels no vector path consumed
+            {
+                ref Rgba32 pixel = ref pixels[i];
+                pixel = new Rgba32(pixel.B, pixel.G, pixel.R, pixel.A); // swap the first and third channel bytes
+                UndoCgbiPremultiplicationScalar(ref pixel);
            }
        }
        else if (colorType == PngColorType.Rgb)
        {
-            Span<Rgb24> pixels = MemoryMarshal.Cast<byte, Rgb24>(scanline);
-            for (int i = 0; i < pixels.Length; i++)
-            {
-                ref Rgb24 p = ref pixels[i];
-                (p.R, p.B) = (p.B, p.R);
-            }
+            // No alpha channel, so just swap R and B in place (target aliases scanline) using the built-in SIMD-optimized pixel operations.
+            Span<Rgb24> target = MemoryMarshal.Cast<byte, Rgb24>(scanline);
+            PixelOperations<Rgb24>.Instance.FromBgr24Bytes(this.configuration, scanline, target, target.Length);
        }
    }
+
+    [MethodImpl(MethodImplOptions.AggressiveInlining)]
+    private static void UndoCgbiPremultiplicationScalar(ref Rgba32 pixel)
+    {
+        // Alpha 0 and alpha 255 are left untouched: there is nothing to divide
+        // out, and a division by zero must be avoided.
+        byte alpha = pixel.A;
+        if (alpha is not 0 and not byte.MaxValue)
+        {
+            // Invert c' = c * a / 255 as c = ((c' * 255) + (a / 2)) / a, clamped to 255.
+            int bias = alpha >> 1;
+            int red = Math.Min(byte.MaxValue, ((pixel.R * byte.MaxValue) + bias) / alpha);
+            int green = Math.Min(byte.MaxValue, ((pixel.G * byte.MaxValue) + bias) / alpha);
+            int blue = Math.Min(byte.MaxValue, ((pixel.B * byte.MaxValue) + bias) / alpha);
+            pixel = new Rgba32((byte)red, (byte)green, (byte)blue, alpha);
+        }
+    }
+
+    private static int ApplyCgbiTransformVector512(Span<byte> scanline, int pixelCount)
+    {
+        ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline);
+        int i = 0;
+
+        Span<byte> temp = stackalloc byte[Vector512<byte>.Count];
+        SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012);
+
+        // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting
+        // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place.
+        // The generated mask only swaps bytes inside each pixel, so it remains
+        // correct for the optimized 512-bit byte shuffle helper.
+        Vector512<byte> shuffleMask = Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(temp));
+
+        Vector512<int> zero = Vector512<int>.Zero;
+        Vector512<int> one = Vector512<int>.One;
+        Vector512<int> byteMask = Vector512.Create(0xFF);
+        Vector512<int> opaque = Vector512.Create(0xFF);
+        Vector512<int> byteMax = Vector512.Create((int)byte.MaxValue);
+
+        for (; i <= pixelCount - 16; i += 16) // 16 four-byte pixels fill one 512-bit vector
+        {
+            ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf<Rgba32>());
+            Vector512<byte> bgra = Unsafe.ReadUnaligned<Vector512<byte>>(ref blockRef);
+            Vector512<byte> rgba = Vector512_.ShuffleNative(bgra, shuffleMask);
+            Vector512<int> packed = rgba.AsInt32();
+            Vector512<int> alpha = Vector512.ShiftRightLogical(packed, 24);
+
+            // Fully transparent and fully opaque pixels are identity cases for
+            // unpremultiplication. Masking them keeps the scalar behavior and lets
+            // safeAlpha avoid dividing by zero for alpha == 0.
+            Vector512<int> partialMask = ~(Vector512.Equals(alpha, zero) | Vector512.Equals(alpha, opaque));
+
+            Vector512<int> r = packed & byteMask;
+            Vector512<int> g = Vector512.ShiftRightLogical(packed, 8) & byteMask;
+            Vector512<int> b = Vector512.ShiftRightLogical(packed, 16) & byteMask;
+
+            Vector512<int> safeAlpha = Vector512.ConditionalSelect(partialMask, alpha, one);
+            Vector512<int> halfAlpha = Vector512.ShiftRightLogical(safeAlpha, 1);
+            Vector512<float> safeAlphaF = Vector512.ConvertToSingle(safeAlpha);
+
+            // The scalar path computes ((c * 255) + (a >> 1)) / a with integer
+            // division. The numerator never exceeds 65152, so the float conversion is
+            // exact; Floor then pins the quotient to the same truncated integer.
+            Vector512<int> unpremultipliedR = Vector512.Min(
+                byteMax,
+                Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF)));
+
+            Vector512<int> unpremultipliedG = Vector512.Min(
+                byteMax,
+                Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF)));
+
+            Vector512<int> unpremultipliedB = Vector512.Min(
+                byteMax,
+                Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF)));
+
+            // ConditionalSelect applies the expensive unpremultiply only to pixels
+            // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the
+            // shuffled channel values exactly as the scalar path does.
+            Vector512<int> finalR = Vector512.ConditionalSelect(partialMask, unpremultipliedR, r);
+            Vector512<int> finalG = Vector512.ConditionalSelect(partialMask, unpremultipliedG, g);
+            Vector512<int> finalB = Vector512.ConditionalSelect(partialMask, unpremultipliedB, b);
+
+            // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so
+            // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3
+            // recreates the in-memory RGBA bytes for the unaligned store.
+            Vector512<int> result =
+                finalR |
+                Vector512.ShiftLeft(finalG, 8) |
+                Vector512.ShiftLeft(finalB, 16) |
+                Vector512.ShiftLeft(alpha, 24);
+
+            Unsafe.WriteUnaligned(ref blockRef, result.AsByte());
+        }
+
+        return i; // index of the first pixel this path did not transform
+    }
+
+    private static int ApplyCgbiTransformVector256(Span<byte> scanline, int startPixel, int pixelCount)
+    {
+        ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline);
+        int i = startPixel;
+
+        Span<byte> temp = stackalloc byte[Vector256<byte>.Count]; // only one Vector256<byte> of mask bytes is read below
+        SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012);
+
+        // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting
+        // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place.
+        // Avx2.Shuffle is 128-bit lane-local, and the generated mask repeats inside
+        // each lane, so no byte ever needs to cross the lane boundary.
+        Vector256<byte> shuffleMask = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(temp));
+
+        Vector256<int> zero = Vector256<int>.Zero;
+        Vector256<int> one = Vector256<int>.One;
+        Vector256<int> byteMask = Vector256.Create(0xFF);
+        Vector256<int> opaque = Vector256.Create(0xFF);
+        Vector256<int> byteMax = Vector256.Create((int)byte.MaxValue);
+
+        for (; i <= pixelCount - 8; i += 8) // 8 four-byte pixels fill one 256-bit vector
+        {
+            ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf<Rgba32>());
+            Vector256<byte> bgra = Unsafe.ReadUnaligned<Vector256<byte>>(ref blockRef);
+            Vector256<byte> rgba = Vector256_.ShufflePerLane(bgra, shuffleMask);
+            Vector256<int> packed = rgba.AsInt32();
+            Vector256<int> alpha = Vector256.ShiftRightLogical(packed, 24);
+
+            // Fully transparent and fully opaque pixels are identity cases for
+            // unpremultiplication. Masking them keeps the scalar behavior and lets
+            // safeAlpha avoid dividing by zero for alpha == 0.
+            Vector256<int> partialMask = ~(Vector256.Equals(alpha, zero) | Vector256.Equals(alpha, opaque));
+
+            Vector256<int> r = packed & byteMask;
+            Vector256<int> g = Vector256.ShiftRightLogical(packed, 8) & byteMask;
+            Vector256<int> b = Vector256.ShiftRightLogical(packed, 16) & byteMask;
+
+            Vector256<int> safeAlpha = Vector256.ConditionalSelect(partialMask, alpha, one);
+            Vector256<int> halfAlpha = Vector256.ShiftRightLogical(safeAlpha, 1);
+            Vector256<float> safeAlphaF = Vector256.ConvertToSingle(safeAlpha);
+
+            // The scalar path computes ((c * 255) + (a >> 1)) / a with integer
+            // division. The numerator never exceeds 65152, so the float conversion is
+            // exact; Floor then pins the quotient to the same truncated integer.
+            Vector256<int> unpremultipliedR = Vector256.Min(
+                byteMax,
+                Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF)));
+
+            Vector256<int> unpremultipliedG = Vector256.Min(
+                byteMax,
+                Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF)));
+
+            Vector256<int> unpremultipliedB = Vector256.Min(
+                byteMax,
+                Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF)));
+
+            // ConditionalSelect applies the expensive unpremultiply only to pixels
+            // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the
+            // shuffled channel values exactly as the scalar path does.
+            Vector256<int> finalR = Vector256.ConditionalSelect(partialMask, unpremultipliedR, r);
+            Vector256<int> finalG = Vector256.ConditionalSelect(partialMask, unpremultipliedG, g);
+            Vector256<int> finalB = Vector256.ConditionalSelect(partialMask, unpremultipliedB, b);
+
+            // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so
+            // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3
+            // recreates the in-memory RGBA bytes for the unaligned store.
+            Vector256<int> result =
+                finalR |
+                Vector256.ShiftLeft(finalG, 8) |
+                Vector256.ShiftLeft(finalB, 16) |
+                Vector256.ShiftLeft(alpha, 24);
+
+            Unsafe.WriteUnaligned(ref blockRef, result.AsByte());
+        }
+
+        return i; // index of the first pixel this path did not transform
+    }
+
+    private static int ApplyCgbiTransformVector128(Span<byte> scanline, int startPixel, int pixelCount)
+    {
+        ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline);
+        int i = startPixel;
+
+        Span<byte> temp = stackalloc byte[Vector128<byte>.Count]; // only one Vector128<byte> of mask bytes is read below
+        SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012);
+
+        // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting
+        // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place.
+        Vector128<byte> shuffleMask = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(temp));
+
+        Vector128<int> zero = Vector128<int>.Zero;
+        Vector128<int> one = Vector128<int>.One;
+        Vector128<int> byteMask = Vector128.Create(0xFF);
+        Vector128<int> opaque = Vector128.Create(0xFF);
+        Vector128<int> byteMax = Vector128.Create((int)byte.MaxValue);
+
+        for (; i <= pixelCount - 4; i += 4) // 4 four-byte pixels fill one 128-bit vector
+        {
+            ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf<Rgba32>());
+            Vector128<byte> bgra = Unsafe.ReadUnaligned<Vector128<byte>>(ref blockRef);
+            Vector128<byte> rgba = Vector128_.ShuffleNative(bgra, shuffleMask);
+            Vector128<int> packed = rgba.AsInt32();
+            Vector128<int> alpha = Vector128.ShiftRightLogical(packed, 24);
+
+            // Fully transparent and fully opaque pixels are identity cases for
+            // unpremultiplication. Masking them keeps the scalar behavior and lets
+            // safeAlpha avoid dividing by zero for alpha == 0.
+            Vector128<int> partialMask = ~(Vector128.Equals(alpha, zero) | Vector128.Equals(alpha, opaque));
+
+            Vector128<int> r = packed & byteMask;
+            Vector128<int> g = Vector128.ShiftRightLogical(packed, 8) & byteMask;
+            Vector128<int> b = Vector128.ShiftRightLogical(packed, 16) & byteMask;
+
+            Vector128<int> safeAlpha = Vector128.ConditionalSelect(partialMask, alpha, one);
+            Vector128<int> halfAlpha = Vector128.ShiftRightLogical(safeAlpha, 1);
+            Vector128<float> safeAlphaF = Vector128.ConvertToSingle(safeAlpha);
+
+            // The scalar path computes ((c * 255) + (a >> 1)) / a with integer
+            // division. The numerator never exceeds 65152, so the float conversion is
+            // exact; Floor then pins the quotient to the same truncated integer.
+            Vector128<int> unpremultipliedR = Vector128.Min(
+                byteMax,
+                Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF)));
+
+            Vector128<int> unpremultipliedG = Vector128.Min(
+                byteMax,
+                Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF)));
+
+            Vector128<int> unpremultipliedB = Vector128.Min(
+                byteMax,
+                Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF)));
+
+            // ConditionalSelect applies the expensive unpremultiply only to pixels
+            // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the
+            // shuffled channel values exactly as the scalar path does.
+            Vector128<int> finalR = Vector128.ConditionalSelect(partialMask, unpremultipliedR, r);
+            Vector128<int> finalG = Vector128.ConditionalSelect(partialMask, unpremultipliedG, g);
+            Vector128<int> finalB = Vector128.ConditionalSelect(partialMask, unpremultipliedB, b);
+
+            // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so
+            // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3
+            // recreates the in-memory RGBA bytes for the unaligned store.
+            Vector128<int> result =
+                finalR |
+                Vector128.ShiftLeft(finalG, 8) |
+                Vector128.ShiftLeft(finalB, 16) |
+                Vector128.ShiftLeft(alpha, 24);
+
+            Unsafe.WriteUnaligned(ref blockRef, result.AsByte());
+        }
+
+        return i; // index of the first pixel this path did not transform
+    }
 }
--- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
+++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
@ -722,10 +722,32 @@ public partial class PngDecoderTests
    [WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgb24)]
    public void Decode_AppleCgBI<TPixel>(TestImageProvider<TPixel> provider)
        where TPixel : unmanaged, IPixel<TPixel>
+        => FeatureTestRunner.RunWithHwIntrinsicsFeature(
+            RunDecodeAppleCgbi,
+            HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableHWIntrinsic,
+            provider,
+            provider.PixelType.ToString());
+
+    private static void RunDecodeAppleCgbi(string providerDump, string pixelType)
    {
-        using Image<TPixel> image = provider.GetImage(PngDecoder.Instance);
-        image.DebugSave(provider);
-        image.CompareToReferenceOutput(provider, ImageComparer.Exact);
+        if (Enum.Parse<PixelTypes>(pixelType) == PixelTypes.Rgb24) // the pixel type arrives as text, so the generic type is picked by hand
+        {
+            TestImageProvider<Rgb24> provider =
+                FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgb24>>(providerDump);
+
+            using Image<Rgb24> image = provider.GetImage(PngDecoder.Instance);
+            image.DebugSave(provider);
+            image.CompareToReferenceOutput(provider, ImageComparer.Exact);
+
+            return; // everything below handles any non-Rgb24 pixel type as Rgba32
+        }
+
+        TestImageProvider<Rgba32> rgbaProvider =
+            FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgba32>>(providerDump);
+
+        using Image<Rgba32> rgbaImage = rgbaProvider.GetImage(PngDecoder.Instance);
+        rgbaImage.DebugSave(rgbaProvider);
+        rgbaImage.CompareToReferenceOutput(rgbaProvider, ImageComparer.Exact);
    }

    [Theory]