diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs index 0f282c7f9a..bbe14b0991 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs @@ -31,19 +31,23 @@ internal readonly struct DefaultPad3Shuffle4([ConstantExpected] byte control) : SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0); - Span temp = stackalloc byte[4]; - ref byte t = ref MemoryMarshal.GetReference(temp); - ref uint tu = ref Unsafe.As(ref t); - for (nuint i = 0, j = 0; i < (uint)source.Length; i += 3, j += 4) { - ref byte s = ref Unsafe.Add(ref sBase, i); - tu = Unsafe.As(ref s) | 0xFF000000; - - Unsafe.Add(ref dBase, j + 0) = Unsafe.Add(ref t, p0); - Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1); - Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2); - Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3); + // Expanding 3-byte pixels to 4 bytes can overwrite the next source + // triplet when spans overlap. Assemble the padded pixel first, then + // shuffle from the staged uint. + uint packed = + Unsafe.Add(ref sBase, i + 0u) | + ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) | + ((uint)Unsafe.Add(ref sBase, i + 2u) << 16) | + 0xFF000000; + + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, j + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, j + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, j + 2u) = Unsafe.Add(ref pBase, p2); + Unsafe.Add(ref dBase, j + 3u) = Unsafe.Add(ref pBase, p3); } } } @@ -65,7 +69,12 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4 while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) { - Unsafe.As(ref dBase) = Unsafe.As(ref sBase) | 0xFF000000; + // The fast scalar path reads one extra byte past the source triplet. + // Keep that widened read in a local before writing the expanded pixel + // so overlapping destinations cannot change what was read. + uint packed = Unsafe.As(ref sBase) | 0xFF000000; + + Unsafe.As(ref dBase) = packed; sBase = ref Unsafe.Add(ref sBase, 3); dBase = ref Unsafe.Add(ref dBase, 4); @@ -73,10 +82,15 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4 while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) { - Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0); - Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1); - Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2); - Unsafe.Add(ref dBase, 3) = byte.MaxValue; + // The final triplet cannot use the widened read above, so assemble + // the same padded uint byte-by-byte before the overlapping store. + uint packed = + Unsafe.Add(ref sBase, 0u) | + ((uint)Unsafe.Add(ref sBase, 1u) << 8) | + ((uint)Unsafe.Add(ref sBase, 2u) << 16) | + 0xFF000000; + + Unsafe.As(ref dBase) = packed; sBase = ref Unsafe.Add(ref sBase, 3); dBase = ref Unsafe.Add(ref dBase, 4); diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs index 3c0973ad69..3907df58c6 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs @@ -33,9 +33,19 @@ internal readonly struct DefaultShuffle3([ConstantExpected] byte control) : IShu for (nuint i = 0; i < (uint)source.Length; i += 3) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); + // The scalar remainder can run in-place after the vector body. Load + // the full 3-byte pixel into a register-sized value before stores so + // channel swaps cannot corrupt later reads from the same pixel. + uint packed = + Unsafe.Add(ref sBase, i + 0u) | + ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) | + ((uint)Unsafe.Add(ref sBase, i + 2u) << 16); + + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); } } } diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs index d5c6df2c8b..68f34efd7c 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs @@ -35,10 +35,16 @@ internal readonly struct DefaultShuffle4([ConstantExpected] byte control) : IShu for (nuint i = 0; i < (uint)source.Length; i += 4) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); - Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + // The generic path may be used with source and destination pointing + // at the same pixel. Load all channels first so subsequent stores + // index only staged bytes, matching the specialized uint shuffles. + uint packed = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); + Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3); } } } diff --git a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs index 3e7e440664..6134061670 100644 --- a/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs +++ b/src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs @@ -33,9 +33,15 @@ internal readonly struct DefaultShuffle4Slice3([ConstantExpected] byte control) for (nuint i = 0, j = 0; i < (uint)destination.Length; i += 3, j += 4) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + j); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); + // Shrinking 4-byte pixels to 3 bytes can still be called in-place by + // tail code. Read the complete source pixel first, then write only + // the requested channels into the destination triplet. + uint packed = Unsafe.As(ref Unsafe.Add(ref sBase, j)); + ref byte pBase = ref Unsafe.As(ref packed); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); } } } @@ -61,10 +67,18 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3 while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) { - Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); - Unsafe.Add(ref dBase, 1) = Unsafe.As(ref Unsafe.Add(ref sBase, 1)); - Unsafe.Add(ref dBase, 2) = Unsafe.As(ref Unsafe.Add(ref sBase, 2)); - Unsafe.Add(ref dBase, 3) = Unsafe.As(ref Unsafe.Add(ref sBase, 3)); + // Stage the four source pixels before the 3-byte stores. Even + // though this path preserves XYZ order, the packed loads must happen + // before destination writes when the spans overlap. + uint packed0 = Unsafe.Add(ref sBase, 0u); + uint packed1 = Unsafe.Add(ref sBase, 1u); + uint packed2 = Unsafe.Add(ref sBase, 2u); + uint packed3 = Unsafe.Add(ref sBase, 3u); + + Unsafe.Add(ref dBase, 0u) = Unsafe.As(ref packed0); + Unsafe.Add(ref dBase, 1u) = Unsafe.As(ref packed1); + Unsafe.Add(ref dBase, 2u) = Unsafe.As(ref packed2); + Unsafe.Add(ref dBase, 3u) = Unsafe.As(ref packed3); sBase = ref Unsafe.Add(ref sBase, 4); dBase = ref Unsafe.Add(ref dBase, 4); @@ -72,7 +86,11 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3 while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) { - Unsafe.Add(ref dBase, 0) = Unsafe.As(ref Unsafe.Add(ref sBase, 0)); + // Same overlap rule as the unrolled loop: take the 4-byte source + // pixel before storing the 3-byte destination value. + uint packed = Unsafe.Add(ref sBase, 0u); + + Unsafe.Add(ref dBase, 0u) = Unsafe.As(ref packed); sBase = ref Unsafe.Add(ref sBase, 1); dBase = ref Unsafe.Add(ref dBase, 1); diff --git a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs index dbeb54a80c..8b2baec213 100644 --- a/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs +++ b/src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs @@ -3,6 +3,7 @@ using System.Diagnostics; using System.Diagnostics.CodeAnalysis; +using System.Numerics; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; @@ -150,10 +151,15 @@ internal static partial class SimdUtils for (nuint i = 0; i < (uint)source.Length; i += 4) { - Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); - Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); - Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); - Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); + // Stage the scalar tail in a local Vector4 so p0..p3 index source + // values that were captured before any overlapping destination writes. + Vector4 v = Unsafe.As(ref Unsafe.Add(ref sBase, i)); + ref float pBase = ref Unsafe.As(ref v); + + Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0); + Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1); + Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2); + Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3); } } diff --git a/src/ImageSharp/Formats/Png/PngDecoderCore.cs b/src/ImageSharp/Formats/Png/PngDecoderCore.cs index dcd9ffd6c2..84245254a2 100644 --- a/src/ImageSharp/Formats/Png/PngDecoderCore.cs +++ b/src/ImageSharp/Formats/Png/PngDecoderCore.cs @@ -9,6 +9,8 @@ using System.IO.Compression; using System.IO.Hashing; using System.Runtime.CompilerServices; using System.Runtime.InteropServices; +using System.Runtime.Intrinsics; +using System.Runtime.Intrinsics.X86; using System.Text; using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Compression.Zlib; @@ -900,7 +902,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore if (this.isCgbi) { - ApplyCgbiTransform(scanSpan[1..], this.pngColorType); + this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType); } this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer); @@ -1035,7 +1037,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore if (this.isCgbi) { - ApplyCgbiTransform(scanSpan[1..], this.pngColorType); + this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType); } Span rowSpan = imageBuffer.DangerousGetRowSpan(currentRow); @@ -2505,39 +2507,289 @@ internal sealed class PngDecoderCore : ImageDecoderCore /// /// The defiltered pixel bytes (without the leading filter byte). /// The PNG color type from IHDR. - private static void ApplyCgbiTransform(Span scanline, PngColorType colorType) + private void ApplyCgbiTransform(Span scanline, PngColorType colorType) { if (colorType == PngColorType.RgbWithAlpha) { Span pixels = MemoryMarshal.Cast(scanline); - for (int i = 0; i < pixels.Length; i++) + int i = 0; + + if (Vector512.IsHardwareAccelerated && pixels.Length >= 16) { - ref Rgba32 p = ref pixels[i]; - byte r = p.B; - byte g = p.G; - byte b = p.R; - byte a = p.A; + i = ApplyCgbiTransformVector512(scanline, pixels.Length); + } - if (a is not 0 and not byte.MaxValue) - { - // Reverse: c' = c * a / 255 => c = round(c' * 255 / a) - int half = a >> 1; - r = (byte)Math.Min(byte.MaxValue, ((r * byte.MaxValue) + half) / a); - g = (byte)Math.Min(byte.MaxValue, ((g * byte.MaxValue) + half) / a); - b = (byte)Math.Min(byte.MaxValue, ((b * byte.MaxValue) + half) / a); - } + if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && (pixels.Length - i) >= 8) + { + i = ApplyCgbiTransformVector256(scanline, i, pixels.Length); + } - p = new Rgba32(r, g, b, a); + if (Vector128.IsHardwareAccelerated && (pixels.Length - i) >= 4) + { + i = ApplyCgbiTransformVector128(scanline, i, pixels.Length); + } + + for (; i < pixels.Length; i++) + { + ref Rgba32 pixel = ref pixels[i]; + pixel = new Rgba32(pixel.B, pixel.G, pixel.R, pixel.A); + UndoCgbiPremultiplicationScalar(ref pixel); } } else if (colorType == PngColorType.Rgb) { - Span pixels = MemoryMarshal.Cast(scanline); - for (int i = 0; i < pixels.Length; i++) - { - ref Rgb24 p = ref pixels[i]; - (p.R, p.B) = (p.B, p.R); - } + // No alpha channel, so just swap R and B using built in SIMD-optimized pixel operations. + Span target = MemoryMarshal.Cast(scanline); + PixelOperations.Instance.FromBgr24Bytes(this.configuration, scanline, target, target.Length); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void UndoCgbiPremultiplicationScalar(ref Rgba32 pixel) + { + byte a = pixel.A; + if (a is 0 or byte.MaxValue) + { + return; + } + + // Reverse: c' = c * a / 255 => c = round(c' * 255 / a) + int half = a >> 1; + byte r = (byte)Math.Min(byte.MaxValue, ((pixel.R * byte.MaxValue) + half) / a); + byte g = (byte)Math.Min(byte.MaxValue, ((pixel.G * byte.MaxValue) + half) / a); + byte b = (byte)Math.Min(byte.MaxValue, ((pixel.B * byte.MaxValue) + half) / a); + pixel = new Rgba32(r, g, b, a); + } + + private static int ApplyCgbiTransformVector512(Span scanline, int pixelCount) + { + ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); + int i = 0; + + Span temp = stackalloc byte[Vector512.Count]; + SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); + + // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting + // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place. + // The generated mask only swaps bytes inside each pixel, so it remains + // correct for the optimized 512-bit byte shuffle helper. + Vector512 shuffleMask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); + + Vector512 zero = Vector512.Zero; + Vector512 one = Vector512.One; + Vector512 byteMask = Vector512.Create(0xFF); + Vector512 opaque = Vector512.Create(0xFF); + Vector512 byteMax = Vector512.Create((int)byte.MaxValue); + + for (; i <= pixelCount - 16; i += 16) + { + ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf()); + Vector512 bgra = Unsafe.ReadUnaligned>(ref blockRef); + Vector512 rgba = Vector512_.ShuffleNative(bgra, shuffleMask); + Vector512 packed = rgba.AsInt32(); + Vector512 alpha = Vector512.ShiftRightLogical(packed, 24); + + // Fully transparent and fully opaque pixels are identity cases for + // unpremultiplication. Masking them keeps the scalar behavior and lets + // safeAlpha avoid dividing by zero for alpha == 0. + Vector512 partialMask = ~(Vector512.Equals(alpha, zero) | Vector512.Equals(alpha, opaque)); + + Vector512 r = packed & byteMask; + Vector512 g = Vector512.ShiftRightLogical(packed, 8) & byteMask; + Vector512 b = Vector512.ShiftRightLogical(packed, 16) & byteMask; + + Vector512 safeAlpha = Vector512.ConditionalSelect(partialMask, alpha, one); + Vector512 halfAlpha = Vector512.ShiftRightLogical(safeAlpha, 1); + Vector512 safeAlphaF = Vector512.ConvertToSingle(safeAlpha); + + // The scalar path computes ((c * 255) + (a >> 1)) / a with integer + // division. Floor the positive quotient before converting so SIMD does + // not use the default round-to-nearest conversion and drift by one. + Vector512 unpremultipliedR = Vector512.Min( + byteMax, + Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); + + Vector512 unpremultipliedG = Vector512.Min( + byteMax, + Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); + + Vector512 unpremultipliedB = Vector512.Min( + byteMax, + Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); + + // ConditionalSelect applies the expensive unpremultiply only to pixels + // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the + // shuffled channel values exactly as the scalar path does. + Vector512 finalR = Vector512.ConditionalSelect(partialMask, unpremultipliedR, r); + Vector512 finalG = Vector512.ConditionalSelect(partialMask, unpremultipliedG, g); + Vector512 finalB = Vector512.ConditionalSelect(partialMask, unpremultipliedB, b); + + // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so + // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3 + // recreates the in-memory RGBA bytes for the unaligned store. + Vector512 result = + finalR | + Vector512.ShiftLeft(finalG, 8) | + Vector512.ShiftLeft(finalB, 16) | + Vector512.ShiftLeft(alpha, 24); + + Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); + } + + return i; + } + + private static int ApplyCgbiTransformVector256(Span scanline, int startPixel, int pixelCount) + { + ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); + int i = startPixel; + + Span temp = stackalloc byte[Vector512.Count]; + SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); + + // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting + // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place. + // Avx2.Shuffle is 128-bit lane-local, and the generated mask repeats inside + // each lane, so no byte ever needs to cross the lane boundary. + Vector256 shuffleMask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); + + Vector256 zero = Vector256.Zero; + Vector256 one = Vector256.One; + Vector256 byteMask = Vector256.Create(0xFF); + Vector256 opaque = Vector256.Create(0xFF); + Vector256 byteMax = Vector256.Create((int)byte.MaxValue); + + for (; i <= pixelCount - 8; i += 8) + { + ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf()); + Vector256 bgra = Unsafe.ReadUnaligned>(ref blockRef); + Vector256 rgba = Vector256_.ShufflePerLane(bgra, shuffleMask); + Vector256 packed = rgba.AsInt32(); + Vector256 alpha = Vector256.ShiftRightLogical(packed, 24); + + // Fully transparent and fully opaque pixels are identity cases for + // unpremultiplication. Masking them keeps the scalar behavior and lets + // safeAlpha avoid dividing by zero for alpha == 0. + Vector256 partialMask = ~(Vector256.Equals(alpha, zero) | Vector256.Equals(alpha, opaque)); + + Vector256 r = packed & byteMask; + Vector256 g = Vector256.ShiftRightLogical(packed, 8) & byteMask; + Vector256 b = Vector256.ShiftRightLogical(packed, 16) & byteMask; + + Vector256 safeAlpha = Vector256.ConditionalSelect(partialMask, alpha, one); + Vector256 halfAlpha = Vector256.ShiftRightLogical(safeAlpha, 1); + Vector256 safeAlphaF = Vector256.ConvertToSingle(safeAlpha); + + // The scalar path computes ((c * 255) + (a >> 1)) / a with integer + // division. Floor the positive quotient before converting so SIMD does + // not use the default round-to-nearest conversion and drift by one. + Vector256 unpremultipliedR = Vector256.Min( + byteMax, + Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); + + Vector256 unpremultipliedG = Vector256.Min( + byteMax, + Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); + + Vector256 unpremultipliedB = Vector256.Min( + byteMax, + Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); + + // ConditionalSelect applies the expensive unpremultiply only to pixels + // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the + // shuffled channel values exactly as the scalar path does. + Vector256 finalR = Vector256.ConditionalSelect(partialMask, unpremultipliedR, r); + Vector256 finalG = Vector256.ConditionalSelect(partialMask, unpremultipliedG, g); + Vector256 finalB = Vector256.ConditionalSelect(partialMask, unpremultipliedB, b); + + // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so + // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3 + // recreates the in-memory RGBA bytes for the unaligned store. + Vector256 result = + finalR | + Vector256.ShiftLeft(finalG, 8) | + Vector256.ShiftLeft(finalB, 16) | + Vector256.ShiftLeft(alpha, 24); + + Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); + } + + return i; + } + + private static int ApplyCgbiTransformVector128(Span scanline, int startPixel, int pixelCount) + { + ref byte scanlineRef = ref MemoryMarshal.GetReference(scanline); + int i = startPixel; + + Span temp = stackalloc byte[Vector512.Count]; + SimdUtils.Shuffle.MMShuffleSpan(ref temp, SimdUtils.Shuffle.MMShuffle3012); + + // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel, converting + // CgBI's BGRA byte order to Rgba32's RGBA layout while keeping alpha in place. + Vector128 shuffleMask = Unsafe.As>(ref MemoryMarshal.GetReference(temp)); + + Vector128 zero = Vector128.Zero; + Vector128 one = Vector128.One; + Vector128 byteMask = Vector128.Create(0xFF); + Vector128 opaque = Vector128.Create(0xFF); + Vector128 byteMax = Vector128.Create((int)byte.MaxValue); + + for (; i <= pixelCount - 4; i += 4) + { + ref byte blockRef = ref Unsafe.Add(ref scanlineRef, i * Unsafe.SizeOf()); + Vector128 bgra = Unsafe.ReadUnaligned>(ref blockRef); + Vector128 rgba = Vector128_.ShuffleNative(bgra, shuffleMask); + Vector128 packed = rgba.AsInt32(); + Vector128 alpha = Vector128.ShiftRightLogical(packed, 24); + + // Fully transparent and fully opaque pixels are identity cases for + // unpremultiplication. Masking them keeps the scalar behavior and lets + // safeAlpha avoid dividing by zero for alpha == 0. + Vector128 partialMask = ~(Vector128.Equals(alpha, zero) | Vector128.Equals(alpha, opaque)); + + Vector128 r = packed & byteMask; + Vector128 g = Vector128.ShiftRightLogical(packed, 8) & byteMask; + Vector128 b = Vector128.ShiftRightLogical(packed, 16) & byteMask; + + Vector128 safeAlpha = Vector128.ConditionalSelect(partialMask, alpha, one); + Vector128 halfAlpha = Vector128.ShiftRightLogical(safeAlpha, 1); + Vector128 safeAlphaF = Vector128.ConvertToSingle(safeAlpha); + + // The scalar path computes ((c * 255) + (a >> 1)) / a with integer + // division. Floor the positive quotient before converting so SIMD does + // not use the default round-to-nearest conversion and drift by one. + Vector128 unpremultipliedR = Vector128.Min( + byteMax, + Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((r * byteMax) + halfAlpha) / safeAlphaF))); + + Vector128 unpremultipliedG = Vector128.Min( + byteMax, + Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((g * byteMax) + halfAlpha) / safeAlphaF))); + + Vector128 unpremultipliedB = Vector128.Min( + byteMax, + Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((b * byteMax) + halfAlpha) / safeAlphaF))); + + // ConditionalSelect applies the expensive unpremultiply only to pixels + // where alpha is between 1 and 254; alpha 0 and 255 lanes keep the + // shuffled channel values exactly as the scalar path does. + Vector128 finalR = Vector128.ConditionalSelect(partialMask, unpremultipliedR, r); + Vector128 finalG = Vector128.ConditionalSelect(partialMask, unpremultipliedG, g); + Vector128 finalB = Vector128.ConditionalSelect(partialMask, unpremultipliedB, b); + + // Rgba32 is laid out as little-endian 0xAABBGGRR in an int lane, so + // shifting the unpacked channels back to byte offsets 0, 1, 2, and 3 + // recreates the in-memory RGBA bytes for the unaligned store. + Vector128 result = + finalR | + Vector128.ShiftLeft(finalG, 8) | + Vector128.ShiftLeft(finalB, 16) | + Vector128.ShiftLeft(alpha, 24); + + Unsafe.WriteUnaligned(ref blockRef, result.AsByte()); + } + + return i; + } } diff --git a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs index d3a699c492..2e452b896d 100644 --- a/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs +++ b/tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs @@ -722,10 +722,32 @@ public partial class PngDecoderTests [WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgb24)] public void Decode_AppleCgBI(TestImageProvider provider) where TPixel : unmanaged, IPixel + => FeatureTestRunner.RunWithHwIntrinsicsFeature( + RunDecodeAppleCgbi, + HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableHWIntrinsic, + provider, + provider.PixelType.ToString()); + + private static void RunDecodeAppleCgbi(string providerDump, string pixelType) { - using Image image = provider.GetImage(PngDecoder.Instance); - image.DebugSave(provider); - image.CompareToReferenceOutput(provider, ImageComparer.Exact); + if (Enum.Parse(pixelType) == PixelTypes.Rgb24) + { + TestImageProvider provider = + FeatureTestRunner.DeserializeForXunit>(providerDump); + + using Image image = provider.GetImage(PngDecoder.Instance); + image.DebugSave(provider); + image.CompareToReferenceOutput(provider, ImageComparer.Exact); + + return; + } + + TestImageProvider rgbaProvider = + FeatureTestRunner.DeserializeForXunit>(providerDump); + + using Image rgbaImage = rgbaProvider.GetImage(PngDecoder.Instance); + rgbaImage.DebugSave(rgbaProvider); + rgbaImage.CompareToReferenceOutput(rgbaProvider, ImageComparer.Exact); } [Theory]