Browse Source

Merge pull request #3136 from Erik-White/png-cgbi

Add support for Apple CgBI PNG images
main
James Jackson-South 4 days ago
committed by GitHub
parent
commit
04ff1e91de
No known key found for this signature in database GPG Key ID: B5690EEEBB952194
  1. 46
      src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs
  2. 16
      src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs
  3. 14
      src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs
  4. 34
      src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs
  5. 14
      src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs
  6. 31
      src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs
  7. 326
      src/ImageSharp/Formats/Png/PngDecoderCore.cs
  8. 63
      tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs
  9. 14
      tests/ImageSharp.Tests/TestImages.cs
  10. 3
      tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgb24_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png
  11. 3
      tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_Issue_410_ImageThreshold-0_PerPixelManhattanThreshold-0.png
  12. 3
      tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_clocks_ImageThreshold-0_PerPixelManhattanThreshold-0.png
  13. 3
      tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_colors_ImageThreshold-0_PerPixelManhattanThreshold-0.png
  14. 3
      tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_screen_ImageThreshold-0_PerPixelManhattanThreshold-0.png
  15. 3
      tests/Images/Input/Png/cgbi/clocks.png
  16. 3
      tests/Images/Input/Png/cgbi/colors.png
  17. 3
      tests/Images/Input/Png/cgbi/flecks.png
  18. 3
      tests/Images/Input/Png/cgbi/screen.png

46
src/ImageSharp/Common/Helpers/Shuffle/IPad3Shuffle4.cs

@ -31,19 +31,23 @@ internal readonly struct DefaultPad3Shuffle4([ConstantExpected] byte control) :
SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0); SimdUtils.Shuffle.InverseMMShuffle(this.Control, out uint p3, out uint p2, out uint p1, out uint p0);
Span<byte> temp = stackalloc byte[4];
ref byte t = ref MemoryMarshal.GetReference(temp);
ref uint tu = ref Unsafe.As<byte, uint>(ref t);
for (nuint i = 0, j = 0; i < (uint)source.Length; i += 3, j += 4) for (nuint i = 0, j = 0; i < (uint)source.Length; i += 3, j += 4)
{ {
ref byte s = ref Unsafe.Add(ref sBase, i); // Expanding 3-byte pixels to 4 bytes can overwrite the next source
tu = Unsafe.As<byte, uint>(ref s) | 0xFF000000; // triplet when spans overlap. Assemble the padded pixel first, then
// shuffle from the staged uint.
Unsafe.Add(ref dBase, j + 0) = Unsafe.Add(ref t, p0); uint packed =
Unsafe.Add(ref dBase, j + 1) = Unsafe.Add(ref t, p1); Unsafe.Add(ref sBase, i + 0u) |
Unsafe.Add(ref dBase, j + 2) = Unsafe.Add(ref t, p2); ((uint)Unsafe.Add(ref sBase, i + 1u) << 8) |
Unsafe.Add(ref dBase, j + 3) = Unsafe.Add(ref t, p3); ((uint)Unsafe.Add(ref sBase, i + 2u) << 16) |
0xFF000000;
ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
Unsafe.Add(ref dBase, j + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, j + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, j + 2u) = Unsafe.Add(ref pBase, p2);
Unsafe.Add(ref dBase, j + 3u) = Unsafe.Add(ref pBase, p3);
} }
} }
} }
@ -65,7 +69,12 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
{ {
Unsafe.As<byte, uint>(ref dBase) = Unsafe.As<byte, uint>(ref sBase) | 0xFF000000; // The fast scalar path reads one extra byte past the source triplet.
// Keep that widened read in a local before writing the expanded pixel
// so overlapping destinations cannot change what was read.
uint packed = Unsafe.As<byte, uint>(ref sBase) | 0xFF000000;
Unsafe.As<byte, uint>(ref dBase) = packed;
sBase = ref Unsafe.Add(ref sBase, 3); sBase = ref Unsafe.Add(ref sBase, 3);
dBase = ref Unsafe.Add(ref dBase, 4); dBase = ref Unsafe.Add(ref dBase, 4);
@ -73,10 +82,15 @@ internal readonly struct XYZWPad3Shuffle4 : IPad3Shuffle4
while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
{ {
Unsafe.Add(ref dBase, 0) = Unsafe.Add(ref sBase, 0); // The final triplet cannot use the widened read above, so assemble
Unsafe.Add(ref dBase, 1) = Unsafe.Add(ref sBase, 1); // the same padded uint byte-by-byte before the overlapping store.
Unsafe.Add(ref dBase, 2) = Unsafe.Add(ref sBase, 2); uint packed =
Unsafe.Add(ref dBase, 3) = byte.MaxValue; Unsafe.Add(ref sBase, 0u) |
((uint)Unsafe.Add(ref sBase, 1u) << 8) |
((uint)Unsafe.Add(ref sBase, 2u) << 16) |
0xFF000000;
Unsafe.As<byte, uint>(ref dBase) = packed;
sBase = ref Unsafe.Add(ref sBase, 3); sBase = ref Unsafe.Add(ref sBase, 3);
dBase = ref Unsafe.Add(ref dBase, 4); dBase = ref Unsafe.Add(ref dBase, 4);

16
src/ImageSharp/Common/Helpers/Shuffle/IShuffle3.cs

@ -33,9 +33,19 @@ internal readonly struct DefaultShuffle3([ConstantExpected] byte control) : IShu
for (nuint i = 0; i < (uint)source.Length; i += 3) for (nuint i = 0; i < (uint)source.Length; i += 3)
{ {
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); // The scalar remainder can run in-place after the vector body. Load
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); // the full 3-byte pixel into a register-sized value before stores so
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); // channel swaps cannot corrupt later reads from the same pixel.
uint packed =
Unsafe.Add(ref sBase, i + 0u) |
((uint)Unsafe.Add(ref sBase, i + 1u) << 8) |
((uint)Unsafe.Add(ref sBase, i + 2u) << 16);
ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
} }
} }
} }

14
src/ImageSharp/Common/Helpers/Shuffle/IShuffle4.cs

@ -35,10 +35,16 @@ internal readonly struct DefaultShuffle4([ConstantExpected] byte control) : IShu
for (nuint i = 0; i < (uint)source.Length; i += 4) for (nuint i = 0; i < (uint)source.Length; i += 4)
{ {
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); // The generic path may be used with source and destination pointing
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); // at the same pixel. Load all channels first so subsequent stores
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); // index only staged bytes, matching the specialized uint shuffles.
Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); uint packed = Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, i));
ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3);
} }
} }
} }

34
src/ImageSharp/Common/Helpers/Shuffle/IShuffle4Slice3.cs

@ -33,9 +33,15 @@ internal readonly struct DefaultShuffle4Slice3([ConstantExpected] byte control)
for (nuint i = 0, j = 0; i < (uint)destination.Length; i += 3, j += 4) for (nuint i = 0, j = 0; i < (uint)destination.Length; i += 3, j += 4)
{ {
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + j); // Shrinking 4-byte pixels to 3 bytes can still be called in-place by
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + j); // tail code. Read the complete source pixel first, then write only
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + j); // the requested channels into the destination triplet.
uint packed = Unsafe.As<byte, uint>(ref Unsafe.Add(ref sBase, j));
ref byte pBase = ref Unsafe.As<uint, byte>(ref packed);
Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
} }
} }
} }
@ -61,10 +67,18 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd)) while (Unsafe.IsAddressLessThan(ref sBase, ref sLoopEnd))
{ {
Unsafe.Add(ref dBase, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 0)); // Stage the four source pixels before the 3-byte stores. Even
Unsafe.Add(ref dBase, 1) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 1)); // though this path preserves XYZ order, the packed loads must happen
Unsafe.Add(ref dBase, 2) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 2)); // before destination writes when the spans overlap.
Unsafe.Add(ref dBase, 3) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 3)); uint packed0 = Unsafe.Add(ref sBase, 0u);
uint packed1 = Unsafe.Add(ref sBase, 1u);
uint packed2 = Unsafe.Add(ref sBase, 2u);
uint packed3 = Unsafe.Add(ref sBase, 3u);
Unsafe.Add(ref dBase, 0u) = Unsafe.As<uint, Byte3>(ref packed0);
Unsafe.Add(ref dBase, 1u) = Unsafe.As<uint, Byte3>(ref packed1);
Unsafe.Add(ref dBase, 2u) = Unsafe.As<uint, Byte3>(ref packed2);
Unsafe.Add(ref dBase, 3u) = Unsafe.As<uint, Byte3>(ref packed3);
sBase = ref Unsafe.Add(ref sBase, 4); sBase = ref Unsafe.Add(ref sBase, 4);
dBase = ref Unsafe.Add(ref dBase, 4); dBase = ref Unsafe.Add(ref dBase, 4);
@ -72,7 +86,11 @@ internal readonly struct XYZWShuffle4Slice3 : IShuffle4Slice3
while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd)) while (Unsafe.IsAddressLessThan(ref sBase, ref sEnd))
{ {
Unsafe.Add(ref dBase, 0) = Unsafe.As<uint, Byte3>(ref Unsafe.Add(ref sBase, 0)); // Same overlap rule as the unrolled loop: take the 4-byte source
// pixel before storing the 3-byte destination value.
uint packed = Unsafe.Add(ref sBase, 0u);
Unsafe.Add(ref dBase, 0u) = Unsafe.As<uint, Byte3>(ref packed);
sBase = ref Unsafe.Add(ref sBase, 1); sBase = ref Unsafe.Add(ref sBase, 1);
dBase = ref Unsafe.Add(ref dBase, 1); dBase = ref Unsafe.Add(ref dBase, 1);

14
src/ImageSharp/Common/Helpers/SimdUtils.Shuffle.cs

@ -3,6 +3,7 @@
using System.Diagnostics; using System.Diagnostics;
using System.Diagnostics.CodeAnalysis; using System.Diagnostics.CodeAnalysis;
using System.Numerics;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
@ -150,10 +151,15 @@ internal static partial class SimdUtils
for (nuint i = 0; i < (uint)source.Length; i += 4) for (nuint i = 0; i < (uint)source.Length; i += 4)
{ {
Unsafe.Add(ref dBase, i + 0) = Unsafe.Add(ref sBase, p0 + i); // Stage the scalar tail in a local Vector4 so p0..p3 index source
Unsafe.Add(ref dBase, i + 1) = Unsafe.Add(ref sBase, p1 + i); // values that were captured before any overlapping destination writes.
Unsafe.Add(ref dBase, i + 2) = Unsafe.Add(ref sBase, p2 + i); Vector4 v = Unsafe.As<float, Vector4>(ref Unsafe.Add(ref sBase, i));
Unsafe.Add(ref dBase, i + 3) = Unsafe.Add(ref sBase, p3 + i); ref float pBase = ref Unsafe.As<Vector4, float>(ref v);
Unsafe.Add(ref dBase, i + 0u) = Unsafe.Add(ref pBase, p0);
Unsafe.Add(ref dBase, i + 1u) = Unsafe.Add(ref pBase, p1);
Unsafe.Add(ref dBase, i + 2u) = Unsafe.Add(ref pBase, p2);
Unsafe.Add(ref dBase, i + 3u) = Unsafe.Add(ref pBase, p3);
} }
} }

31
src/ImageSharp/Compression/Zlib/ZlibInflateStream.cs

@ -52,12 +52,19 @@ internal sealed class ZlibInflateStream : Stream
/// </summary> /// </summary>
private readonly Func<int> getData; private readonly Func<int> getData;
/// <summary>
/// When true, the inflated payload is treated as a raw DEFLATE stream with no zlib
/// CMF/FLG header (and no Adler-32 trailer). This is required to decode IDATs in
/// Apple's proprietary CgBI PNG variant.
/// </summary>
private readonly bool noHeader;
/// <summary> /// <summary>
/// Initializes a new instance of the <see cref="ZlibInflateStream"/> class. /// Initializes a new instance of the <see cref="ZlibInflateStream"/> class.
/// </summary> /// </summary>
/// <param name="innerStream">The inner raw stream.</param> /// <param name="innerStream">The inner raw stream.</param>
public ZlibInflateStream(BufferedReadStream innerStream) public ZlibInflateStream(BufferedReadStream innerStream)
: this(innerStream, GetDataNoOp) : this(innerStream, GetDataNoOp, noHeader: false)
{ {
} }
@ -67,9 +74,23 @@ internal sealed class ZlibInflateStream : Stream
/// <param name="innerStream">The inner raw stream.</param> /// <param name="innerStream">The inner raw stream.</param>
/// <param name="getData">A delegate to get more data from the inner stream.</param> /// <param name="getData">A delegate to get more data from the inner stream.</param>
public ZlibInflateStream(BufferedReadStream innerStream, Func<int> getData) public ZlibInflateStream(BufferedReadStream innerStream, Func<int> getData)
: this(innerStream, getData, noHeader: false)
{
}
/// <summary>
/// Initializes a new instance of the <see cref="ZlibInflateStream"/> class.
/// </summary>
/// <param name="innerStream">The inner raw stream.</param>
/// <param name="getData">A delegate to get more data from the inner stream.</param>
/// <param name="noHeader">
/// When <see langword="true"/>, the payload is treated as raw DEFLATE with no zlib header.
/// </param>
public ZlibInflateStream(BufferedReadStream innerStream, Func<int> getData, bool noHeader)
{ {
this.innerStream = innerStream; this.innerStream = innerStream;
this.getData = getData; this.getData = getData;
this.noHeader = noHeader;
} }
/// <inheritdoc/> /// <inheritdoc/>
@ -210,6 +231,14 @@ internal sealed class ZlibInflateStream : Stream
[MemberNotNullWhen(true, nameof(CompressedStream))] [MemberNotNullWhen(true, nameof(CompressedStream))]
private bool InitializeInflateStream(bool isCriticalChunk) private bool InitializeInflateStream(bool isCriticalChunk)
{ {
// Apple CgBI IDATs omit the zlib CMF/FLG header and the Adler-32 trailer,
// wrapping a raw DEFLATE payload directly. Skip the header parsing in that mode.
if (this.noHeader)
{
this.CompressedStream = new DeflateStream(this, CompressionMode.Decompress, true);
return true;
}
// Read the zlib header : http://tools.ietf.org/html/rfc1950 // Read the zlib header : http://tools.ietf.org/html/rfc1950
// CMF(Compression Method and flags) // CMF(Compression Method and flags)
// This byte is divided into a 4 - bit compression method and a // This byte is divided into a 4 - bit compression method and a

326
src/ImageSharp/Formats/Png/PngDecoderCore.cs

@ -9,6 +9,8 @@ using System.IO.Compression;
using System.IO.Hashing; using System.IO.Hashing;
using System.Runtime.CompilerServices; using System.Runtime.CompilerServices;
using System.Runtime.InteropServices; using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Text; using System.Text;
using SixLabors.ImageSharp.Common.Helpers; using SixLabors.ImageSharp.Common.Helpers;
using SixLabors.ImageSharp.Compression.Zlib; using SixLabors.ImageSharp.Compression.Zlib;
@ -137,6 +139,13 @@ internal sealed class PngDecoderCore : ImageDecoderCore
/// </summary> /// </summary>
private bool hasImageData; private bool hasImageData;
/// <summary>
/// Whether this is an Apple CgBI PNG. CgBI files store IDATs as raw DEFLATE
/// (no zlib header/Adler-32) and pixels as premultiplied BGRA, so they need
/// extra inversion steps to round-trip back to standard PNG semantics.
/// </summary>
private bool isCgbi;
/// <summary> /// <summary>
/// Initializes a new instance of the <see cref="PngDecoderCore"/> class. /// Initializes a new instance of the <see cref="PngDecoderCore"/> class.
/// </summary> /// </summary>
@ -314,7 +323,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore
case PngChunkType.End: case PngChunkType.End:
goto EOF; goto EOF;
case PngChunkType.ProprietaryApple: case PngChunkType.ProprietaryApple:
PngThrowHelper.ThrowInvalidChunkType("Proprietary Apple PNG detected! This PNG file is not conform to the specification and cannot be decoded."); this.isCgbi = true;
break; break;
} }
} }
@ -517,6 +526,10 @@ internal sealed class PngDecoderCore : ImageDecoderCore
case PngChunkType.End: case PngChunkType.End:
goto EOF; goto EOF;
case PngChunkType.ProprietaryApple:
this.isCgbi = true;
break;
default: default:
if (this.colorMetadataOnly) if (this.colorMetadataOnly)
{ {
@ -766,7 +779,7 @@ internal sealed class PngDecoderCore : ImageDecoderCore
CancellationToken cancellationToken) CancellationToken cancellationToken)
where TPixel : unmanaged, IPixel<TPixel> where TPixel : unmanaged, IPixel<TPixel>
{ {
using ZlibInflateStream inflateStream = new(this.currentStream, getData); using ZlibInflateStream inflateStream = new(this.currentStream, getData, noHeader: this.isCgbi);
if (!inflateStream.AllocateNewBytes(chunkLength, !this.hasImageData)) if (!inflateStream.AllocateNewBytes(chunkLength, !this.hasImageData))
{ {
return; return;
@ -887,6 +900,11 @@ internal sealed class PngDecoderCore : ImageDecoderCore
break; break;
} }
if (this.isCgbi)
{
this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType);
}
this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer); this.ProcessDefilteredScanline(frameControl, currentRow, scanSpan, imageFrame, pngMetadata, blendRowBuffer);
this.SwapScanlineBuffers(); this.SwapScanlineBuffers();
currentRow++; currentRow++;
@ -1017,6 +1035,11 @@ internal sealed class PngDecoderCore : ImageDecoderCore
break; break;
} }
if (this.isCgbi)
{
this.ApplyCgbiTransform(scanSpan[1..], this.pngColorType);
}
Span<TPixel> rowSpan = imageBuffer.DangerousGetRowSpan(currentRow); Span<TPixel> rowSpan = imageBuffer.DangerousGetRowSpan(currentRow);
this.ProcessInterlacedDefilteredScanline( this.ProcessInterlacedDefilteredScanline(
frameControl, frameControl,
@ -2470,4 +2493,303 @@ internal sealed class PngDecoderCore : ImageDecoderCore
private void SwapScanlineBuffers() private void SwapScanlineBuffers()
=> (this.scanline, this.previousScanline) = (this.previousScanline, this.scanline); => (this.scanline, this.previousScanline) = (this.previousScanline, this.scanline);
/// <summary>
/// Rewrites a defiltered Apple CgBI scanline in place so it carries standard PNG
/// pixel bytes. For <see cref="PngColorType.RgbWithAlpha"/> each 4-byte pixel has
/// its first and third bytes exchanged (BGRA to RGBA) and its color channels
/// un-premultiplied by alpha; for <see cref="PngColorType.Rgb"/> only the channel
/// exchange is performed. Every other color type is returned untouched.
/// </summary>
/// <remarks>
/// See https://theapplewiki.com/wiki/PNG_CgBI_Format
/// </remarks>
/// <param name="scanline">The defiltered pixel bytes (without the leading filter byte).</param>
/// <param name="colorType">The PNG color type from IHDR.</param>
private void ApplyCgbiTransform(Span<byte> scanline, PngColorType colorType)
{
    if (colorType == PngColorType.Rgb)
    {
        // There is no alpha to undo, so the conversion is a pure BGR -> RGB swap.
        // Source and destination deliberately alias the same scanline memory.
        Span<Rgb24> rgbPixels = MemoryMarshal.Cast<byte, Rgb24>(scanline);
        PixelOperations<Rgb24>.Instance.FromBgr24Bytes(this.configuration, scanline, rgbPixels, rgbPixels.Length);
        return;
    }

    if (colorType != PngColorType.RgbWithAlpha)
    {
        return;
    }

    Span<Rgba32> pixels = MemoryMarshal.Cast<byte, Rgba32>(scanline);
    int count = pixels.Length;
    int done = 0;

    // Widest vectors first. Each helper returns the index of the first pixel it
    // did not convert, which the next (narrower) stage picks up from.
    if (Vector512.IsHardwareAccelerated && count >= 16)
    {
        done = ApplyCgbiTransformVector512(scanline, count);
    }

    if (Vector256.IsHardwareAccelerated && Avx2.IsSupported && (count - done) >= 8)
    {
        done = ApplyCgbiTransformVector256(scanline, done, count);
    }

    if (Vector128.IsHardwareAccelerated && (count - done) >= 4)
    {
        done = ApplyCgbiTransformVector128(scanline, done, count);
    }

    // Scalar tail for whatever the vector stages left over.
    while (done < count)
    {
        Rgba32 stored = pixels[done];
        Rgba32 converted = new(stored.B, stored.G, stored.R, stored.A);
        UndoCgbiPremultiplicationScalar(ref converted);
        pixels[done] = converted;
        done++;
    }
}
/// <summary>
/// Reverses alpha premultiplication on a single pixel in place. Pixels whose
/// alpha is 0 or 255 are left exactly as they are.
/// </summary>
/// <param name="pixel">The pixel to un-premultiply.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void UndoCgbiPremultiplicationScalar(ref Rgba32 pixel)
{
    byte alpha = pixel.A;
    if (alpha == 0 || alpha == byte.MaxValue)
    {
        return;
    }

    // Premultiplied c' = c * a / 255, hence c = round(c' * 255 / a). Adding half
    // of the divisor before the integer division rounds to nearest.
    int rounding = alpha >> 1;
    pixel = new Rgba32(
        Restore(pixel.R, alpha, rounding),
        Restore(pixel.G, alpha, rounding),
        Restore(pixel.B, alpha, rounding),
        alpha);

    // Clamped because a stored channel larger than alpha would exceed 255.
    static byte Restore(byte channel, byte divisor, int bias)
        => (byte)Math.Min(byte.MaxValue, ((channel * byte.MaxValue) + bias) / divisor);
}
/// <summary>
/// Converts CgBI pixels to standard RGBA in place, sixteen 4-byte pixels per
/// iteration, starting at the first pixel of <paramref name="scanline"/>.
/// </summary>
/// <param name="scanline">The defiltered pixel bytes, 4 bytes per pixel.</param>
/// <param name="pixelCount">The number of pixels in <paramref name="scanline"/>.</param>
/// <returns>The index of the first pixel that was not converted.</returns>
private static int ApplyCgbiTransformVector512(Span<byte> scanline, int pixelCount)
{
    const int PixelsPerBlock = 16;

    // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel: BGRA becomes
    // RGBA with alpha left where it is. Bytes only move within their own pixel.
    Span<byte> maskBytes = stackalloc byte[Vector512<byte>.Count];
    SimdUtils.Shuffle.MMShuffleSpan(ref maskBytes, SimdUtils.Shuffle.MMShuffle3012);
    Vector512<byte> swizzle = Unsafe.As<byte, Vector512<byte>>(ref MemoryMarshal.GetReference(maskBytes));

    // 0xFF serves as the byte extraction mask, the "opaque" alpha value and the
    // channel maximum all at once.
    Vector512<int> channelMax = Vector512.Create(0xFF);
    Vector512<int> unit = Vector512<int>.One;
    Vector512<int> none = Vector512<int>.Zero;

    ref byte origin = ref MemoryMarshal.GetReference(scanline);
    int lastBlockStart = pixelCount - PixelsPerBlock;
    int pixel = 0;
    while (pixel <= lastBlockStart)
    {
        ref byte block = ref Unsafe.Add(ref origin, pixel * Unsafe.SizeOf<Rgba32>());
        Vector512<int> rgba = Vector512_.ShuffleNative(Unsafe.ReadUnaligned<Vector512<byte>>(ref block), swizzle).AsInt32();
        Vector512<int> alpha = Vector512.ShiftRightLogical(rgba, 24);

        // Lanes with alpha 0 or 255 are identity cases. For those the divisor is
        // forced to 1 so the division below never sees zero.
        Vector512<int> translucent = ~Vector512.Equals(alpha, none) & ~Vector512.Equals(alpha, channelMax);
        Vector512<int> divisor = Vector512.ConditionalSelect(translucent, alpha, unit);
        Vector512<int> bias = Vector512.ShiftRightLogical(divisor, 1);
        Vector512<float> divisorF = Vector512.ConvertToSingle(divisor);

        Vector512<int> red = Restore(rgba & channelMax, channelMax, bias, divisorF);
        Vector512<int> green = Restore(Vector512.ShiftRightLogical(rgba, 8) & channelMax, channelMax, bias, divisorF);
        Vector512<int> blue = Restore(Vector512.ShiftRightLogical(rgba, 16) & channelMax, channelMax, bias, divisorF);

        // Each int lane holds one pixel as 0xAABBGGRR, so shifting the channels
        // back to bits 0, 8, 16 and 24 rebuilds the in-memory RGBA bytes.
        Vector512<int> restored =
            red |
            Vector512.ShiftLeft(green, 8) |
            Vector512.ShiftLeft(blue, 16) |
            Vector512.ShiftLeft(alpha, 24);

        // The mask is all-ones or all-zeros per lane, so selecting whole pixels
        // keeps the shuffled value bit-for-bit where alpha is 0 or 255.
        Unsafe.WriteUnaligned(ref block, Vector512.ConditionalSelect(translucent, restored, rgba).AsByte());
        pixel += PixelsPerBlock;
    }

    return pixel;

    // Mirrors the scalar ((c * 255) + (a >> 1)) / a. The quotient is floored
    // before conversion so it truncates like integer division instead of
    // rounding to nearest, then clamped to 255.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static Vector512<int> Restore(Vector512<int> channel, Vector512<int> max, Vector512<int> rounding, Vector512<float> alphaF)
        => Vector512.Min(
            max,
            Vector512.ConvertToInt32(Vector512.Floor(Vector512.ConvertToSingle((channel * max) + rounding) / alphaF)));
}
/// <summary>
/// Converts CgBI pixels to standard RGBA in place, eight 4-byte pixels per
/// iteration, starting at <paramref name="startPixel"/>.
/// </summary>
/// <param name="scanline">The defiltered pixel bytes, 4 bytes per pixel.</param>
/// <param name="startPixel">The index of the first pixel to convert.</param>
/// <param name="pixelCount">The number of pixels in <paramref name="scanline"/>.</param>
/// <returns>The index of the first pixel that was not converted.</returns>
private static int ApplyCgbiTransformVector256(Span<byte> scanline, int startPixel, int pixelCount)
{
    const int PixelsPerBlock = 8;

    // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel: BGRA becomes
    // RGBA with alpha left where it is. Bytes only move within their own pixel,
    // so a shuffle that is local to each 128-bit lane is sufficient.
    // The buffer is sized for a 512-bit mask; only its first 32 bytes are used.
    Span<byte> maskBytes = stackalloc byte[Vector512<byte>.Count];
    SimdUtils.Shuffle.MMShuffleSpan(ref maskBytes, SimdUtils.Shuffle.MMShuffle3012);
    Vector256<byte> swizzle = Unsafe.As<byte, Vector256<byte>>(ref MemoryMarshal.GetReference(maskBytes));

    // 0xFF serves as the byte extraction mask, the "opaque" alpha value and the
    // channel maximum all at once.
    Vector256<int> channelMax = Vector256.Create(0xFF);
    Vector256<int> unit = Vector256<int>.One;
    Vector256<int> none = Vector256<int>.Zero;

    ref byte origin = ref MemoryMarshal.GetReference(scanline);
    int lastBlockStart = pixelCount - PixelsPerBlock;
    int pixel = startPixel;
    while (pixel <= lastBlockStart)
    {
        ref byte block = ref Unsafe.Add(ref origin, pixel * Unsafe.SizeOf<Rgba32>());
        Vector256<int> rgba = Vector256_.ShufflePerLane(Unsafe.ReadUnaligned<Vector256<byte>>(ref block), swizzle).AsInt32();
        Vector256<int> alpha = Vector256.ShiftRightLogical(rgba, 24);

        // Lanes with alpha 0 or 255 are identity cases. For those the divisor is
        // forced to 1 so the division below never sees zero.
        Vector256<int> translucent = ~Vector256.Equals(alpha, none) & ~Vector256.Equals(alpha, channelMax);
        Vector256<int> divisor = Vector256.ConditionalSelect(translucent, alpha, unit);
        Vector256<int> bias = Vector256.ShiftRightLogical(divisor, 1);
        Vector256<float> divisorF = Vector256.ConvertToSingle(divisor);

        Vector256<int> red = Restore(rgba & channelMax, channelMax, bias, divisorF);
        Vector256<int> green = Restore(Vector256.ShiftRightLogical(rgba, 8) & channelMax, channelMax, bias, divisorF);
        Vector256<int> blue = Restore(Vector256.ShiftRightLogical(rgba, 16) & channelMax, channelMax, bias, divisorF);

        // Each int lane holds one pixel as 0xAABBGGRR, so shifting the channels
        // back to bits 0, 8, 16 and 24 rebuilds the in-memory RGBA bytes.
        Vector256<int> restored =
            red |
            Vector256.ShiftLeft(green, 8) |
            Vector256.ShiftLeft(blue, 16) |
            Vector256.ShiftLeft(alpha, 24);

        // The mask is all-ones or all-zeros per lane, so selecting whole pixels
        // keeps the shuffled value bit-for-bit where alpha is 0 or 255.
        Unsafe.WriteUnaligned(ref block, Vector256.ConditionalSelect(translucent, restored, rgba).AsByte());
        pixel += PixelsPerBlock;
    }

    return pixel;

    // Mirrors the scalar ((c * 255) + (a >> 1)) / a. The quotient is floored
    // before conversion so it truncates like integer division instead of
    // rounding to nearest, then clamped to 255.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static Vector256<int> Restore(Vector256<int> channel, Vector256<int> max, Vector256<int> rounding, Vector256<float> alphaF)
        => Vector256.Min(
            max,
            Vector256.ConvertToInt32(Vector256.Floor(Vector256.ConvertToSingle((channel * max) + rounding) / alphaF)));
}
/// <summary>
/// Converts CgBI pixels to standard RGBA in place, four 4-byte pixels per
/// iteration, starting at <paramref name="startPixel"/>.
/// </summary>
/// <param name="scanline">The defiltered pixel bytes, 4 bytes per pixel.</param>
/// <param name="startPixel">The index of the first pixel to convert.</param>
/// <param name="pixelCount">The number of pixels in <paramref name="scanline"/>.</param>
/// <returns>The index of the first pixel that was not converted.</returns>
private static int ApplyCgbiTransformVector128(Span<byte> scanline, int startPixel, int pixelCount)
{
    const int PixelsPerBlock = 4;

    // MMShuffle3012 expands to [2, 1, 0, 3] for each 4-byte pixel: BGRA becomes
    // RGBA with alpha left where it is.
    // The buffer is sized for a 512-bit mask; only its first 16 bytes are used.
    Span<byte> maskBytes = stackalloc byte[Vector512<byte>.Count];
    SimdUtils.Shuffle.MMShuffleSpan(ref maskBytes, SimdUtils.Shuffle.MMShuffle3012);
    Vector128<byte> swizzle = Unsafe.As<byte, Vector128<byte>>(ref MemoryMarshal.GetReference(maskBytes));

    // 0xFF serves as the byte extraction mask, the "opaque" alpha value and the
    // channel maximum all at once.
    Vector128<int> channelMax = Vector128.Create(0xFF);
    Vector128<int> unit = Vector128<int>.One;
    Vector128<int> none = Vector128<int>.Zero;

    ref byte origin = ref MemoryMarshal.GetReference(scanline);
    int lastBlockStart = pixelCount - PixelsPerBlock;
    int pixel = startPixel;
    while (pixel <= lastBlockStart)
    {
        ref byte block = ref Unsafe.Add(ref origin, pixel * Unsafe.SizeOf<Rgba32>());
        Vector128<int> rgba = Vector128_.ShuffleNative(Unsafe.ReadUnaligned<Vector128<byte>>(ref block), swizzle).AsInt32();
        Vector128<int> alpha = Vector128.ShiftRightLogical(rgba, 24);

        // Lanes with alpha 0 or 255 are identity cases. For those the divisor is
        // forced to 1 so the division below never sees zero.
        Vector128<int> translucent = ~Vector128.Equals(alpha, none) & ~Vector128.Equals(alpha, channelMax);
        Vector128<int> divisor = Vector128.ConditionalSelect(translucent, alpha, unit);
        Vector128<int> bias = Vector128.ShiftRightLogical(divisor, 1);
        Vector128<float> divisorF = Vector128.ConvertToSingle(divisor);

        Vector128<int> red = Restore(rgba & channelMax, channelMax, bias, divisorF);
        Vector128<int> green = Restore(Vector128.ShiftRightLogical(rgba, 8) & channelMax, channelMax, bias, divisorF);
        Vector128<int> blue = Restore(Vector128.ShiftRightLogical(rgba, 16) & channelMax, channelMax, bias, divisorF);

        // Each int lane holds one pixel as 0xAABBGGRR, so shifting the channels
        // back to bits 0, 8, 16 and 24 rebuilds the in-memory RGBA bytes.
        Vector128<int> restored =
            red |
            Vector128.ShiftLeft(green, 8) |
            Vector128.ShiftLeft(blue, 16) |
            Vector128.ShiftLeft(alpha, 24);

        // The mask is all-ones or all-zeros per lane, so selecting whole pixels
        // keeps the shuffled value bit-for-bit where alpha is 0 or 255.
        Unsafe.WriteUnaligned(ref block, Vector128.ConditionalSelect(translucent, restored, rgba).AsByte());
        pixel += PixelsPerBlock;
    }

    return pixel;

    // Mirrors the scalar ((c * 255) + (a >> 1)) / a. The quotient is floored
    // before conversion so it truncates like integer division instead of
    // rounding to nearest, then clamped to 255.
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    static Vector128<int> Restore(Vector128<int> channel, Vector128<int> max, Vector128<int> rounding, Vector128<float> alphaF)
        => Vector128.Min(
            max,
            Vector128.ConvertToInt32(Vector128.Floor(Vector128.ConvertToSingle((channel * max) + rounding) / alphaF)));
}
} }

63
tests/ImageSharp.Tests/Formats/Png/PngDecoderTests.cs

@ -714,26 +714,57 @@ public partial class PngDecoderTests
Assert.Contains(metadata.ColorTable.Value.ToArray(), x => x.ToPixel<Rgba32>().A < 255); Assert.Contains(metadata.ColorTable.Value.ToArray(), x => x.ToPixel<Rgba32>().A < 255);
} }
// https://github.com/SixLabors/ImageSharp/issues/410
[Theory] [Theory]
[WithFile(TestImages.Png.Bad.Issue410_MalformedApplePng, PixelTypes.Rgba32)] [WithFile(TestImages.Png.Cgbi.Issue410, PixelTypes.Rgba32)]
public void Issue410_MalformedApplePng<TPixel>(TestImageProvider<TPixel> provider) [WithFile(TestImages.Png.Cgbi.Colors, PixelTypes.Rgba32)]
[WithFile(TestImages.Png.Cgbi.Clocks, PixelTypes.Rgba32)]
[WithFile(TestImages.Png.Cgbi.Screen, PixelTypes.Rgba32)]
[WithFile(TestImages.Png.Cgbi.Flecks, PixelTypes.Rgb24)]
public void Decode_AppleCgBI<TPixel>(TestImageProvider<TPixel> provider)
where TPixel : unmanaged, IPixel<TPixel> where TPixel : unmanaged, IPixel<TPixel>
=> FeatureTestRunner.RunWithHwIntrinsicsFeature(
RunDecodeAppleCgbi,
HwIntrinsics.AllowAll | HwIntrinsics.DisableAVX512F | HwIntrinsics.DisableAVX2 | HwIntrinsics.DisableHWIntrinsic,
provider,
provider.PixelType.ToString());
private static void RunDecodeAppleCgbi(string providerDump, string pixelType)
{ {
Exception ex = Record.Exception( if (Enum.Parse<PixelTypes>(pixelType) == PixelTypes.Rgb24)
() => {
{ TestImageProvider<Rgb24> provider =
using Image<TPixel> image = provider.GetImage(PngDecoder.Instance); FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgb24>>(providerDump);
image.DebugSave(provider);
// We don't have another x-plat reference decoder that can be compared for this image. using Image<Rgb24> image = provider.GetImage(PngDecoder.Instance);
if (TestEnvironment.IsWindows) image.DebugSave(provider);
{ image.CompareToReferenceOutput(provider, ImageComparer.Exact);
image.CompareToOriginal(provider, ImageComparer.Exact, SystemDrawingReferenceDecoder.Png);
} return;
}); }
Assert.NotNull(ex);
Assert.Contains("Proprietary Apple PNG detected!", ex.Message); TestImageProvider<Rgba32> rgbaProvider =
FeatureTestRunner.DeserializeForXunit<TestImageProvider<Rgba32>>(providerDump);
using Image<Rgba32> rgbaImage = rgbaProvider.GetImage(PngDecoder.Instance);
rgbaImage.DebugSave(rgbaProvider);
rgbaImage.CompareToReferenceOutput(rgbaProvider, ImageComparer.Exact);
}
/// <summary>
/// Verifies that identifying an Apple CgBI PNG test image (header-level inspection via
/// <c>Image.Identify</c>, no pixel comparison) reports the PNG format and the expected dimensions.
/// </summary>
/// <param name="imagePath">Test image path passed to <c>TestFile.Create</c>.</param>
/// <param name="expectedWidth">Width, in pixels, that the identified image must report.</param>
/// <param name="expectedHeight">Height, in pixels, that the identified image must report.</param>
[Theory]
[InlineData(TestImages.Png.Cgbi.Colors, 120, 120)]
[InlineData(TestImages.Png.Cgbi.Issue410, 42, 26)]
[InlineData(TestImages.Png.Cgbi.Flecks, 510, 512)]
public void Identify_AppleCgBI(string imagePath, int expectedWidth, int expectedHeight)
{
TestFile testFile = TestFile.Create(imagePath);
// Wrap the raw file bytes in a non-writable stream (second argument is `writable: false`).
using MemoryStream stream = new(testFile.Bytes, false);
ImageInfo imageInfo = Image.Identify(stream);
Assert.NotNull(imageInfo);
// The image must be recognised as a PNG and report the expected size.
Assert.Equal(PngFormat.Instance, imageInfo.Metadata.DecodedImageFormat);
Assert.Equal(expectedWidth, imageInfo.Width);
Assert.Equal(expectedHeight, imageInfo.Height);
} }
[Theory] [Theory]

14
tests/ImageSharp.Tests/TestImages.cs

@ -180,6 +180,17 @@ public static class TestImages
public const string PerceptualcLUTOnly = "Png/icc-profiles/Perceptual-cLUT-only.png"; public const string PerceptualcLUTOnly = "Png/icc-profiles/Perceptual-cLUT-only.png";
} }
/// <summary>
/// Paths of the Apple CgBI PNG test images used by the PNG decoder tests.
/// </summary>
public static class Cgbi
{
public const string Colors = "Png/cgbi/colors.png";
public const string Clocks = "Png/cgbi/clocks.png";
public const string Flecks = "Png/cgbi/flecks.png";
public const string Screen = "Png/cgbi/screen.png";
// Issue 410: https://github.com/SixLabors/ImageSharp/issues/410
// Note: this file lives under Png/issues rather than Png/cgbi like the other entries.
public const string Issue410 = "Png/issues/Issue_410.png";
}
public static class Bad public static class Bad
{ {
public const string MissingDataChunk = "Png/xdtn0g01.png"; public const string MissingDataChunk = "Png/xdtn0g01.png";
@ -202,9 +213,6 @@ public static class TestImages
// Issue 1047: https://github.com/SixLabors/ImageSharp/issues/1047 // Issue 1047: https://github.com/SixLabors/ImageSharp/issues/1047
public const string Issue1047_BadEndChunk = "Png/issues/Issue_1047.png"; public const string Issue1047_BadEndChunk = "Png/issues/Issue_1047.png";
// Issue 410: https://github.com/SixLabors/ImageSharp/issues/410
public const string Issue410_MalformedApplePng = "Png/issues/Issue_410.png";
// Bad bit depth. // Bad bit depth.
public const string BitDepthZero = "Png/xd0n2c08.png"; public const string BitDepthZero = "Png/xd0n2c08.png";
public const string BitDepthThree = "Png/xd3n2c08.png"; public const string BitDepthThree = "Png/xd3n2c08.png";

3
tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgb24_flecks_ImageThreshold-0_PerPixelManhattanThreshold-0.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:10ee142fc1d3638ebe53fdd21c0a4c53008801befcc5e986051b796561cae887
size 237676

3
tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_Issue_410_ImageThreshold-0_PerPixelManhattanThreshold-0.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:511cb90e72fcb837e4c9a31561a3c914f5201452d4ca63502cb5219cb4dc42be
size 619

3
tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_clocks_ImageThreshold-0_PerPixelManhattanThreshold-0.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2dc73f1b4435a26125d910f005b5df7a540168a954f44c01a8e46df201adb1b6
size 335851

3
tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_colors_ImageThreshold-0_PerPixelManhattanThreshold-0.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8cdcc80a8c662c50d2f72ad8e123d595c6e80394538c05666b8d3531d651e71a
size 11270

3
tests/Images/External/ReferenceOutput/PngDecoderTests/Decode_AppleCgBI_Rgba32_screen_ImageThreshold-0_PerPixelManhattanThreshold-0.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:00bb4c7b345389f5d95252c19d70fc2b654c4f0198e6a704b603da92b78e9a0a
size 102982

3
tests/Images/Input/Png/cgbi/clocks.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cc462d8c2697060cde9a2e975ffb828d822ee3b0d4d12e3c5f081114176c036b
size 389981

3
tests/Images/Input/Png/cgbi/colors.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4f34436f755e3c6d15341f29c992331045ae16ad413144ca798ede5c085c8e6a
size 12853

3
tests/Images/Input/Png/cgbi/flecks.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6be7b478594ba5e4d37bc135c881c0a16cf1c804fece5440bab997c7b69182f1
size 187703

3
tests/Images/Input/Png/cgbi/screen.png

@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2e9bfbac37a57b71fa27b38b21314bd49dbe1cb19a2eb9d0f272ec7be3b72a33
size 94834
Loading…
Cancel
Save